Our main source of books is Ingram (one of the largest book wholesalers in the USA). Since they have access to several hundred thousand books and offer drop-shipping capabilities, it makes sense for the Paradoxal Press Online Store to take advantage of this source for our online business. Although they do offer title databases (complete with cover images, ...), it is a subscription service which is quite expensive and out of reach for us in the short term. We also keep some of the most popular books in stock, but since our web store is a small business, it makes more sense to drop-ship most orders.

There are plenty of free sources of book information out there, but our main concern is determining which books are in stock and what our discount is on each title, so that the information on our site is up to date. Ingram does offer a web interface which allows me to place orders and get such information. But of course, doing this by hand for 200,000 titles is simply out of the question.

The answer? Page scraping. By automatically browsing the page for each title and extracting the relevant information through the use of regular expressions, I can easily get all the information I need.

Below is an example of how I take Ingram's stock page and extract the relevant bits of information for my use.

   // Patterns that pull stock, discount and price data out of the HTML of Ingram's
   // iPage stock page. All of them ignore case and let '.' match across line breaks.

   // Captures the four table cells that follow the "On Hand" label (groups oh1..oh4).
   private readonly Regex onHand = new Regex("On Hand\\</SPAN\\>.*?\\<TD.*?\\>(?<oh1>.*?)\\</TD\\>.*?\\<TD.*?\\>(?<oh2>.*?)\\</TD\\>.*?\\<TD.*?\\>(?<oh3>.*?)\\</TD\\>.*?\\<TD.*?\\>(?<oh4>.*?)\\</TD\\>", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.CultureInvariant);

   // Captures the four table cells that follow the "On Order" label (groups are also named oh1..oh4).
   private readonly Regex onOrder = new Regex("On Order\\</SPAN\\>.*?\\<TD.*?\\>(?<oh1>.*?)\\</TD\\>.*?\\<TD.*?\\>(?<oh2>.*?)\\</TD\\>.*?\\<TD.*?\\>(?<oh3>.*?)\\</TD\\>.*?\\<TD.*?\\>(?<oh4>.*?)\\</TD\\>", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.CultureInvariant);

   // Captures the discount shown as "(Discount: X)", where X is "REG" or text ending in '%'.
   private readonly Regex discount = new Regex("\\(Discount\\: (?<disc>(REG)|(.*?\\%))\\)", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.CultureInvariant);

   // Captures the decimal number that follows "US SRP:" (group srp).
   private readonly Regex srp = new Regex("US SRP\\:.*?(?<srp>\\d+?\\.\\d+?)[^\\d]", RegexOptions.IgnoreCase | RegexOptions.Singleline | RegexOptions.CultureInvariant);

   /// <summary>
   /// Runs when webBrowser1 finishes loading a document. If the document came from
   /// ipage.ingrambook.com, scrapes availability, list price and discount into
   /// curAmazonItem and serializes the item to curDataFile. For any other URL, or
   /// when scraping or saving fails, the query is put back on workList.
   /// Does nothing while curAmazonItem is null.
   /// </summary>
   /// <param name="sender">Event source (unused).</param>
   /// <param name="e">Carries the URL of the document that finished loading.</param>
   private void webBrowser1_DocumentCompleted(object sender, WebBrowserDocumentCompletedEventArgs e)
   {
       if (curAmazonItem == null)
       {
           return;
       }

       if (!e.Url.AbsoluteUri.Contains("ipage.ingrambook.com"))
       {
           RequeueIngramQuery(null);
           return;
       }

       try
       {
           string ipageData = webBrowser1.DocumentText;

           // Default that stays in place when either quantity table cannot be found.
           curAmazonItem.IPageStock = "Not in stock!";

           int onHandTotal;
           int onOrderTotal;
           // && short-circuits: the "On Order" table is only examined once "On Hand" was found.
           if (TrySumQuantities(onHand, ipageData, out onHandTotal)
               && TrySumQuantities(onOrder, ipageData, out onOrderTotal))
           {
               if (onHandTotal > 5)
               {
                   curAmazonItem.IPageStock = "Usually ships in 24-48 hours (" + onHandTotal.ToString() + " in stock)";
               }
               else if (onOrderTotal > 10)
               {
                   curAmazonItem.IPageStock = "Usually ships in 1-2 weeks";
               }
               else
               {
                   curAmazonItem.IPageStock = "Usually ships in 4-6 weeks";
               }
           }

           Match srpMatch = srp.Match(ipageData);
           if (srpMatch.Success)
           {
               curAmazonItem.ListPrice = "$" + srpMatch.Groups["srp"].Value;
           }

           // Without an SRP on the page the item keeps whatever list price it already had;
           // if it has none, fail with a clear reason instead of a NullReferenceException.
           if (string.IsNullOrEmpty(curAmazonItem.ListPrice))
           {
               throw new System.FormatException("No list price found on the page or on the item.");
           }

           // Prices are always written with '.' as the decimal separator, so parse with the
           // invariant culture; the machine's regional settings must not change the result.
           float price = float.Parse(
               curAmazonItem.ListPrice.Replace("$", ""),
               System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
               System.Globalization.CultureInfo.InvariantCulture);
           float discountPct = ReadDiscountMultiplier(ipageData);
           curAmazonItem.IPagePrice = (float)System.Math.Round(price * discountPct, 2);

           // NOTE(review): BinaryFormatter is obsolete in current .NET and unsafe for reading
           // untrusted data. It is kept because whatever loads these data files presumably
           // expects this format - confirm before switching serializers.
           using (Stream file = File.Open(curDataFile, FileMode.Create))
           {
               IFormatter formatter = (IFormatter)new BinaryFormatter();
               formatter.Serialize(file, curAmazonItem);
           }

           AddStatus("Ingram Done = " + curAmazonItem.IPageStock + " - " + curAmazonItem.IPagePrice.ToString());
       }
       catch (System.Exception ex)
       {
           // Best effort: any scraping or saving failure sends the item back for another attempt.
           RequeueIngramQuery(ex.Message);
       }
   }

   /// <summary>
   /// Adds up the four quantity cells (groups oh1..oh4) captured by <paramref name="pattern"/>.
   /// </summary>
   /// <param name="pattern">Either the onHand or the onOrder expression.</param>
   /// <param name="html">Page source to search.</param>
   /// <param name="total">Sum of the four cells; 0 when the pattern does not match.</param>
   /// <returns>true when the pattern matched; false otherwise.</returns>
   /// <exception cref="System.FormatException">A captured cell is not a whole number.</exception>
   private static bool TrySumQuantities(Regex pattern, string html, out int total)
   {
       total = 0;

       Match match = pattern.Match(html);
       if (!match.Success)
       {
           return false;
       }

       string[] groupNames = new string[] { "oh1", "oh2", "oh3", "oh4" };
       foreach (string groupName in groupNames)
       {
           // Invariant culture plus thousands separators, so "1,234" reads as 1234 on any machine.
           total += int.Parse(
               match.Groups[groupName].Value,
               System.Globalization.NumberStyles.Integer | System.Globalization.NumberStyles.AllowThousands,
               System.Globalization.CultureInfo.InvariantCulture);
       }

       return true;
   }

   /// <summary>
   /// Reads the discount from the page and converts it to the factor the list price is
   /// multiplied by: "REG" gives 0.65, "N%" gives (100 - N) / 100.
   /// </summary>
   /// <param name="html">Page source to search.</param>
   /// <returns>
   /// The price factor, or 1.0 (no discount) when no discount is found or its text is
   /// not a percentage between 0 and 100.
   /// </returns>
   private float ReadDiscountMultiplier(string html)
   {
       Match discountMatch = discount.Match(html);
       if (!discountMatch.Success)
       {
           return 1.0f;
       }

       string text = discountMatch.Groups["disc"].Value;

       // The expression ignores case, so the comparison has to as well.
       if (string.Equals(text, "REG", System.StringComparison.OrdinalIgnoreCase))
       {
           return 0.65f;
       }

       float percent;
       if (float.TryParse(
               text.TrimEnd('%'),
               System.Globalization.NumberStyles.Float,
               System.Globalization.CultureInfo.InvariantCulture,
               out percent)
           && percent >= 0f
           && percent <= 100f)
       {
           return (100f - percent) / 100f;
       }

       return 1.0f;
   }

   /// <summary>
   /// Puts the Ingram query for curDataFile back on workList and reports the failure.
   /// </summary>
   /// <param name="reason">Optional failure detail appended to the status line; may be null.</param>
   private void RequeueIngramQuery(string reason)
   {
       WorkItem workItem = new WorkItem();
       workItem.Type = WorkItem.WorkType.QueryIngram;
       workItem.Data = curDataFile;
       workList.Enqueue(workItem);

       if (string.IsNullOrEmpty(reason))
       {
           AddStatus(" Failed = " + curDataFile);
       }
       else
       {
           AddStatus(" Failed = " + curDataFile + " (" + reason + ")");
       }
   }

Now, keep in mind this is taken straight from my software, and since it's for internal use, it is not the cleanest code... You may also notice that the code resides within a webBrowser1_DocumentCompleted event. The main reason behind this is that since the Ingram webpage is a secure site which requires a log-in and a session, I create a visual WebBrowser control to navigate the webpages. Does anyone know if there is a better solution to this?

The core of this code is a set of regular expressions which take the content of the page and extract the relevant information, such as the stock quantities and wholesale discount rates. This information is then percolated into a class used to contain all the relevant information about a product.