Results 1 to 1 of 1

Thread: Parsing webpages and adding Product Microdata.

  1. #1

    Parsing webpages and adding Product Microdata.

    So this SEO thing is kinda keeping me going, and I've been steadily reading about how to direct traffic flow from one area of the internet to the area I desire it to be. After reading about how including microdata can increase click-through rates by almost a third, I was intrigued and decided to implement a parser to update the 150k+ products on my target site automatically. So I started by analyzing the source of the page and reading up on MonoDevelop, the .NET equivalent for Linux. After some time reading the source, I noticed repeating structures that could be used to my advantage in including the required elements. Tables of items are great because the overall layout is repeated for each item, like so.

    Code:
    <table>
    <tr>
    <td>
    item specific data
    </td>
    <td>
    another items data
    </td>
    ...
    </tr>
    </table>
    So this was great from a programmer's perspective, but all the tables are variable length and include tons of markup not really related to the item but to the target site. I overcame this by simple use of layout recognition and text modification. Included below is the source to my endeavor, and I would encourage all people in this field to modify and use my code.

    Code:
    /// <summary>
    /// Downloads each product listing page, cuts out its second HTML table,
    /// makes site-relative resource paths absolute, annotates every product
    /// cell with schema.org microdata and writes one HTML file per category.
    /// Pages that do not have the expected layout are skipped.
    /// </summary>
    /// <param name="sender">Event source; not used.</param>
    /// <param name="args">Event data; not used.</param>
    public virtual void AddMicroDataToTableOfProducts(object sender, EventArgs args)
    		{
    			// Base of every listing URL scraped by this tool.
    			String baseUri = "http://www.target-site.com/subcategory";
    			// Listing pages to scrape, relative to baseUri.
    			// products, productTitles and productPageIds are index-aligned.
    			String[] products = {
    				"/aprons/186.html",
    				"/boots/457.html",
    				"/chef-hats/452.html",
    				"/chef-coats-and-jackets/453.html",
    				"/chef-pants/454.html",
    				"/gloves-disposable/214.html",
    				"/hairnets/22.html",
    				"/oven-mitts-and-pot-holders/98.html",
    				"/protective-wear/456.html",
    				"/safety-wear/234.html",
    				"/shirts/450.html",
    				"/shoes-and-clogs/451.html",
    				"/waitstaff-attire/458.html",
    				"/bar-and-drink-mixes/11.html",
    				"/coffee/12.html",
    				"/dairy/14.html",
    				"/juices/13.html",
    				"/smoothies/15.html",
    				"/soda/16.html",
    				"/soy-milk/18.html",
    				"/syrups/19.html",
    				"/teas/17.html",
    				"/water/20.html",
    				"/bags/449.html",
    				"/bakery-and-deli-tissue/24.html",
    				"/baking-cups/283.html",
    				"/bathroom-tissue/25.html",
    				"/bibs/26.html",
    				"/bowls-disposable/27.html",
    				"/boxes-and-circles/28.html",
    				"/coasters/29.html",
    				"/containers-and-lids/30.html",
    				"/cups-disposable/31.html",
    				"/doilies/32.html",
    				"/facial-tissue/33.html",
    				"/filters/209.html",
    				"/foils-and-plastic-wrap/35.html",
    				"/food-trays/36.html",
    				"/freezer-paper/37.html",
    				"/gloves-disposable/214.html",
    				"/green-for-disposable-items/249.html",
    				"/guest-checks/39.html",
    				// Was "//hairnets/22.html"; the doubled slash produced a malformed path.
    				"/hairnets/22.html",
    				"/kraft-paper/40.html"
    			};
    			// Heading written above each table; also used as the output file name.
    			String[] productTitles = {
    				"Cooking & Food Preparation aprons",
    				"Restaurant Footwear",
    				"Chef Hats",
    				"Chef Coats & Jackets",
    				"Chef Pants",
    				"Disposable Gloves",
    				"Hairnets",
    				"Oven mitts & pot holders",
    				"Protective wear",
    				"Safety wear",
    				"Shirts",
    				"Shoes & Clogs",
    				"Waitstaff Attire",
    				"Bar and Drink mixes",
    				"Coffee",
    				"Dairy Products",
    				"Juices",
    				"Smoothies",
    				"Soda",
    				"Soy Milk",
    				"Syrups",
    				"Teas",
    				"Water",
    				"Bags",
    				"Bakery and Deli Tissue",
    				"Baking Cups",
    				"Bathroom Tissue",
    				"Bibs",
    				"Disposable Bowls",
    				"Packaging and Boxes",
    				"Coasters",
    				"Containers with Lids",
    				"Disposable Cups",
    				"Doilies",
    				"Facial Tissues",
    				"Filters",
    				"Foils and Plastic Wrap",
    				"Food Trays",
    				"Freezer Paper",
    				"Disposable Gloves",
    				"Disposable renewable items",
    				"Guest Checks",
    				"Hairnets & Hats",
    				"Craft Paper"
    			};
    			// Value sent as the searchsubcategory query parameter for each page.
    			// NOTE(review): the entry for "/bags/449.html" is 429, the only id that
    			// differs from the number in its path - confirm which one is correct.
    			int[] productPageIds = { 186,457,452,453,454,214,22,98,456,234,450,
    			451,458,11,12,14,13,15,16,18,19,17,20,429,24,283,25,26,27,28,29,30,31,
    			32,33,209,35,36,37,214,249,39,22,40
    			};
    			String searchString = "?searchsubcategory=";
    			String pageSize = "&pagesize=1000000";

    			using (WebClient wc = new WebClient())
    			{
    				for (int productId = 0; productId < products.Length; productId++)
    				{
    					// baseUri + page path + "?searchsubcategory=<id>" + "&pagesize=1000000"
    					Uri listingUri = new Uri(baseUri + products[productId] + searchString + productPageIds[productId] + pageSize);
    					String page = wc.DownloadString(listingUri);

    					// Keep only the second table of the page; an empty page or one
    					// without two tables is skipped instead of raising
    					// ArgumentOutOfRangeException from IndexOf.
    					String table = ExtractSecondTable(page);
    					if (table == null)
    					{
    						continue;
    					}

    					// Make image and link paths absolute so they still resolve when the
    					// markup is posted to another site.
    					table = table.Replace("/products", "http://www.target-site.com/products");
    					table = table.Replace("/_imageresize", "http://www.target-site.com/_imageresize");
    					table = table.Replace("/_resources", "http://www.target-site.com/_resources");

    					// A table without a closed opening tag or without any row has nothing
    					// to annotate; skip this page but keep processing the others.
    					int openTagEnd = table.IndexOf(">", StringComparison.Ordinal);
    					if (openTagEnd == -1 || table.IndexOf("<tr", StringComparison.Ordinal) == -1)
    					{
    						continue;
    					}
    					// Add the border attribute to the opening <table ...> tag.
    					table = table.Insert(openTagEnd, " border=\"2\"");

    					// The category title is emitted as plain text in front of the table.
    					String output = productTitles[productId] + AnnotateProducts(table);

    					// using guarantees the file is flushed and closed even if Write throws.
    					using (StreamWriter writeHtml = new StreamWriter("/home/****/Desktop/a4r/"+productTitles[productId]+".html"))
    					{
    						writeHtml.Write(output);
    					}
    				}
    			}
    		}

    		/// <summary>
    		/// Returns the markup of the second table in <paramref name="html"/>: the
    		/// first "&lt;table" found after the first "&lt;/table", up to and including
    		/// its closing tag.
    		/// </summary>
    		/// <param name="html">Full page markup.</param>
    		/// <returns>The table markup, or null when the page has no such table.</returns>
    		private static String ExtractSecondTable(String html)
    		{
    			int firstOpen = html.IndexOf("<table", StringComparison.Ordinal);
    			if (firstOpen == -1)
    			{
    				return null;
    			}
    			int firstClose = html.IndexOf("</table", firstOpen, StringComparison.Ordinal);
    			if (firstClose == -1)
    			{
    				return null;
    			}
    			int secondOpen = html.IndexOf("<table", firstClose, StringComparison.Ordinal);
    			if (secondOpen == -1)
    			{
    				return null;
    			}
    			int secondClose = html.IndexOf("</table", secondOpen, StringComparison.Ordinal);
    			if (secondClose == -1)
    			{
    				return null;
    			}
    			// Include the closing tag, but never read past the end of the page when
    			// the markup is truncated right after "</table".
    			int length = Math.Min(secondClose - secondOpen + "</table>".Length, html.Length - secondOpen);
    			return html.Substring(secondOpen, length);
    		}

    		/// <summary>
    		/// Walks the table markup cell by cell and inserts schema.org Product
    		/// microdata (url, image, name, manufacturer) at fixed landmarks of each
    		/// product cell. Scanning stops at the first landmark that cannot be
    		/// found; markup inserted up to that point is kept.
    		/// </summary>
    		/// <param name="html">Table markup to annotate.</param>
    		/// <returns>The annotated markup.</returns>
    		private static String AnnotateProducts(String html)
    		{
    			const String bidPriceOpen = "<div class=\"ProductBidPrice\">";

    			int pos = 0;
    			// Each pass handles one product and ends on the cell's closing </td>;
    			// a missing </td> (-1) ends the scan instead of being fed back into
    			// IndexOf, which would throw.
    			while (pos != -1)
    			{
    				// Wrap the product in its own div so the existing markup is only
    				// added to, never rewritten.
    				pos = html.IndexOf("<div", pos, StringComparison.Ordinal);
    				if (pos == -1)
    				{
    					break;
    				}
    				html = html.Insert(pos, "<div itemscope itemtype=\"http://schema.org/Product\">");

    				// First link of the cell is the product URL. The leading space keeps
    				// the attribute separate from the tag name ("<a itemprop", not "<aitemprop").
    				pos = html.IndexOf("<a", pos, StringComparison.Ordinal);
    				if (pos == -1)
    				{
    					break;
    				}
    				html = html.Insert(pos + 2, " itemprop=\"url\"");

    				// First image of the cell is the product image.
    				pos = html.IndexOf("<img", pos, StringComparison.Ordinal);
    				if (pos == -1)
    				{
    					break;
    				}
    				html = html.Insert(pos + 4, " itemprop=\"image\"");

    				// Skip past the image block and the next two links to reach the
    				// plain <div> that holds the product name.
    				pos = html.IndexOf("</div>", pos, StringComparison.Ordinal);
    				if (pos == -1)
    				{
    					break;
    				}
    				pos = html.IndexOf("<a href=", pos, StringComparison.Ordinal);
    				if (pos == -1)
    				{
    					break;
    				}
    				pos = html.IndexOf("<a href=", pos + 8, StringComparison.Ordinal);
    				if (pos == -1)
    				{
    					break;
    				}
    				// Search forward from the current position, not from the start of
    				// the document, so the name div belongs to the current product.
    				pos = html.IndexOf("<div>", pos, StringComparison.Ordinal);
    				if (pos == -1)
    				{
    					break;
    				}
    				html = html.Insert(pos + 4, " itemprop=\"name\"");

    				// The styled div that follows is marked as the manufacturer.
    				pos = html.IndexOf("<div style=", pos, StringComparison.Ordinal);
    				if (pos == -1)
    				{
    					break;
    				}
    				html = html.Insert(pos + 4, " itemprop=\"manufacturer\" itemscope itemtype=\"http://schema.org/Organization\" ");

    				// Wrap the manufacturer div's content in a span carrying the name.
    				pos = html.IndexOf(">", pos, StringComparison.Ordinal);
    				if (pos == -1)
    				{
    					break;
    				}
    				html = html.Insert(pos + 1, "<span itemprop=\"name\">");
    				pos = html.IndexOf("</div>", pos, StringComparison.Ordinal);
    				if (pos == -1)
    				{
    					break;
    				}
    				html = html.Insert(pos, "</span>");

    				// Balance the wrapper div opened at the top of this pass.
    				pos = html.IndexOf(bidPriceOpen, pos, StringComparison.Ordinal);
    				if (pos == -1)
    				{
    					break;
    				}
    				html = html.Insert(pos + bidPriceOpen.Length, "</div>");

    				pos = html.IndexOf("</td>", pos, StringComparison.Ordinal);
    			}
    			return html;
    		}
    	}
    }
    regards
    Last edited by OfMonsterAndMen; January 28th, 2013 at 17:43.

Similar Threads

  1. Replies: 3
    Last Post: April 19th, 2012, 05:37
  2. Product recomendation - which native exe packer?
    By _xhp_ in forum Advanced Reversing and Programming
    Replies: 1
    Last Post: October 26th, 2011, 19:57
  3. Product Activation
    By NoLoader in forum The Newbie Forum
    Replies: 22
    Last Post: September 22nd, 2007, 01:34
  4. Microsoft MSO.DLL (Product Activation)
    By Nad_Af in forum Malware Analysis and Unpacking Forum
    Replies: 17
    Last Post: June 3rd, 2004, 09:58
  5. Inside Windows Product Activation (WPA)
    By Kayaker in forum RCE Cryptographics
    Replies: 6
    Last Post: July 16th, 2002, 07:45

Bookmarks

Posting Permissions

  • You may not post new threads
  • You may not post replies
  • You may not post attachments
  • You may not edit your posts
  •