Web scraping with HTML Agility Pack

0 comments

Given a well-formatted bit of HTML, HtmlAgilityPack lets you change the DOM on the server side.


Note that the leading ".//" is important in:

fairRows = resultRow.SelectNodes(".//div[contains(@class, 'fare-solution-wrp')]");


With the leading ".", the query only finds nodes inside resultRow; a plain "//div" would search the entire DOM tree instead.


Also use starts-with(@class, '...') and contains(@class, '...') to match elements by (partial) class name.


			// Remove the div with id 'passsearch' from the document.
			// The second argument (keepGrandChildren: false) removes its children along with it.
			var div = html.DocumentNode.SelectSingleNode("//div[@id='passsearch']");
			if (div != null) {
				div.ParentNode.RemoveChild(div, false);
			}

			// For every result row (tr whose class starts with 'solutionRow') whose first/last
			// city labels mention both New York and Washington, replace the fare cells with an
			// "N/A" marker. SelectNodes returns null (not an empty collection) when nothing
			// matches, so every result is null-checked before use.
			var divs = html.DocumentNode.SelectNodes("//tr[starts-with(@class, 'solutionRow')]");
			if (divs != null) {
				foreach (var resultRow in divs) {
					// find td.timeCity ... span.CityRow ... span.labelCityRow
					// The leading ".//" keeps the search inside this row instead of the whole document.
					var cityTDs = resultRow.SelectNodes(".//td[contains(@class, 'timeCity')]");
					if (cityTDs == null) {
						continue;
					}

					// Collect the markup of every descendant that mentions 'labelCityRow'.
					var citiesFound = new StringList();
					foreach (var cityTD in cityTDs) {
						foreach (var item in cityTD.ChildNodes.Descendants()) {
							if (item.OuterHtml.Contains("labelCityRow")) {
								citiesFound.Add(item.OuterHtml);
							}
						}
					}
					//Web.Response.Write("<hr>" + citiesFound.ToString(", "));

					// First()/Last() throw on an empty sequence; a row without city labels
					// cannot match the route, so skip it.
					if (!citiesFound.Any()) {
						continue;
					}

					var firstCity = citiesFound.First();
					var lastCity = citiesFound.Last();
					bool touchesNewYork = firstCity.ContainsInsensitive("new york") || lastCity.ContainsInsensitive("new york");
					bool touchesWashington = firstCity.ContainsInsensitive("washington") || lastCity.ContainsInsensitive("washington");
					if (!(touchesNewYork && touchesWashington)) {
						continue;
					}

					// remove fares radios from div.fare-solution-wrp
					var fairRows = resultRow.SelectNodes(".//div[contains(@class, 'fare-solution-wrp')]");
					if (fairRows == null) {
						// Matching route but no fare wrappers in this row: nothing to blank out.
						continue;
					}
					foreach (var htmlNode in fairRows) {
						htmlNode.InnerHtml = "N/A <span class=\"special-exclusion\" title=\"Not available for sale\">*</span>";
					}
				}
			}
			}

Comments


Leave a Comment