#if USE_HTML_PARSER_SGML
    /// <summary>
    /// Converts an HTML string into an <see cref="XmlDocument"/> by reading it through an SgmlReader
    /// configured with DocType "HTML".
    /// </summary>
    /// <param name="html">HTML source text to convert; must not be null.</param>
    /// <returns>
    /// An <see cref="XmlDocument"/> loaded from the SgmlReader, with PreserveWhitespace enabled
    /// and XmlResolver cleared.
    /// </returns>
    /// <exception cref="ArgumentNullException">Thrown when <paramref name="html"/> is null.</exception>
    public XmlDocument htmlToXmlDoc(string html)
    {
        // Fail fast with the caller's parameter name instead of surfacing the
        // ArgumentNullException from inside StringReader's constructor.
        if (html == null)
        {
            throw new ArgumentNullException("html");
        }

        // NOTE(review): entities are decoded BEFORE parsing, so escaped text such as "&lt;b&gt;"
        // reaches the parser as real markup. Kept as-is to preserve existing results — confirm
        // whether this is an intentional workaround.
        string decodedHtml = HttpUtility.HtmlDecode(html);

        // Both readers are IDisposable; release them deterministically once the document is loaded.
        using (StringReader htmlReader = new StringReader(decodedHtml))
        using (SgmlReader sgmlReader = new SgmlReader())
        {
            // setup SgmlReader
            sgmlReader.DocType = "HTML";
            sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
            sgmlReader.CaseFolding = Sgml.CaseFolding.ToLower;
            sgmlReader.InputStream = htmlReader;

            // create document
            XmlDocument xmlDoc = new XmlDocument();
            xmlDoc.PreserveWhitespace = true;
            // No resolver: the document will not fetch external resources (e.g. a referenced DTD).
            xmlDoc.XmlResolver = null;
            xmlDoc.Load(sgmlReader);

            return xmlDoc;
        }
    }
#endif
例 13.1. htmlToXmlDoc 的使用范例
// (1) Example for a page "with xmlns".
string withXmlnsUrl = "http://fiverr.com/gigs/search?utf8=%E2%9C%93&query=seo&x=15&y=13&page=2";

// getUrlRespHtml is defined elsewhere; presumably it returns the response HTML for the URL.
string withXmlnsHtml = getUrlRespHtml(withXmlnsUrl);

// Convert the HTML text into an XmlDocument.
XmlDocument xmlDocWithNs = htmlToXmlDoc(withXmlnsHtml);
另外，下面贴出完整的示例代码：
//example code for html parse void _demoHtmlParse() { #if USE_HTML_PARSER_SGML //Method 1: use htmlToXmlDoc //(1) with xmlns string withXmlnsUrl = "http://fiverr.com/gigs/search?utf8=%E2%9C%93&query=seo&x=15&y=13&page=2"; string withXmlnsHtml = getUrlRespHtml(withXmlnsUrl); XmlDocument xmlDocWithNs = htmlToXmlDoc(withXmlnsHtml); //<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> //<html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en" > // <head> // ... XmlNamespaceManager m = new XmlNamespaceManager(xmlDocWithNs.NameTable); m.AddNamespace("w3org", "http://www.w3.org/1999/xhtml"); XmlNode titleNode = xmlDocWithNs.SelectSingleNode("//w3org:h1[@itemprop='name']", m); string title = titleNode.InnerText; //(2) without xmlns string withoutXmlnsUrl = "http://www.amazon.com/gp/new-releases/appliances/ref=zg_bsnr_nav_0"; //<!DOCTYPE html> //<html> //<head> //... 
string withoutXmlnsHtml = getUrlRespHtml(withoutXmlnsUrl); XmlDocument xmlDocNoNs = htmlToXmlDoc(withoutXmlnsHtml); XmlNodeList pageNodeList = xmlDocNoNs.SelectNodes("//ol[@class='zg_pagination']/li[@class]"); #endif //common part //how to use Attributes //XmlNodeList pageNodeList = xmlDoc.SelectNodes("//ol[@class='zg_pagination']/li[@class]"); //if (pageNodeList != null) //{ // for (int pageIdx = 1; pageIdx < pageNodeList.Count; pageIdx++) // { // XmlNode curPageNode = pageNodeList[pageIdx]; // //<li class="zg_page " id="zg_page2"><a page="2" ajaxUrl="http://www.amazon.com/gp/new-releases/appliances/ref=zg_bsnr_appliances_pg_2/191-0874592-3518518?ie=UTF8&pg=2&ajax=1" href="http://www.amazon.com/gp/new-releases/appliances/ref=zg_bsnr_appliances_pg_2/191-0874592-3518518?ie=UTF8&pg=2">21-40</a></li> // XmlNode ajaxUrlNode = curPageNode.SelectSingleNode(".//a[@href]"); // string pageUrl = ajaxUrlNode.Attributes["href"].Value; // } //} #if USE_HTML_PARSER_HTMLAGILITYPACK //Method 2: use htmlToHtmlDoc string testUrlWithXmlns = "http://sd.csdn.net/"; string respHtml = getUrlRespHtml(testUrlWithXmlns); //<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> //<html xmlns="http://www.w3.org/1999/xhtml"> //<head> HtmlAgilityPack.HtmlDocument htmlDoc = htmlToHtmlDoc(respHtml); //<div class="tabcontent" id="sc1"> // <ul> // <li><a href="http://www.csdn.net/article/tag/%E4%BA%A7%E5%93%81" target="_blank">产品</a></li> // ...... // <li><a href="http://www.csdn.net/article/tag/%E8%AE%BE%E8%AE%A1" target="_blank">设计</a></li> // </ul> //</div> //... //<div class="tabcontent" id="sc4"> // <ul> // ... 
// <li><a href="http://www.csdn.net/article/tag/%E6%95%B0%E6%8D%AE%E5%BA%93" target="_blank">数据库</a></li> // </ul> //</div> //here, no need to take care the html xmlns //is better than SGMLReader HtmlNode rootHtmlNode = htmlDoc.DocumentNode; HtmlNodeCollection htmlNodes = rootHtmlNode.SelectNodes("//div[@class='tabcontent']"); foreach (HtmlNode link in htmlNodes) { HtmlAttribute att = link.Attributes["id"]; string idHref = att.Value; }