第 13 章 crifanLib.cs之Html Parse

目录

13.1. 将HTML转换为XmlDocument:htmlToXmlDoc
13.2. 将HTML转换为HtmlAgilityPack的HtmlDocument:htmlToHtmlDoc
13.3. 去除HtmlNode中的子节点:removeSubHtmlNode
13.4. 去除HTML的标签tag:htmlRemoveTag

13.1. 将HTML转换为XmlDocument:htmlToXmlDoc


    #if USE_HTML_PARSER_SGML
    //convert html to XML document
    public XmlDocument htmlToXmlDoc(string html)
    {
        // setup SgmlReader
        SgmlReader sgmlReader = new SgmlReader();
        sgmlReader.DocType = "HTML";
        sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
        sgmlReader.CaseFolding = Sgml.CaseFolding.ToLower;

        string decodedHtml = HttpUtility.HtmlDecode(html);
        sgmlReader.InputStream = new StringReader(decodedHtml);

        // create document
        XmlDocument xmlDoc = new XmlDocument();
        xmlDoc.PreserveWhitespace = true;
        xmlDoc.XmlResolver = null;
        xmlDoc.Load(sgmlReader);

        return xmlDoc;
    }
    #endif

    

例 13.1. htmlToXmlDoc 的使用范例


        //(1) with xmlns
        string withXmlnsUrl = "http://fiverr.com/gigs/search?utf8=%E2%9C%93&query=seo&x=15&y=13&page=2";
        string withXmlnsHtml = getUrlRespHtml(withXmlnsUrl);
        XmlDocument xmlDocWithNs = htmlToXmlDoc(withXmlnsHtml);

        

另外,贴出,完整的示例代码:


    //example code for html parse
    void _demoHtmlParse()
    {
        #if USE_HTML_PARSER_SGML
        //Method 1: use  htmlToXmlDoc
        //(1) with xmlns
        string withXmlnsUrl = "http://fiverr.com/gigs/search?utf8=%E2%9C%93&query=seo&x=15&y=13&page=2";
        string withXmlnsHtml = getUrlRespHtml(withXmlnsUrl);
        XmlDocument xmlDocWithNs = htmlToXmlDoc(withXmlnsHtml);
        //<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
        //<html xmlns:og="http://ogp.me/ns#" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns="http://www.w3.org/1999/xhtml" lang="en" xml:lang="en" >
        //  <head>
        //      ...
        XmlNamespaceManager m = new XmlNamespaceManager(xmlDocWithNs.NameTable);
        m.AddNamespace("w3org", "http://www.w3.org/1999/xhtml");
        XmlNode titleNode = xmlDocWithNs.SelectSingleNode("//w3org:h1[@itemprop='name']", m);
        string title = titleNode.InnerText;

        //(2) without xmlns
        string withoutXmlnsUrl = "http://www.amazon.com/gp/new-releases/appliances/ref=zg_bsnr_nav_0";
        //<!DOCTYPE html>
        //<html>
        //<head>
        //...
        string withoutXmlnsHtml = getUrlRespHtml(withoutXmlnsUrl);
        XmlDocument xmlDocNoNs = htmlToXmlDoc(withoutXmlnsHtml);
        XmlNodeList pageNodeList = xmlDocNoNs.SelectNodes("//ol[@class='zg_pagination']/li[@class]");
        #endif

        //common part
        //how to use Attributes
        //XmlNodeList pageNodeList = xmlDoc.SelectNodes("//ol[@class='zg_pagination']/li[@class]");
        //if (pageNodeList != null)
        //{
        //    for (int pageIdx = 1; pageIdx < pageNodeList.Count; pageIdx++)
        //    {
        //        XmlNode curPageNode = pageNodeList[pageIdx];
        //        //<li class="zg_page " id="zg_page2"><a page="2" ajaxUrl="http://www.amazon.com/gp/new-releases/appliances/ref=zg_bsnr_appliances_pg_2/191-0874592-3518518?ie=UTF8&pg=2&ajax=1" href="http://www.amazon.com/gp/new-releases/appliances/ref=zg_bsnr_appliances_pg_2/191-0874592-3518518?ie=UTF8&pg=2">21-40</a></li>
        //        XmlNode ajaxUrlNode = curPageNode.SelectSingleNode(".//a[@href]");
        //        string pageUrl = ajaxUrlNode.Attributes["href"].Value;
        //    }
        //}


        #if USE_HTML_PARSER_HTMLAGILITYPACK
        //Method 2: use htmlToHtmlDoc
        string testUrlWithXmlns = "http://sd.csdn.net/";
        string respHtml = getUrlRespHtml(testUrlWithXmlns);

        //<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
        //<html xmlns="http://www.w3.org/1999/xhtml">
        //<head>
        HtmlAgilityPack.HtmlDocument htmlDoc = htmlToHtmlDoc(respHtml);
        
        //<div class="tabcontent" id="sc1">
        //    <ul>
        //    <li><a href="http://www.csdn.net/article/tag/%E4%BA%A7%E5%93%81" target="_blank">产品</a></li>
        //    ......
        //    <li><a href="http://www.csdn.net/article/tag/%E8%AE%BE%E8%AE%A1" target="_blank">设计</a></li>
        //                        </ul>
        //</div>
        //...
        //<div class="tabcontent" id="sc4">
        //    <ul>
        //          ...
        //    <li><a href="http://www.csdn.net/article/tag/%E6%95%B0%E6%8D%AE%E5%BA%93"  target="_blank">数据库</a></li>
        //                        </ul>
        //</div>
        
        //here, no need to take care the html xmlns
        //is better than SGMLReader
        HtmlNode rootHtmlNode = htmlDoc.DocumentNode;
        HtmlNodeCollection htmlNodes = rootHtmlNode.SelectNodes("//div[@class='tabcontent']");
        foreach (HtmlNode link in htmlNodes)
        {
            HtmlAttribute att = link.Attributes["id"];
            string idHref = att.Value;
        }