/*
 * [Function]
 * remove html tag, retain html content
 * [Input]
 * html, with tag
 *
 * [Output]
 * pure content, no html tag;
 * empty string when the input is null or empty
 *
 * [Note]
 * 1. html comment nodes are removed before the text is collected,
 *    so comment text never appears in the result
 * 2. the text is taken from each node's InnerText as-is;
 *    this method performs no extra decoding or trimming of its own
 */
public string htmlRemoveTag(string html)
{
    // Nothing to strip: null and "" both map to "".
    if (string.IsNullOrEmpty(html))
    {
        return "";
    }

    HtmlAgilityPack.HtmlDocument htmlDoc = new HtmlAgilityPack.HtmlDocument();
    htmlDoc.LoadHtml(html);

    // 1. remove all comments
    // (1) get all comment nodes using XPath; the result is null-checked
    //     because SelectNodes may return null when nothing matches
    HtmlNodeCollection commentNodeList = htmlDoc.DocumentNode.SelectNodes("//comment()");
    if (commentNodeList != null)
    {
        foreach (HtmlNode comment in commentNodeList)
        {
            // (2) detach the comment node from its parent
            comment.ParentNode.RemoveChild(comment);
        }
    }

    // 2. get all content
    // Accumulate in a StringBuilder instead of repeated string concatenation,
    // which would re-copy the growing result once per top-level node.
    System.Text.StringBuilder filteredHtml = new System.Text.StringBuilder();
    foreach (HtmlNode node in htmlDoc.DocumentNode.ChildNodes)
    {
        filteredHtml.Append(node.InnerText);
    }

    return filteredHtml.ToString();
}
例 13.4. htmlRemoveTag 的使用范例
HtmlAgilityPack.HtmlDocument htmlDoc = crl.htmlToHtmlDoc(googleSearchRespHtml); HtmlNodeCollection liNodeList = htmlDoc.DocumentNode.SelectNodes("//li[@class='g']"); foreach (HtmlNode liNode in liNodeList) { HtmlNode h3ANode = liNode.SelectSingleNode(".//h3[@class='r']/a"); if (h3ANode != null) { googleSearchResultItem singleResultItem = new googleSearchResultItem(); //string titleHtml = h3ANode.InnerHtml; //"Amritanandamayi Math to <em>sponsor charity</em> events - Times Of India" string titleHtml = h3ANode.InnerText; //"Amritanandamayi Math to sponsor charity events - Times Of India" string filteredTitle = crl.htmlRemoveTag(titleHtml);