【背景】
之前写的,用于抓取 fiverr.com 中帖子
(比如:
http://fiverr.com/bizgrowthcoach/provide-a-startup-checklist-and-project-plan
)的评论。
注:
此代码是之前该网站改版之前写的;
且是没有完成的;
只是贴出来,供参考而已->其中有些关于SgmlReader等函数的使用,可供参考;
【ScrapeFiverrComments代码分享】
1.截图:
2.项目代码下载:
ScrapeFiverrComments_2013-02-28_uncompleted.7z
3.代码分享:
(1)frmScrapeFiverrComments.cs
/*
 * [File]
 * frmScrapeFiverrComments.cs
 *
 * [Function]
 * fiverr.com comments scraper
 *
 * [Note]
 * Unfinished: loginFiverrCom is a stub and paging was written against the
 * pre-redesign site layout.
 *
 * [Update]
 * 2013-02-28
 *
 * [Author]
 * Crifan Li
 *
 * [Contact]
 * https://www.crifan.com/contact_me/
 *
 */
using System;
using System.Collections.Generic;
using System.ComponentModel;
using System.Data;
using System.Drawing;
using System.Text;
using System.Windows.Forms;
using System.Web;
using System.Xml;
using Sgml;
using System.IO;
using Excel = Microsoft.Office.Interop.Excel;
using Microsoft.Office.Interop.Excel;

/*
 * icons:
 *
 * search/find
 * http://www.easyicon.cn/icondetail/106/
 *
 * stop
 * http://www.easyicon.cn/icondetail/568811/
 *
 * crawler
 * http://www.easyicon.cn/icondetail/13685/
 *
 * login
 * http://www.easyicon.cn/icondetail/500811/
 *
 * send mail
 * http://www.easyicon.cn/icondetail/538560/
 */
namespace ScrapeFiverrComments
{
    /// <summary>
    /// Main form: scrapes the comment authors of a fiverr.com gig page into a grid
    /// and exports the grid to an Excel workbook.
    /// </summary>
    public partial class frmScrapeFiverrComments : Form
    {
        public crifanLib crifanLib;

        // Number of rating blocks treated as a "full" page; a page with fewer
        // blocks is taken to be the last one. Also used as the paging offset step.
        private const int constPageGigNumber = 40;

        public frmScrapeFiverrComments()
        {
            // Hook assembly resolution before anything else so that assemblies
            // stored in Properties.Resources can be loaded on demand.
            AppDomain.CurrentDomain.AssemblyResolve += new ResolveEventHandler(CurrentDomain_AssemblyResolve);

            InitializeComponent();

            crifanLib = new crifanLib();
        }

        /// <summary>
        /// Resolves an assembly from the byte[] resources of this project.
        /// The resource key is the simple assembly name with '.' replaced by '_'.
        /// </summary>
        /// <returns>The loaded assembly, or null when no matching resource exists.</returns>
        System.Reflection.Assembly CurrentDomain_AssemblyResolve(object sender, ResolveEventArgs args)
        {
            string dllName = args.Name.Contains(",")
                ? args.Name.Substring(0, args.Name.IndexOf(','))
                : args.Name.Replace(".dll", "");
            dllName = dllName.Replace(".", "_");

            if (dllName.EndsWith("_resources"))
            {
                return null;
            }

            System.Resources.ResourceManager rm = new System.Resources.ResourceManager(
                GetType().Namespace + ".Properties.Resources",
                System.Reflection.Assembly.GetExecutingAssembly());

            // GetObject returns null for an unknown key; Assembly.Load(null) would
            // throw, so report "not resolved" instead.
            byte[] bytes = rm.GetObject(dllName) as byte[];
            if (bytes == null)
            {
                return null;
            }

            return System.Reflection.Assembly.Load(bytes);
        }

        /// <summary>
        /// Configures the author grid with two columns: "Username" and "Profile Url".
        /// </summary>
        private void initDataGridView()
        {
            dgvCmtAuthorList.ColumnCount = 2;

            dgvCmtAuthorList.RowHeadersWidth = 80;
            dgvCmtAuthorList.RowHeadersDefaultCellStyle.Alignment = DataGridViewContentAlignment.MiddleCenter;
            dgvCmtAuthorList.RowHeadersWidthSizeMode = DataGridViewRowHeadersWidthSizeMode.DisableResizing;
            dgvCmtAuthorList.AutoSizeColumnsMode = DataGridViewAutoSizeColumnsMode.Fill;

            //(1)username
            dgvCmtAuthorList.Columns[0].HeaderText = "Username";
            dgvCmtAuthorList.Columns[0].Width = 160;

            //(2)profile url: takes the remaining width of the group box
            dgvCmtAuthorList.Columns[1].HeaderText = "Profile Url";
            dgvCmtAuthorList.Columns[1].Width =
                grbCmtAuthorList.Width
                - dgvCmtAuthorList.RowHeadersWidth
                - dgvCmtAuthorList.Columns[0].Width
                - 20;
        }

        private void frmScrapeFiverrComments_Load(object sender, EventArgs e)
        {
            initDataGridView();

            // Login is not implemented yet (see loginFiverrCom), so keep it disabled.
            grbLogin.Enabled = false;
            //txbMessageToSend.Enabled = false;
            //btnSendMessage.Enabled = false;
        }

        /// <summary>
        /// Parses HTML into an XmlDocument using SgmlReader (element names are
        /// folded to lower case, whitespace is preserved).
        /// </summary>
        /// <param name="html">HTML text; null is treated as an empty string.</param>
        /// <returns>The loaded document.</returns>
        XmlDocument htmlToXmlDoc(string html)
        {
            using (StringReader htmlReader = new StringReader(html ?? ""))
            {
                // setup SgmlReader
                Sgml.SgmlReader sgmlReader = new Sgml.SgmlReader();
                sgmlReader.DocType = "HTML";
                sgmlReader.WhitespaceHandling = WhitespaceHandling.All;
                sgmlReader.CaseFolding = Sgml.CaseFolding.ToLower;
                sgmlReader.InputStream = htmlReader;

                // create document
                XmlDocument doc = new XmlDocument();
                doc.PreserveWhitespace = true;
                doc.XmlResolver = null;
                doc.Load(sgmlReader);

                return doc;
            }
        }

        //gig comment author info
        public struct gigCmtAuthorInfo
        {
            public string username;
            public string profileUrl;
        };

        /// <summary>
        /// Scrapes the comment authors of the gig whose URL is in txbGigUrl, page by
        /// page, until a page holds fewer than constPageGigNumber rating blocks.
        /// </summary>
        private void btnScrape_Click(object sender, EventArgs e)
        {
            // DoEvents() below pumps messages, so a second click could start a
            // nested scrape; disable the triggering control while running.
            System.Windows.Forms.Control trigger = sender as System.Windows.Forms.Control;
            if (trigger != null)
            {
                trigger.Enabled = false;
            }

            try
            {
                scrapeAllPages(txbGigUrl.Text);
            }
            finally
            {
                if (trigger != null)
                {
                    trigger.Enabled = true;
                }
            }
        }

        // Fetches and processes gig pages starting at firstPageUrl until the last page.
        private void scrapeAllPages(string firstPageUrl)
        {
            //http://fiverr.com/bizgrowthcoach/provide-a-startup-checklist-and-project-plan
            string curGigUrl = firstPageUrl;
            int curPageNumber = 0;
            bool needGetMorePage = true;

            while (needGetMorePage)
            {
                // NOTE(review): the original draft intended to pass extra request
                // headers (an undefined headerDict) for follow-up pages and fetched
                // nothing at all for them; confirm whether the site needs them.
                string gigUrlRespHtml = crifanLib.getUrlRespHtml(curGigUrl);
                if (String.IsNullOrEmpty(gigUrlRespHtml))
                {
                    break;
                }

                XmlDocument xmlDoc = htmlToXmlDoc(gigUrlRespHtml);
                XmlNamespaceManager m = new XmlNamespaceManager(xmlDoc.NameTable);
                m.AddNamespace("w3org", "http://www.w3.org/1999/xhtml");

                //<li class="rating-block ">
                //    <div class="userimage">
                //        <img src="..." width="24px" height="24px" class="true" alt="azza1200" />
                //    </div>
                //    <div class= " rating-text">
                //        <div>
                //            <div class="rater-username">
                //                <a href="/azza1200" rel="nofollow">azza1200</a>
                //                <span class="time-ago titled" title="1361846351"></span>
                //            </div>
                //            <div class="comment-block">
                //                <div class="rating-icon">...</div>
                //                <div class="rating-comment">...</div>
                //            </div>
                //        </div>
                //    </div>
                //    <div class="clear"></div>
                //</li>
                XmlNodeList ratingBlockList = xmlDoc.SelectNodes("//w3org:li[@class='rating-block ']", m);
                if (ratingBlockList == null)
                {
                    break;
                }

                foreach (XmlNode ratingBlockNode in ratingBlockList)
                {
                    gigCmtAuthorInfo cmtAuthorInfo;
                    if (tryExtractCommentAuthor(ratingBlockNode, m, out cmtAuthorInfo))
                    {
                        storeCommentAuthorInfo(cmtAuthorInfo);

                        //update UI
                        System.Windows.Forms.Application.DoEvents();
                    }
                }

                // A short page is the last page.
                if (ratingBlockList.Count < constPageGigNumber)
                {
                    break;
                }

                //update for next page
                curPageNumber++;
                int offsetNumber = constPageGigNumber * curPageNumber;

                // Alternative (unused) AJAX endpoint keyed by the numeric gig id:
                //http://fiverr.com/gigs/748824/load_ratings?offset=40&show_work_sample=false
                string titlePart = "";
                if (crifanLib.extractSingleStr(@"http://fiverr\.com/\w+/([\w-]+)", curGigUrl, out titlePart))
                {
                    //http://fiverr.com/gigs/provide-a-startup-checklist-and-project-plan?offset=40
                    curGigUrl = "http://fiverr.com/gigs/" + titlePart + "?offset=" + offsetNumber.ToString();
                }
                else
                {
                    // Cannot build the next page URL; stop instead of refetching
                    // the same page forever.
                    needGetMorePage = false;
                }
            }
        }

        // Reads username and profile URL from one rating block; false when the block
        // has no usable author link.
        private bool tryExtractCommentAuthor(XmlNode ratingBlockNode, XmlNamespaceManager m, out gigCmtAuthorInfo cmtAuthorInfo)
        {
            cmtAuthorInfo = new gigCmtAuthorInfo();

            //<div class="rater-username">
            //    <a href="/azza1200" rel="nofollow">azza1200</a>
            //    <span class="time-ago titled" title="1361846351"></span>
            //</div>
            XmlNode rateUsernameNode = ratingBlockNode.SelectSingleNode(".//w3org:div[@class='rater-username']", m);
            if (rateUsernameNode == null)
            {
                return false;
            }

            // "@rel|href" was a node-set union (attribute rel OR child element
            // href); the link must carry both attributes.
            XmlNode aNode = rateUsernameNode.SelectSingleNode(".//w3org:a[@rel and @href]", m);
            if (aNode == null || aNode.Attributes == null || aNode.Attributes["href"] == null)
            {
                return false;
            }

            //1. user name
            cmtAuthorInfo.username = aNode.InnerText;

            //2. profile url
            //http://fiverr.com/azza1200
            cmtAuthorInfo.profileUrl = "http://fiverr.com" + aNode.Attributes["href"].Value;

            return true;
        }

        /// <summary>
        /// Checks whether a username is absent from the first column of the grid.
        /// </summary>
        /// <returns>true when no row holds the given username.</returns>
        private bool userNotExist(string username)
        {
            foreach (DataGridViewRow row in dgvCmtAuthorList.Rows)
            {
                // Empty cells (e.g. the "new row" placeholder) have a null Value.
                object cellValue = row.Cells[0].Value;
                if (cellValue != null && cellValue.ToString().Equals(username))
                {
                    return false;
                }
            }

            return true;
        }

        /// <summary>
        /// Appends the author to the grid unless the username is already listed,
        /// scrolls to the end and renumbers the row headers (1-based).
        /// </summary>
        void storeCommentAuthorInfo(gigCmtAuthorInfo cmtAuthorInfo)
        {
            if (!userNotExist(cmtAuthorInfo.username))
            {
                return;
            }

            dgvCmtAuthorList.Rows.Add(cmtAuthorInfo.username, cmtAuthorInfo.profileUrl);

            int lastRowIdx = dgvCmtAuthorList.Rows.Count - 1;
            dgvCmtAuthorList.Rows[lastRowIdx].Selected = true;
            dgvCmtAuthorList.FirstDisplayedScrollingRowIndex = lastRowIdx;

            for (int rowIdx = 0; rowIdx <= lastRowIdx; rowIdx++)
            {
                dgvCmtAuthorList.Rows[rowIdx].HeaderCell.Value = String.Format("{0}", rowIdx + 1);
            }
        }

        /// <summary>
        /// Exports the grid (header + rows) to ScrapedGigCommentsAuthorList.xls in the
        /// current directory via Excel interop, then selects the file in Explorer.
        /// </summary>
        private void btnSaveAll_Click(object sender, EventArgs e)
        {
            object misValue = System.Reflection.Missing.Value;

            string currentPath = System.Environment.CurrentDirectory;
            string outputFilename = "ScrapedGigCommentsAuthorList.xls";
            string fullFilename = Path.Combine(currentPath, outputFilename);

            Excel.Application xlApp = null;
            Excel.Workbook xlWorkBook = null;
            Excel.Worksheet xlWorkSheet = null;
            bool workbookClosed = false;

            try
            {
                // Exactly one Excel instance; creating a second one and dropping the
                // first leaves an orphaned Excel process behind.
                xlApp = new Excel.Application();
                xlWorkBook = xlApp.Workbooks.Add(misValue);
                xlWorkSheet = (Excel.Worksheet)xlWorkBook.Worksheets.get_Item(1);

                int columnCount = dgvCmtAuthorList.ColumnCount;

                //save header (Excel cells are 1-based)
                for (int col = 0; col < columnCount; col++)
                {
                    xlWorkSheet.Cells[1, col + 1] = dgvCmtAuthorList.Columns[col].HeaderText;
                }

                //save cells
                for (int row = 0; row < dgvCmtAuthorList.RowCount; row++)
                {
                    // Skip the empty placeholder row shown when the grid allows
                    // the user to add rows.
                    if (dgvCmtAuthorList.Rows[row].IsNewRow)
                    {
                        continue;
                    }

                    for (int col = 0; col < columnCount; col++)
                    {
                        DataGridViewCell cell = dgvCmtAuthorList[col, row];
                        xlWorkSheet.Cells[row + 2, col + 1] = cell.Value;
                    }
                }

                //formatting
                //header to bold
                Range headerRow = xlWorkSheet.get_Range("1:1", System.Type.Missing);
                headerRow.Font.Bold = true;
                //auto adjust column width (according to content)
                Range allColumn = xlWorkSheet.Columns;
                allColumn.AutoFit();

                xlWorkBook.SaveAs(
                    fullFilename,
                    Excel.XlFileFormat.xlWorkbookNormal,
                    misValue,
                    misValue,
                    misValue,
                    misValue,
                    Excel.XlSaveAsAccessMode.xlExclusive,
                    XlSaveConflictResolution.xlLocalSessionChanges,
                    misValue,
                    misValue,
                    misValue,
                    misValue);

                xlWorkBook.Close(true, misValue, misValue);
                workbookClosed = true;
            }
            finally
            {
                // On failure, drop the unsaved workbook so Quit() does not block on
                // a save prompt, then always shut Excel down and release the COM
                // wrappers.
                if (xlWorkBook != null && !workbookClosed)
                {
                    try
                    {
                        xlWorkBook.Close(false, misValue, misValue);
                    }
                    catch (System.Runtime.InteropServices.COMException)
                    {
                        // Best effort: the original exception is the one to report.
                    }
                }

                if (xlApp != null)
                {
                    xlApp.Quit();
                }

                releaseObject(xlWorkSheet);
                releaseObject(xlWorkBook);
                releaseObject(xlApp);
            }

            // Quote the path so directories containing spaces or commas still work.
            System.Diagnostics.Process.Start("Explorer.exe", "/select,\"" + fullFilename + "\"");
        }

        /// <summary>
        /// Releases a COM wrapper and forces a garbage collection; failures are
        /// shown in a message box. A null argument is ignored.
        /// </summary>
        private void releaseObject(object obj)
        {
            if (obj == null)
            {
                return;
            }

            try
            {
                System.Runtime.InteropServices.Marshal.ReleaseComObject(obj);
            }
            catch (Exception ex)
            {
                MessageBox.Show("Exception Occured while releasing object " + ex.ToString());
            }
            finally
            {
                GC.Collect();
            }
        }

        private void btnClearAll_Click(object sender, EventArgs e)
        {
            dgvCmtAuthorList.Rows.Clear();
        }

        /// <summary>
        /// Attempts to log in and enables the message controls only on success.
        /// </summary>
        private void btnLogin_Click(object sender, EventArgs e)
        {
            bool loginOk = loginFiverrCom(txbUsername.Text, txbPassword.Text);

            txbMessageToSend.Enabled = loginOk;
            btnSendMessage.Enabled = loginOk;
        }

        /// <summary>
        /// Placeholder for the fiverr.com login; not implemented.
        /// </summary>
        /// <returns>Always false.</returns>
        private bool loginFiverrCom(string username, string password)
        {
            bool loginOk = false;

            return loginOk;
        }
    }
}
【总结】
转载请注明:在路上 » 【代码分享】C#代码:ScrapeFiverrComments – 抓取fiverr.com中帖子的评论