// valid charset:"GB18030"/"UTF-8", invliad:"UTF8" public string getUrlRespHtml(string url, Dictionary<string, string> headerDict = defHeaderDict, string charset = defCharset, Dictionary<string, string> postDict = defPostDict, int timeout = defTimeout, string postDataStr = defPostDataStr, int readWriteTimeout = defReadWriteTimeout) { string respHtml = ""; HttpWebResponse resp = getUrlResponse(url, headerDict, postDict, timeout, postDataStr, readWriteTimeout); //long realRespLen = resp.ContentLength; if (resp != null) { StreamReader sr; Stream respStream = resp.GetResponseStream(); if (!string.IsNullOrEmpty(charset)) { Encoding htmlEncoding = Encoding.GetEncoding(charset); sr = new StreamReader(respStream, htmlEncoding); } else { sr = new StreamReader(respStream); } try { respHtml = sr.ReadToEnd(); //while (!sr.EndOfStream) //{ // respHtml = respHtml + sr.ReadLine(); //} //string curLine = ""; //while ((curLine = sr.ReadLine()) != null) //{ // respHtml = respHtml + curLine; //} ////http://msdn.microsoft.com/zh-cn/library/system.io.streamreader.peek.aspx //while (sr.Peek() > -1) //while not error or not reach end of stream //{ // respHtml = respHtml + sr.ReadLine(); //} //respStream.Close(); //sr.Close(); //resp.Close(); } catch (Exception ex) { //【未解决】C#中StreamReader中遇到异常:未处理ObjectDisposedException,无法访问已关闭的流 //http://www.crifan.com/csharp_streamreader_unhandled_exception_objectdisposedexception_cannot_access_closed_stream //System.ObjectDisposedException respHtml = ""; } finally { if (respStream != null) { respStream.Close(); } if (sr != null) { sr.Close(); } if (resp != null) { resp.Close(); } } } return respHtml; }
As you can easily see, many of the parameters of getUrlRespHtml are very similar to those of getUrlResponse, introduced earlier in Section 9.5, "Getting the Response of a URL: getUrlResponse".
Still, the parameters of getUrlRespHtml deserve a brief explanation here:
The parameters url, headerDict, postDict, timeout, postDataStr, and readWriteTimeout have the same meaning as in getUrlResponse, so they are not repeated here.
The remaining parameter needs some explanation:
charset specifies which character encoding is used to decode the returned page content.
The default value of charset is defCharset.
The value of defCharset is:
private const string defCharset = null;
The reason defCharset is not one of the common values such as GBK or UTF-8 is to support the case where no charset is set: then no particular encoding is forced on the content read via the StreamReader.
This way you get the returned HTML as-is, so that whoever needs to can post-process it later, for example by decoding it themselves.
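For instance, a minimal contrast sketch (the URL is hypothetical): calling without charset leaves the StreamReader's default decoding in effect, while passing charset forces a specific encoding.

// Minimal sketch, hypothetical URL: omit charset to keep the StreamReader default,
// or pass charset to decode with a specific encoding such as GB18030.
string rawHtml = getUrlRespHtml("http://example.com/page.html");
string decodedHtml = getUrlRespHtml("http://example.com/page.html", charset: "GB18030");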
getUrlRespHtml already implements quite a lot of relatively complex functionality internally, which is worth explaining in detail:
getUrlRespHtml calls getUrlResponse internally, which already sets a corresponding User-Agent.
By default the IE8 User-Agent is used; the relevant code is:
//IE7
const string constUserAgent_IE7_x64 = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)";
//IE8
const string constUserAgent_IE8_x64 = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)";
//IE9
const string constUserAgent_IE9_x64 = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)"; // x64
const string constUserAgent_IE9_x86 = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"; // x86
//Chrome
const string constUserAgent_Chrome = "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.99 Safari/533.4";
//Mozilla Firefox
const string constUserAgent_Firefox = "Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:1.9.2.6) Gecko/20100625 Firefox/3.6.6";

private string gUserAgent;

gUserAgent = constUserAgent_IE8_x64;
req.UserAgent = gUserAgent;
So the request will not be treated by the server as an ordinary robot or spider crawler.
The relevant internal code:
req.AllowAutoRedirect = true;
Automatic redirects are enabled by default.
If you want to disable automatic redirects, add an "AllowAutoRedirect" entry with the value "false" to headerDict.
For more usage, see the examples later on.
The relevant internal code:
req.Headers["Accept-Encoding"] = "gzip, deflate"; //req.AutomaticDecompression = DecompressionMethods.GZip; req.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
This is the parameter explained earlier in Section 9.5.1.4, "getUrlResponse Parameter: timeout". It is the network-level timeout, related to GetResponse and GetRequestStream.
The relevant internal code is:
if (timeout > 0)
{
    req.Timeout = timeout;
}
This is the parameter explained earlier in Section 9.5.1.6, "getUrlResponse Parameter: readWriteTimeout". It is the read/write timeout of the StreamReader or StreamWriter, related to calls such as ReadLine.
The relevant internal code is:
if (readWriteTimeout > 0)
{
    //the default ReadWriteTimeout is 300000 ms = 300 seconds = 5 minutes -- too long,
    //so it is changed here to 30000 ms = 30 seconds,
    //to support a timeout for the later StreamReader's ReadToEnd
    req.ReadWriteTimeout = readWriteTimeout;
}
getUrlRespHtml also handles cookies automatically internally.
The relevant internal code is:
CookieCollection curCookies = null;
curCookies = new CookieCollection();

if (curCookies != null)
{
    req.CookieContainer = new CookieContainer();
    req.CookieContainer.PerDomainCapacity = 40; // following will exceed max default 20 cookie per domain
    req.CookieContainer.Add(curCookies);
}

resp = (HttpWebResponse)req.GetResponse();

updateLocalCookies(resp.Cookies, ref curCookies);
Note that the maximum was set to 40 cookies because, while working on InsertSkydriveFiles earlier, I hit a relatively extreme case: there were more than the default 20 cookies, so a single CookieContainer could no longer hold them all. The limit was therefore raised to 40 to accommodate that many cookies.
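To illustrate why this matters, here is a minimal sketch (the URLs are hypothetical): because cookies are stored and re-sent internally, a request that receives session cookies can simply be followed by another request that needs them.

// Minimal sketch, hypothetical URLs: cookies set by the first response are kept
// internally and re-sent automatically with the second request.
string loginPageHtml = getUrlRespHtml("http://example.com/login");
string memberHtml = getUrlRespHtml("http://example.com/member/home");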
getUrlRespHtml has quite a few parameters, and accordingly quite a few ways to use it.
The following examples show how to use this getUrlRespHtml function.
The most common, and simplest, usage of getUrlRespHtml is to pass in just the url and get back the returned HTML.
The code is as follows:
Example 9.7. getUrlRespHtml usage example: pass in only the url and get the HTML
string mainJsUrl = "http://image.songtaste.com/inc/main.js";
string respHtmlMainJs = getUrlRespHtml(mainJsUrl);
Internally, getUrlRespHtml automatically takes care of all kinds of details for you, such as cookies and headers like User-Agent, so you directly get the corresponding returned HTML.
Very often, when scraping web pages or emulating login, you need to specify some extra headers to achieve a particular goal.
For example, adding the corresponding Referer so as to correctly emulate the page logic and obtain the desired response:
string tmpRespHtml = ""; Dictionary<string, string> headerDict; //(1)to get cookies string pageRankMainUrl = "http://pagerank.webmasterhome.cn/"; tmpRespHtml = getUrlRespHtml(pageRankMainUrl); //(2)ask page rank string firstBaseUrl = "http://pagerank.webmasterhome.cn/?domain="; //http://pagerank.webmasterhome.cn/?domain=answers.yahoo.com string firstWholeUrl = firstBaseUrl + noHttpPreDomainUrl; headerDict = new Dictionary<string, string>(); headerDict.Add("referer", pageRankMainUrl); tmpRespHtml = getUrlRespHtml(firstWholeUrl, headerDict: headerDict);
As mentioned in Section 9.6.2.2, "Automatic Redirects Are Enabled by Default", redirects are followed automatically; to disable this, set it via a header:
Dictionary<string, string> headerDict = new Dictionary<string, string>();
headerDict.Add("AllowAutoRedirect", "false");
string respHtml = getUrlRespHtml(yourUrl, headerDict: headerDict);
The default Accept here is "*/*"; if you want to specify a different type, set it manually via a header:
Dictionary<string, string> headerDict = new Dictionary<string, string>();
headerDict.Add("Accept", "text/html");
string respHtml = getUrlRespHtml(yourUrl, headerDict: headerDict);
For other possible Accept values, refer to the official specification: 14.1 Accept
KeepAlive defaults to true here; if you do not want to keep the connection alive, disable it via a header:
Dictionary<string, string> headerDict = new Dictionary<string, string>();
headerDict.Add("Keep-Alive", "false");
string respHtml = getUrlRespHtml(yourUrl, headerDict: headerDict);
No Accept-Language is specified by default; if needed, set it via a header:
Dictionary<string, string> headerDict = new Dictionary<string, string>();
headerDict.Add("Accept-Language", "en-US"); //"zh-CN"
string respHtml = getUrlRespHtml(yourUrl, headerDict: headerDict);
For other possible Accept-Language values, refer to the official specification: 14.4 Accept-Language
As mentioned in Section 9.6.2.1, "The IE8 User-Agent Is Set Internally by Default", this getUrlRespHtml adds the IE8 User-Agent by default.
If needed, you can replace it with another one, for example the Firefox User-Agent:
//Mozilla Firefox
const string constUserAgent_Firefox = "Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:1.9.2.6) Gecko/20100625 Firefox/3.6.6";

Dictionary<string, string> headerDict = new Dictionary<string, string>();
headerDict.Add("User-Agent", constUserAgent_Firefox);
string respHtml = getUrlRespHtml(yourUrl, headerDict: headerDict);
User-Agent strings for the various browsers can be found online; you can also refer to the values in my code:
//IE7
const string constUserAgent_IE7_x64 = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)";
//IE8
const string constUserAgent_IE8_x64 = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; InfoPath.3; .NET4.0C; .NET4.0E)";
//IE9
const string constUserAgent_IE9_x64 = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)"; // x64
const string constUserAgent_IE9_x86 = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"; // x86
//Chrome
const string constUserAgent_Chrome = "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.99 Safari/533.4";
//Mozilla Firefox
const string constUserAgent_Firefox = "Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:1.9.2.6) Gecko/20100625 Firefox/3.6.6";
By default, no ContentType is specified for GET; for POST, it is already set to "application/x-www-form-urlencoded".
If you have other special needs and want to set the ContentType, do so via a header:
Dictionary<string, string> headerDict = new Dictionary<string, string>();
headerDict.Add("Content-Type", "text/plain");
string respHtml = getUrlRespHtml(yourUrl, headerDict: headerDict);
For other possible Content-Type values, refer to the official specification: 14.17 Content-Type
Quite often you also need to set other, non-standard header fields; these too can be passed via the headers.
For example, what was used earlier while working on InsertSkydriveFiles:
string createFolerUrl = "https://skydrive.live.com/API/2/AddFolder?lct=1";

Dictionary<string, string> headerDict = new Dictionary<string, string>();
headerDict.Add("Accept", "application/json");
headerDict.Add("Referer", constSkydriveUrl);
headerDict.Add("Canary", gCanary);
headerDict.Add("Appid", gAppid);
headerDict.Add("X-Requested-With", "XMLHttpRequest");
headerDict.Add("Cache-Control", "no-cache");

string postDataStr = genCreateFolderPostData(folderName, parentId, cid);

respJson = getUrlRespHtml(createFolerUrl, headerDict: headerDict, postDataStr: postDataStr);
Sometimes the page uses a particular encoding, so to decode the returned HTML correctly you need to specify the corresponding charset:
string songtasteUserUrl = "http://www.songtaste.com/user/351979/";
string songtasteHtmlCharset = "GB18030";
string respHtmlUnicode = getUrlRespHtml(songtasteUserUrl, charset: songtasteHtmlCharset);
This returns the corresponding, already decoded, Unicode string.
If the default network timeout of 30 seconds does not suit you, you can specify your own, for example:
int timeoutInMilliSec = 10 * 1000;
string respHtml = getUrlRespHtml(someUrl, timeout: timeoutInMilliSec);
If the default stream read/write timeout of 30 seconds does not suit you, you can specify your own, for example:
int streamRdWrTimeout = 20 * 1000;
string respHtml = getUrlRespHtml(someUrl, readWriteTimeout: streamRdWrTimeout);
When emulating login, POST is often used, and the corresponding POST data has to be passed along.
Here, there are two main ways to pass POST data:
Normally the data is passed in via postDict,
which is then converted internally by quoteParas into the corresponding post data, using "&" as the separator.
In a few special cases, postDataStr is used instead,
where the post data passed is separated by newlines. In that case, leave postDict unset (it defaults to null) and just set the corresponding postDataStr; see the short contrast sketch right after this list.
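As a quick orientation before the real examples, here is a minimal contrast sketch of the two ways (the URL and field names are hypothetical):

// (1) postDict: key/value pairs, converted internally by quoteParas into "user=abc&pwd=123"
Dictionary<string, string> postDict = new Dictionary<string, string>();
postDict.Add("user", "abc");
postDict.Add("pwd", "123");
string respHtml1 = getUrlRespHtml("http://example.com/login", postDict: postDict);

// (2) postDataStr: the raw post body, passed through as-is, e.g. newline-separated lines
string postDataStr = "callCount=1" + "\r\n" + "c0-id=0";
string respHtml2 = getUrlRespHtml("http://example.com/dwr/call", postDataStr: postDataStr);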
Below, several examples are given for each of the two cases to show how to use them:
For example, what was used earlier while working on Section 9.11, "Looking Up a Domain's Page Rank: getDomainPageRank":
//Method 1: use http://www.pagerankme.com/
queryUrl = "http://www.pagerankme.com/";

postDict = new Dictionary<string, string>();
postDict.Add("url", domainUrl);

respHtml = getUrlRespHtml(queryUrl, postDict: postDict);
For example, what was used earlier while working on DownloadSongtasteMusic:
const string stHtmlCharset = "GB18030"; Dictionary<string, string> headerDict = new Dictionary<string, string>(); headerDict.Add("x-requested-with", "XMLHttpRequest"); // when click play // access http://songtaste.com/time.php, post data: //str=5bf271ccad05f95186be764f725e9aaf07e0c7791a89123a9addb2a239179e64c91834c698a9c5d82f1ced3fe51ffc51&sid=3015123&t=0 Dictionary<string, string> postDict = new Dictionary<string, string>(); postDict.Add("str", str); postDict.Add("sid", sid); postDict.Add("t", "0"); string getRealAddrUrl = "http://songtaste.com/time.php"; songInfo.realAddr = crl.getUrlRespHtml(getRealAddrUrl, headerDict:headerDict, postDict:postDict, charset:stHtmlCharset);
For example, what I ran into earlier while working on "[Unsolved] 403 error when uploading a single file via the Baidu API" was that the post data is newline-separated, so the corresponding postDataStr has to be set directly:
string[] token = respTokenJson.Split(',');
string tokenStr = token[2].Split(':')[1].Trim('"');

byte[] fileBytes = null;
string filename = "fileForUpload2.txt";
string fullFilePath = @"d:\" + filename;
using (FileStream fs = new FileStream(fullFilePath, FileMode.Open))
{
    fileBytes = new byte[fs.Length];
    fs.Read(fileBytes, 0, fileBytes.Length);
}

StringBuilder buffer = new StringBuilder();
char[] fileCh = new char[fileBytes.Length];
for (int i = 0; i < fileBytes.Length; i++)
    fileCh[i] = (char)fileBytes[i];
buffer.Append(fileCh);

//postDict = new Dictionary<string, string>();
//postDict.Add("file", buffer.ToString());
string postDataStr = buffer.ToString();

string uploadSingleFileUrl = "https://pcs.baidu.com/rest/2.0/pcs/file?";
Dictionary<string, string> queryParaDict = new Dictionary<string, string>();
queryParaDict.Add("method", "upload");
queryParaDict.Add("access_token", tokenStr);
queryParaDict.Add("path", "/apps/测试应用/" + filename);
uploadSingleFileUrl += crifanLib.quoteParas(queryParaDict);

curCookies = crifanLib.getCurCookies();
newCookies = new CookieCollection();
foreach (Cookie ck in curCookies)
{
    if (ck.Name == "BAIDUID" || ck.Name == "BDUSS")
    {
        ck.Domain = "pcs.baidu.com";
    }
    newCookies.Add(ck);
}
crifanLib.setCurCookies(newCookies);

string boundaryValue = "----WebKitFormBoundaryS0JIa4uHF7yHd8xJ";
string boundaryExpression = "boundary=" + boundaryValue;

headerDict = new Dictionary<string, string>();
headerDict.Add("Pragma", "no-cache");
headerDict.Add("Content-Type", "multipart/form-data;" + " " + boundaryExpression);

postDataStr = boundaryValue + "\r\n"
    + "Content-Disposition: form-data; name=\"file\"" + "\r\n"
    + postDataStr + "\r\n"
    + boundaryValue;

//string str = crifanLib.getUrlRespHtml(
//    string.Format(@"https://pcs.baidu.com/rest/2.0/pcs/file?method=upload&path=%2Fapps%2F%E6%B5%8B%E8%AF%95%E5%BA%94%E7%94%A8%2F78.jpg&access_token={0}", tokenStr),
//    headerDict, postDict);
string respJson = crifanLib.getUrlRespHtml(uploadSingleFileUrl, headerDict: headerDict, postDataStr: postDataStr);
For example, what I ran into earlier while working on "[Notes] Adding support to BlogsToWordPress for exporting NetEase (163) mood posts" was that the post data is newline-separated, so the corresponding postDataStr has to be set directly:
string postDataStr = "callCount=1" + "\r\n" + "scriptSessionId=${scriptSessionId}187" + "\r\n" + "c0-scriptName=BlogBeanNew" + "\r\n" + "c0-methodName=getBlogs" + "\r\n" + "c0-id=0" + "\r\n" + "c0-param0=" + "number:" + userId + "\r\n" + "c0-param1=" + "number:" + startBlogIdx + "\r\n" + "c0-param2=" + "number:" + onceGetNum; //http://api.blog.163.com/ni_chen/dwr/call/plaincall/BlogBeanNew.getBlogs.dwr string getBlogsDwrMainUrl = blogApi163 + "/" + blogUser + "/" + "dwr/call/plaincall/BlogBeanNew.getBlogs.dwr"; Dictionary<string, string> headerDict = new Dictionary<string, string>(); headerDict = new Dictionary<string, string>(); //Referer http://api.blog.163.com/crossdomain.html?t=20100205 headerDict.Add("Referer", "http://api.blog.163.com/crossdomain.html?t=20100205"); headerDict.Add("Content-Type", "text/plain"); string blogsRespHtml = getUrlRespHtml(getBlogsDwrMainUrl, headerDict:headerDict, postDataStr:postDataStr);