#------------------------------------------------------------------------------ # check file validation: # open file url to check return info is match or not # with exception support # note: should handle while the file url is redirect # eg : # http://publish.it168.com/2007/0627/images/500754.jpg -> # http://img.publish.it168.com/2007/0627/images/500754.jpg # other special one: # sina pic url: # http://s14.sinaimg.cn/middle/3d55a9b7g9522d474a84d&690 # http://s14.sinaimg.cn/orignal/3d55a9b7g9522d474a84d # the real url is same with above url def isFileValid(fileUrl) : fileIsValid = False; errReason = "Unknown error"; try : #print "original fileUrl=",fileUrl; origFileName = fileUrl.split('/')[-1]; #print "origFileName=",origFileName; #old: https://ie2zeq.bay.livefilestore.com/y1mo7UWr-TrmqbBhkw52I0ii__WE6l2UtMRSTZHSky66-uDxnCdKPr3bdqVrpUcQHcoJLedlFXa43bvCp_O0zEGF3JdG_yZ4wRT-c2AQmJ_TNcWvVZIXfBDgGerouWyx19WpA4I0XQR1syRJXjDNpwAbQ/IMG_5214_thumb[1].jpg #new: https://kxoqva.bay.livefilestore.com/y1mQlGjwNAYiHKoH5Aw6TMNhsCmX2YDR3vPKnP86snuqQEtnZgy3dHkwUvZ61Ah8zU3AGiS4whmm_ADrvxdufEAfMGo56KjLdhIbosn9F34olQ/IMG_5214_thumb%5b1%5d.jpg unquotedOrigFilenname = urllib.unquote(origFileName); #print "unquotedOrigFilenname=",unquotedOrigFilenname lowUnquotedOrigFilename = unquotedOrigFilenname.lower(); #print "lowUnquotedOrigFilename=",lowUnquotedOrigFilename; resp = urllib2.urlopen(fileUrl, timeout=gConst['defaultTimeout']); # note: Python 2.6 has added timeout support. 
#print "resp=",resp; realUrl = resp.geturl(); #print "realUrl=",realUrl; newFilename = realUrl.split('/')[-1]; #print "newFilename=",newFilename; #http://blog.sina.com.cn/s/blog_696e50390100ntxs.html unquotedNewFilename = urllib.unquote(newFilename); #print "unquotedNewFilename=",unquotedNewFilename; unquotedLowNewFilename = unquotedNewFilename.lower(); #print "unquotedLowNewFilename=",unquotedLowNewFilename; respInfo = resp.info(); #print "respInfo=",respInfo; respCode = resp.getcode(); #print "respCode=",respCode; # special: # http://116.img.pp.sohu.com/images/blog/2007/5/24/17/24/11355bf42a9.jpg # return no content-length #contentLen = respInfo['Content-Length']; # for redirect, if returned size>0 and filename is same, also should be considered valid #if (origFileName == newFilename) and (contentLen > 0): # for redirect, if returned response code is 200(OK) and filename is same, also should be considered valid #if (origFileName == newFilename) and (respCode == 200): if (lowUnquotedOrigFilename == unquotedLowNewFilename) and (respCode == 200): fileIsValid = True; else : fileIsValid = False; # eg: Content-Type= image/gif, ContentTypes : audio/mpeg # more ContentTypes can refer: http://kenya.bokee.com/3200033.html contentType = respInfo['Content-Type']; errReason = "file url returned info: type=%s, len=%d, realUrl=%s"%(contentType, contentLen, realUrl); except urllib2.URLError,reason : fileIsValid = False; errReason = reason; except urllib2.HTTPError,code : fileIsValid = False; errReason = code; except : fileIsValid = False; errReason = "Unknown error"; # here type(errReason)= <class 'urllib2.HTTPError'>, so just convert it to str errReason = str(errReason); return (fileIsValid, errReason);
#------------------------------------------------------------------------------ # download from fileUrl then save to fileToSave # with exception support # note: the caller should make sure the fileUrl is a valid internet resource/file def downloadFile(fileUrl, fileToSave, needReport = False) : isDownOK = False; downloadingFile = ''; #--------------------------------------------------------------------------- # note: totalFileSize -> may be -1 on older FTP servers which do not return a file size in response to a retrieval request def reportHook(copiedBlocks, blockSize, totalFileSize) : #global downloadingFile if copiedBlocks == 0 : # 1st call : once on establishment of the network connection print 'Begin to download %s, total size=%d'%(downloadingFile, totalFileSize); else : # rest call : once after each block read thereafter print 'Downloaded bytes: %d' % ( blockSize * copiedBlocks); return; #--------------------------------------------------------------------------- try : if fileUrl : downloadingFile = fileUrl; if needReport : urllib.urlretrieve(fileUrl, fileToSave, reportHook); else : urllib.urlretrieve(fileUrl, fileToSave); isDownOK = True; else : print "Input download file url is NULL"; except urllib.ContentTooShortError(msg) : isDownOK = False; except : isDownOK = False; return isDownOK;
例 2.21. downloadFile的使用范例
if dstPicFile and downloadFile(curUrl, dstPicFile) : # replace old url with new url
#------------------------------------------------------------------------------ # manually download fileUrl then save to fileToSave def manuallyDownloadFile(fileUrl, fileToSave) : isDownOK = False; downloadingFile = ''; try : if fileUrl : # 1. find real address #print "fileUrl=",fileUrl; resp = urllib2.urlopen(fileUrl, timeout=gConst['defaultTimeout']); #print "resp=",resp; realUrl = resp.geturl(); # not same with original file url if redirect # if url is invalid, then add timeout can avoid dead respHtml = getUrlRespHtml(realUrl, useGzip=False, timeout=gConst['defaultTimeout']); isDownOK = saveBinDataToFile(respHtml, fileToSave); else : print "Input download file url is NULL"; except urllib.ContentTooShortError(msg) : isDownOK = False; except : isDownOK = False; return isDownOK;
例 2.22. manuallyDownloadFile的使用范例
#if dstPicFile and downloadFile(curUrl, dstPicFile) : # urlretrieve in downloadFile is too slow while download QQ Space Picture # so here use manuallyDownloadFile instead if dstPicFile and manuallyDownloadFile(curUrl, dstPicFile) : # replace old url with new url
#------------------------------------------------------------------------------
# get response from url
# note: if you have already installed a cookiejar opener, it is used
# automatically by urllib2.urlopen
def getUrlResponse(url, postDict={}, headerDict={}, timeout=0, useGzip=False) :
    """Open url and return the urllib2 response object.

    postDict   -- if non-empty, urlencoded and sent as a POST body
    headerDict -- extra request headers; added last so they can overwrite defaults
    timeout    -- socket timeout in seconds; 0 means use the global default
    useGzip    -- if True, advertise gzip/deflate support to the server
    """
    # make sure url is str, not unicode, otherwise urllib2.urlopen will error
    url = str(url);

    if (postDict) :
        postData = urllib.urlencode(postDict);
        req = urllib2.Request(url, postData);
        req.add_header('Content-Type', "application/x-www-form-urlencoded");
    else :
        req = urllib2.Request(url);

    defHeaderDict = {
        'User-Agent'    : gConst['userAgentIE9'],
        'Cache-Control' : 'no-cache',
        'Accept'        : '*/*',
        'Connection'    : 'Keep-Alive',
    };

    # add default headers first
    for eachDefHd in defHeaderDict.keys() :
        req.add_header(eachDefHd, defHeaderDict[eachDefHd]);

    if(useGzip) :
        req.add_header('Accept-Encoding', 'gzip, deflate');

    # add customized headers last -> allow them to overwrite the defaults
    # (note: the original also added headerDict BEFORE the defaults, which was
    # redundant -- add_header replaces by header name, so only this pass matters)
    if(headerDict) :
        for key in headerDict.keys() :
            req.add_header(key, headerDict[key]);

    if(timeout > 0) :
        # set timeout value if necessary
        resp = urllib2.urlopen(req, timeout=timeout);
    else :
        resp = urllib2.urlopen(req);

    return resp;
例 2.23. getUrlResponse的使用范例
resp = getUrlResponse(url, postDict, headerDict, timeout, useGzip); respHtml = resp.read();
#------------------------------------------------------------------------------
# get response html==body from url
def getUrlRespHtml(url, postDict={}, headerDict={}, timeout=0, useGzip=True) :
    """Fetch url and return the response body, transparently decompressing
    it when the server really answered with gzip-encoded content.

    Some servers ignore our "Accept-Encoding: gzip, deflate" and send plain
    html without a "Content-Encoding: gzip" header
    (eg http://blog.sina.com.cn/s/comment_730793bf010144j7_3.html),
    so we only decompress when the header proves the body is gzipped.
    """
    response = getUrlResponse(url, postDict, headerDict, timeout, useGzip);
    body = response.read();

    if not useGzip :
        return body;

    respInfo = response.info();
    # typical gzipped response headers include:
    #   Content-Type: text/html
    #   Vary: Accept-Encoding
    #   Content-Encoding: gzip
    serverGzipped = ("Content-Encoding" in respInfo) and (respInfo['Content-Encoding'] == "gzip");
    if serverGzipped :
        # 16+MAX_WBITS tells zlib to expect a gzip wrapper
        body = zlib.decompress(body, 16 + zlib.MAX_WBITS);

    return body;
例 2.25. getUrlRespHtml的使用范例:带额外参数
modifyUrl = gVal['blogEntryUrl'] + "/blog/submit/modifyblog"; #logging.debug("Modify Url is %s", modifyUrl); #http://hi.baidu.com/wwwhaseecom/blog/item/79188d1b4fa36f068718bf79.html foundSpBlogID = re.search(r"blog/item/(?P<spBlogID>\w+?).html", url); if(foundSpBlogID) : spBlogID = foundSpBlogID.group("spBlogID"); logging.debug("Extracted spBlogID=%s", spBlogID); else : modifyOk = False; errInfo = "Can't extract post spBlogID !"; return (modifyOk, errInfo); newPostContentGb18030 = newPostContentUni.encode("GB18030"); categoryGb18030 = infoDict['category'].encode("GB18030"); titleGb18030 = infoDict['title'].encode("GB18030"); postDict = { "bdstoken" : gVal['spToken'], "ct" : "1", "mms_flag" : "0", "cm" : "2", "spBlogID" : spBlogID, "spBlogCatName_o": categoryGb18030, # old catagory "edithid" : "", "previewImg" : "", "spBlogTitle" : titleGb18030, "spBlogText" : newPostContentGb18030, "spBlogCatName" : categoryGb18030, # new catagory "spBlogPower" : "0", "spIsCmtAllow" : "1", "spShareNotAllow":"0", "spVcode" : "", "spVerifyKey" : "", } headerDict = { # 如果不添加Referer,则返回的html则会出现错误:"数据添加的一般错误" "Referer" : gVal['blogEntryUrl'] + "/blog/modify/" + spBlogID, } respHtml = getUrlRespHtml(modifyUrl, postDict, headerDict);
因为成功登录某网页后,一般都会有对应的cookie返回,所以常用此函数去判断是否成功登录某网页。
#------------------------------------------------------------------------------
# check whether every cookie name in cookieNameList exists in cookieJar
def checkAllCookiesExist(cookieNameList, cookieJar) :
    """Return True only if each name in cookieNameList matches the .name of
    some cookie in cookieJar (an empty name list trivially returns True)."""
    # map each wanted cookie name -> found flag, initially all False
    foundByName = dict((name, False) for name in cookieNameList);

    # single pass over the jar, marking every wanted name we encounter
    for cookie in cookieJar :
        if cookie.name in foundByName :
            foundByName[cookie.name] = True;

    # valid only when no wanted name is still unmarked
    for wasFound in foundByName.values() :
        if not wasFound :
            return False;
    return True;
例 2.26. checkAllCookiesExist的使用范例
#http://www.darlingtree.com/wordpress/archives/242 gVal['cj'] = cookielib.CookieJar(); opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(gVal['cj'])); urllib2.install_opener(opener); resp = urllib2.urlopen(baiduSpaceEntryUrl); loginBaiduUrl = "https://passport.baidu.com/?login"; #username=%D0%C4%C7%E9%C6%DC%CF%A2%B5%D8&password=xxx&mem_pass=on postDict = { 'username' : username, 'password' : password, 'mem_pass' : 'on', }; resp = getUrlResponse(loginBaiduUrl, postDict); # check whether the cookie is OK cookieNameList = ["USERID", "PTOKEN", "STOKEN"]; loginOk = checkAllCookiesExist(cookieNameList, gVal['cj']); if (not loginOk) : logging.error("Login fail for not all expected cookies exist !"); return loginOk;