The process:
At this point, the following code was already working:
```python
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-03-27 15:35:20
# Project: XiaohuashengApp

from pyspider.libs.base_handler import *
import os
import json
import codecs
import base64
import gzip
import copy
import time
# import datetime
from datetime import datetime, timedelta
from hashlib import md5

######################################################################
# Const
######################################################################

gServerPort = "http://www.xiaohuasheng.cn:83"

SelfReadingUrl = "http://www.xiaohuasheng.cn:83/Reading.svc/selfReadingBookQuery2"
ParentChildReadingUrl = "http://www.xiaohuasheng.cn:83/Reading.svc/parentChildReadingBookQuery2"
# ViewEnglishSeries2UrlPrefix = "http://www.xiaohuasheng.cn:83/Reading.svc/viewEnglishSeries2"

RESPONSE_OK = "1001"

######################################################################
# Config & Settings
######################################################################

OutputFolder = "/Users/crifan/dev/dev_root/company/xxx/projects/crawler_projects/crawler_xiaohuasheng_app/output"

DefaultPageSize = 10

gUserAgentNoxAndroid = "Mozilla/5.0 (Linux; U; Android 4.4.2; zh-cn; A0001 Build/KOT49H) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"

gUserId = "1134723"
gAuthorization = """NSTp9~)NwSfrXp@\\"""
gUserToken = "40d2267f-x-x-x-xxx"
gSecretKey = "AyGt7ohMR!yyy#N"

gHeaders = {
    "Host": "www.xiaohuasheng.cn:83",
    "User-Agent": gUserAgentNoxAndroid,
    "Content-Type": "application/json",
    "userId": gUserId,
    "Authorization": gAuthorization,
    # "timestamp": gTimestamp,
    # "signature": gSignature,
    "cookie": "ASP.NET_SessionId=dxf3obxgn5t4w350xp3icgy0",
    # "Cookie2": "$Version=1",
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate",
    "cache-control": "no-cache",
    "Connection": "keep-alive",
    # "content-length": "202",
}

######################################################################
# Common Util Functions
######################################################################

def getCurTimestamp(withMilliseconds=False):
    """
    get current time's timestamp
        (default) not milliseconds -> 10 digits: 1351670162
        with milliseconds -> 13 digits: 1531464292921
    """
    curDatetime = datetime.now()
    return datetimeToTimestamp(curDatetime, withMilliseconds)

def datetimeToTimestamp(datetimeVal, withMilliseconds=False):
    """
    convert datetime value to timestamp
    eg:
        "2006-06-01 00:00:00.123" -> 1149091200
        if with milliseconds -> 1149091200123
    :param datetimeVal:
    :return:
    """
    timetupleValue = datetimeVal.timetuple()
    timestampFloat = time.mktime(timetupleValue) # 1531468736.0 -> 10 digits
    timestamp10DigitInt = int(timestampFloat) # 1531468736
    timestampInt = timestamp10DigitInt

    if withMilliseconds:
        microsecondInt = datetimeVal.microsecond # 817762
        microsecondFloat = float(microsecondInt)/float(1000000) # 0.817762
        timestampFloat = timestampFloat + microsecondFloat # 1531468736.817762
        timestampFloat = timestampFloat * 1000 # 1531468736817.7621 -> 13 digits
        timestamp13DigitInt = int(timestampFloat) # 1531468736817
        timestampInt = timestamp13DigitInt

    return timestampInt

def extractSuffix(fileNameOrUrl):
    """
    extract file suffix from name or url
    eg:
        -> mp4
        15365514894833.srt -> srt
    """
    return fileNameOrUrl.split('.')[-1]

def createFolder(folderFullPath):
    """
    create folder, even if already existed
    Note: for Python 3.2+
    """
    os.makedirs(folderFullPath, exist_ok=True)
    print("Created folder: %s" % folderFullPath)

def saveDataToFile(fullFilename, binaryData):
    """save binary data into file"""
    with open(fullFilename, 'wb') as fp:
        fp.write(binaryData)
    print("Complete save file %s" % fullFilename)

def saveJsonToFile(fullFilename, jsonValue):
    """save json dict into file"""
    with codecs.open(fullFilename, 'w', encoding="utf-8") as jsonFp:
        json.dump(jsonValue, jsonFp, indent=2, ensure_ascii=False)
    print("Complete save json %s" % fullFilename)

def loadJsonFromFile(fullFilename):
    """load and parse json dict from file"""
    with codecs.open(fullFilename, 'r', encoding="utf-8") as jsonFp:
        jsonDict = json.load(jsonFp)
        print("Complete load json from %s" % fullFilename)
        return jsonDict

######################################################################
# Main
######################################################################

class Handler(BaseHandler):
    crawl_config = {
    }

    def on_start(self):
        jValueTemplateSelfReading = "{\"userId\":\"%s\",\"fieldName\":\"\",\"fieldValue\":\"全部类别\",\"grades\":\"\",\"levels\":\"\",\"supportingResources\":\"有音频\",\"offset\":%d,\"limit\":%d}"
        jValueTemplateParentChildReading = "{\"userId\":\"%s\",\"fieldName\":\"\",\"fieldValue\":\"全部类别\",\"theStageOfTheChild\":\"\",\"parentalEnglishLevel\":\"\",\"supportingResources\":\"有音频\",\"offset\":%d,\"limit\":%d}"

        paramDictSelfReading = {
            "curUrl": SelfReadingUrl,
            "offset": 0,
            "limit": DefaultPageSize,
            "jValueTemplate": jValueTemplateSelfReading
        }
        self.getBookQuery2(paramDictSelfReading)

        paramDictParentChildReading = {
            "curUrl": ParentChildReadingUrl,
            "offset": 0,
            "limit": DefaultPageSize,
            "jValueTemplate": jValueTemplateParentChildReading
        }
        self.getBookQuery2(paramDictParentChildReading)

    def getBookQuery2(self, curParamDict):
        print("getBookQuery2: curParamDict=%s" % curParamDict)
        curUrl = curParamDict["curUrl"]
        jValueTemplate = curParamDict["jValueTemplate"]
        offset = curParamDict["offset"]
        limit = curParamDict["limit"]

        jValueStr = jValueTemplate % (gUserId, offset, limit)
        jcJsonDict = {
            "J": jValueStr,
            "C": 0
        }
        jcJsonDictStr = json.dumps(jcJsonDict)
        curParamDict["jValueStr"] = jValueStr
        curParamDict["jcJsonDict"] = jcJsonDict
        curParamDict["jcJsonDictStr"] = jcJsonDictStr

        curHeaders = copy.deepcopy(gHeaders)
        curTimestampInt = getCurTimestamp()
        curTimestampStr = str(curTimestampInt)
        curHeaders["timestamp"] = curTimestampStr
        curSignature = self.generateSignature(curTimestampInt, jValueStr)
        curHeaders["signature"] = curSignature

        self.crawl(curUrl,
            method="POST",
            # data=jcJsonDict,
            data=jcJsonDictStr,
            # callback=curCallback,
            callback=self.getBookQuery2Callback,
            headers=curHeaders,
            save=curParamDict
        )

    def generateSignature(self, timestampInt, jValueOrUrlEndpoint):
        # print("generateSignature: timestampInt=%d, jValueOrUrlEndpoint=%s" % (timestampInt, jValueOrUrlEndpoint))
        # userId = "1134723"
        userId = gUserId
        timestamp = "%s" % timestampInt
        # localObject = "/Reading.svc/parentChildReadingBookQuery2"
        # localObject = jValueOrUrlEndpoint
        # userToken = "40d2267f-359e-4526-951a-66519e5868c3"
        userToken = gUserToken
        # fixedSault = "AyGt7ohMR!xx#N"
        # secretKey = "AyGt7ohMR!xx#N"
        secretKey = gSecretKey
        # strToCalc = userId + timestamp + localObject + jValueOrUrlEndpoint + fixedSault
        # strToCalc = timestamp + localObject + fixedSault
        strToCalc = userId + timestamp + jValueOrUrlEndpoint + userToken + secretKey
        # print("strToCalc=%s" % strToCalc)
        encodedStr = strToCalc.encode()
        # encodedStr = strToCalc.encode("UTF-8")
        # print("encodedStr=%s" % encodedStr)
        md5Result = md5(encodedStr)
        # print("md5Result=%s" % md5Result) # md5Result=<md5 HASH object @ 0x1044f1df0>
        # md5Result = md5()
        # md5Result.update(strToCalc)
        # md5Digest = md5Result.digest()
        # print("md5Digest=%s" % md5Digest)
        # print("len(md5Digest)=%s" % len(md5Digest))
        md5Hexdigest = md5Result.hexdigest()
        # print("md5Hexdigest=%s" % md5Hexdigest)
        # print("len(md5Hexdigest)=%s" % len(md5Hexdigest))
        # md5Hexdigest=c687d5dfa015246e6bdc6b3c27c2afea
        # print("md5=%s from %s" % (md5Hexdigest, strToCalc))
        return md5Hexdigest
        # return md5Digest

    def extractResponseData(self, respJson):
        """
        {
            "C": 2,
            "J": "H4sIAA.......AA=",
            "M": "1001",
            "ST": null
        }
        """
        # respJson = json.loads(respJson)
        respM = respJson["M"]
        if respM != RESPONSE_OK:
            return None

        encodedStr = respJson["J"]
        decodedStr = base64.b64decode(encodedStr)
        # print("decodedStr=%s" % decodedStr)
        decompressedStr = gzip.decompress(decodedStr)
        # print("decompressedStr=%s" % decompressedStr)
        decompressedStrUnicode = decompressedStr.decode("UTF-8")
        # print("decompressedStrUnicode=%s" % decompressedStrUnicode)
        decompressedJson = json.loads(decompressedStrUnicode)
        respDataDict = decompressedJson
        return respDataDict

    def getBookQuery2Callback(self, response):
        respUrl = response.url
        print("respUrl=%s" % respUrl)
        prevParaDict = response.save
        print("prevParaDict=%s" % prevParaDict)
        respJson = response.json
        print("respJson=%s" % respJson)
        respData = self.extractResponseData(respJson)
        print("respData=%s" % respData)
        if respData:
            newOffset = prevParaDict["offset"] + prevParaDict["limit"]
            prevParaDict["offset"] = newOffset
            self.getBookQuery2(prevParaDict)

            bookSeriesList = respData
            for eachBookSerie in bookSeriesList:
                print("eachBookSerie=%s" % eachBookSerie)
                self.getStorybookDetail(eachBookSerie)
        else:
            print("!!! %s return no more data: %s" % (response.url, respJson))

    def getStorybookDetail(self, bookSerieDict):
        print("getStorybookDetail: bookSerieDict=%s" % bookSerieDict)
        seriePrimayKey = bookSerieDict["pk"]
        urlEndpoint = "/Reading.svc/viewEnglishSeries2/%s/%s" % (gUserId, seriePrimayKey)
        fullUrl = "%s%s" % (gServerPort, urlEndpoint)
        # http://www.xiaohuasheng.cn:83/Reading.svc/viewEnglishSeries2/1134723/31
        print("urlEndpoint=%s, fullUrl=%s" % (urlEndpoint, fullUrl))

        curHeaders = copy.deepcopy(gHeaders)
        curTimestampInt = getCurTimestamp()
        curTimestampStr = str(curTimestampInt)
        curHeaders["timestamp"] = curTimestampStr
        curSignature = self.generateSignature(curTimestampInt, urlEndpoint)
        curHeaders["signature"] = curSignature

        self.crawl(fullUrl,
            method="GET",
            callback=self.getSerieDetailCallback,
            headers=curHeaders,
            save=bookSerieDict
        )

    def getSerieDetailCallback(self, response):
        respUrl = response.url
        print("respUrl=%s" % respUrl)
        bookSerieDict = response.save
        print("bookSerieDict=%s" % bookSerieDict)
        respJson = response.json
        print("respJson=%s" % respJson)
        respData = self.extractResponseData(respJson)
        print("respData=%s" % respData)
        respDict = respData[0]
        # respDict["url"] = response.url
        return respDict
```
This returned the data from selfReadingBookQuery2 and parentChildReadingBookQuery2, as well as the detail of each series.
However, when running the batch crawl, it always stopped after only 17 URLs, instead of the expected dozens or hundreds of URLs, and it never continued.
After a long search for the cause, it finally dawned on me: even though the offset and the JSON parameters changed on every request, the two POST URLs themselves never changed:
```python
SelfReadingUrl = "http://www.xiaohuasheng.cn:83/Reading.svc/selfReadingBookQuery2"
ParentChildReadingUrl = "http://www.xiaohuasheng.cn:83/Reading.svc/parentChildReadingBookQuery2"
```
and that caused the follow-up requests to stop being scheduled.
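For reference: pyspider's BaseHandler computes a task's ID from the URL alone by default, which is why requests to an identical URL collapse into one task no matter how the POST body changes. The default implementation is roughly this (paraphrased from pyspider/libs/base_handler.py):

```python
from pyspider.libs.utils import md5string

def get_taskid(self, task):
    """Generate taskid by md5 of the url (pyspider's default) -- override to change dedup."""
    return md5string(task['url'])
```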
So, referring to my own earlier write-up:
I tried adding a #hash with a different value each time (#someDiffValue) to the URL.
Code:
```python
# add hash value for url to force re-crawl when POST url not changed
timestampStr = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
curUrlWithHash = curUrl + "#" + timestampStr
# fakeItagForceRecrawl = "%s_%s_%s" % (timestampStr, offset, limit)
self.crawl(curUrlWithHash,
    # itag=fakeItagForceRecrawl, # To force re-crawl for next page
    method="POST",
    # data=jcJsonDict,
    data=jcJsonDictStr,
    # callback=curCallback,
    callback=self.getBookQuery2Callback,
    headers=curHeaders,
    save=curParamDict
)
```
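Why this works: the #fragment is never sent to the server, so the actual request is unchanged, but the URL string that pyspider hashes into the taskid now differs on every call, so each page is scheduled as a new task. A quick check (URL taken from the debug output below):

```python
from pyspider.libs.utils import md5string

urlNoHash = "http://www.xiaohuasheng.cn:83/Reading.svc/selfReadingBookQuery2"
urlWithHash = urlNoHash + "#20190409_162018_413205"

# Same request target, but two different taskids ->
# the second POST is no longer deduplicated away
print(md5string(urlNoHash))
print(md5string(urlWithHash))
```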
Debug output:
/selfReadingBookQuery2#20190409_162018_413205
Then I ran the batch crawl again to see whether it could keep crawling all the data.
It could indeed continue crawling the subsequent data:
But there were only 117:
which still did not feel right
-> because previously there were:

- self reading (自主阅读): 50-odd series
- parent-child reading (亲子阅读): 90-odd series

so the two categories of picture-book series together should add up to about 140.
Perhaps some URLs were duplicated between the two categories? That seemed to be the case.
After merging and deduplicating, it appears there are 117 picture-book series.
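A quick way to verify the overlap, assuming each series dict carries the "pk" primary key that getStorybookDetail uses (a sketch, not part of the crawler itself):

```python
def countUniqueSeries(selfReadingList, parentChildList):
    """Count unique picture-book series across both categories by primary key "pk"."""
    allPks = [eachBookSerie["pk"] for eachBookSerie in (selfReadingList + parentChildList)]
    uniquePks = set(allPks)
    print("total=%d, unique=%d, duplicated=%d" % (
        len(allPks), len(uniquePks), len(allPks) - len(uniquePks)))
    return uniquePks
```

With roughly 50-odd plus 90-odd raw entries, 117 unique series would mean that about 140 - 117 = 23 series appear in both categories.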
[Summary]
The fix here was to append a #hash to the URL, making sure the hash value differs each time, so that POSTing the same URL with a changing #hash yields a different URL string on every round, which lets pyspider keep crawling the remaining data.
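For reference, an alternative to the #hash trick that is often suggested for pyspider is to override get_taskid so that the POST body also participates in the taskid; a sketch, not tested here:

```python
import json

from pyspider.libs.base_handler import BaseHandler
from pyspider.libs.utils import md5string

class Handler(BaseHandler):
    def get_taskid(self, task):
        """Mix the POST data into the taskid, so the same URL with a
        different body is scheduled as a new task instead of being deduped."""
        return md5string(task['url'] + json.dumps(task['fetch'].get('data', '')))
```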
Please credit when reprinting: 在路上 » [Solved] PySpider cannot continue crawling the remaining picture-book data