Task:
While working on this, I had already roughly analyzed the APIs behind the picture-book data.
Now I needed to write code that simulates the app's requests and downloads the data.
First, make sure the first API, the one that fetches the picture-book list, can return data properly.
In the process:
Kept writing code.
Result:
With that list API confirmed, I could go on to write the code that actually crawls the data.
Along the way I ran into:
[Solved] PySpider simulating the Xiaohuasheng app's parentChildReadingBookQuery2 request returns empty data
and:
[Solved] The logic for generating timestamp and signature when the Xiaohuasheng app calls the parentChildReadingBookQuery2 API
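Based on what that post works out (and matching the generateSignature function in the full script further below), the signing boils down to a 10-digit timestamp plus an md5 over a fixed concatenation: for POST requests the J payload string is signed, for GET requests the URL endpoint path is signed. A minimal sketch, where userToken and secretKey are whatever one extracts from one's own captured session and the decompiled app:

import time
from hashlib import md5

def makeTimestampAndSignature(userId, userToken, secretKey, jValueOrUrlEndpoint):
    # 10-digit seconds timestamp, sent as the "timestamp" request header
    timestamp = str(int(time.time()))
    # md5 over userId + timestamp + (J payload string or URL endpoint) + userToken + secretKey;
    # the hex digest is sent as the "signature" request header
    strToCalc = userId + timestamp + jValueOrUrlEndpoint + userToken + secretKey
    signature = md5(strToCalc.encode("utf-8")).hexdigest()
    return timestamp, signature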
Then I went on to simulate the remaining selfReadingBookQuery2 request; a single signed request can be sanity-checked with the small standalone sketch below.
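This is only a sketch: it uses the third-party requests library (the real spider uses PySpider's self.crawl instead), and the userId/userToken/secretKey values have to come from one's own captured session; extra headers such as Authorization and the session cookie may also be required, as in the full header set in the script further below.

import json
import time
from hashlib import md5
import requests  # assumption: used only for this standalone check, not in the spider

url = "http://www.xiaohuasheng.cn:83/Reading.svc/selfReadingBookQuery2"
userId = "1134723"
userToken = "40d2267f-359e-4526-951a-66519e5868c3"
secretKey = "AyGt7ohMR!xx#N"

# the exact same J string is both signed and sent inside the POST body
jValue = ('{"userId":"%s","fieldName":"","fieldValue":"全部类别","grades":"","levels":"",'
          '"supportingResources":"有音频","offset":0,"limit":10}') % userId
timestamp = str(int(time.time()))
signature = md5((userId + timestamp + jValue + userToken + secretKey).encode("utf-8")).hexdigest()

headers = {
    "Content-Type": "application/json",
    "userId": userId,
    "timestamp": timestamp,
    "signature": signature,
    # Authorization / cookie values from the captured session may also be needed here
}
resp = requests.post(url, data=json.dumps({"J": jValue, "C": 0}), headers=headers)
print(resp.json())  # expect {"M": "1001", "J": "<base64 of gzipped json>", ...} on success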
and kept referring to:
[Solved] Using Charles + Postman + a Python decryption script to analyze the Xiaohuasheng app's picture-book APIs and their responses
to implement the remaining API requests.
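The "decryption" is not really encryption: as that post shows, the J field of a successful response (M == "1001") is just base64-encoded, gzip-compressed JSON. A minimal sketch of the decoding step, the same logic as extractResponseData in the full script below:

import base64
import gzip
import json

def decodeJField(respJson):
    # respJson looks like {"C": 2, "J": "H4sIAA....AA=", "M": "1001", "ST": null}
    if respJson.get("M") != "1001":
        return None
    rawBytes = base64.b64decode(respJson["J"])            # base64 -> gzip bytes
    jsonText = gzip.decompress(rawBytes).decode("utf-8")  # gzip -> JSON text
    return json.loads(jsonText)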
After that, the code was basically finished, and also tidied up and optimized:
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2019-03-27 15:35:20
# Project: XiaohuashengApp

from pyspider.libs.base_handler import *
import os
import json
import codecs
import base64
import gzip
import copy
import time
import re
from datetime import datetime, timedelta
from hashlib import md5

######################################################################
# Const
######################################################################

gServerPort = "http://www.xiaohuasheng.cn:83"
gResourcesRoot = "https://img.xiaohuasheng.cn"

SelfReadingUrl = "http://www.xiaohuasheng.cn:83/Reading.svc/selfReadingBookQuery2"
ParentChildReadingUrl = "http://www.xiaohuasheng.cn:83/Reading.svc/parentChildReadingBookQuery2"
# ViewEnglishSeries2UrlPrefix = "http://www.xiaohuasheng.cn:83/Reading.svc/viewEnglishSeries2"

RESPONSE_OK = "1001"

######################################################################
# Config & Settings
######################################################################

OutputFolder = "/Users/crifan/dev/dev_root/company/xxx/projects/crawler_projects/crawler_xiaohuasheng_app/output"

DefaultPageSize = 10

gUserAgentNoxAndroid = "Mozilla/5.0 (Linux; U; Android 4.4.2; zh-cn; A0001 Build/KOT49H) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"

gUserId = "1134723"
gAuthorization = """NSTp9~)NwSfrXp@\\"""
gUserToken = "40d2267f-359e-4526-951a-66519e5868c3"
gSecretKey = "AyGt7ohMR!xx#N"

gHeaders = {
    "Host": "www.xiaohuasheng.cn:83",
    "User-Agent": gUserAgentNoxAndroid,
    "Content-Type": "application/json",
    "userId": gUserId,
    "Authorization": gAuthorization,
    # "timestamp" and "signature" are filled in per request by generateCurrentHeaders
    "cookie": "ASP.NET_SessionId=dxf3obxgn5t4w350xp3icgy0",
    "Accept": "*/*",
    "Accept-Encoding": "gzip, deflate",
    "cache-control": "no-cache",
    "Connection": "keep-alive",
}

gParamLevelAll = -1
gFixParam1 = 1
gLongitude = "120.136174"
gLatitude = "28.997280"
gFixParam2 = 10

######################################################################
# Common Util Functions
######################################################################

def getCurTimestamp(withMilliseconds=False):
    """
        get current time's timestamp
        (default) not milliseconds -> 10 digits: 1351670162
        with milliseconds -> 13 digits: 1531464292921
    """
    curDatetime = datetime.now()
    return datetimeToTimestamp(curDatetime, withMilliseconds)

def datetimeToTimestamp(datetimeVal, withMilliseconds=False):
    """
        convert datetime value to timestamp
        eg: "2006-06-01 00:00:00.123" -> 1149091200
        if with milliseconds -> 1149091200123
    """
    timetupleValue = datetimeVal.timetuple()
    timestampFloat = time.mktime(timetupleValue)  # 1531468736.0 -> 10 digits
    timestamp10DigitInt = int(timestampFloat)     # 1531468736
    timestampInt = timestamp10DigitInt
    if withMilliseconds:
        microsecondInt = datetimeVal.microsecond            # 817762
        microsecondFloat = float(microsecondInt) / float(1000000)  # 0.817762
        timestampFloat = timestampFloat + microsecondFloat  # 1531468736.817762
        timestampFloat = timestampFloat * 1000               # 1531468736817.7621 -> 13 digits
        timestamp13DigitInt = int(timestampFloat)            # 1531468736817
        timestampInt = timestamp13DigitInt
    return timestampInt

def extractSuffix(fileNameOrUrl):
    """
        extract file suffix from name or url
        eg:
            https://cdn2.xxx.cn/2018-09-10/15365514898246.mp4 -> mp4
            15365514894833.srt -> srt
    """
    return fileNameOrUrl.split('.')[-1]

def createFolder(folderFullPath):
    """create folder, even if already existed. Note: needs Python 3.2+"""
    os.makedirs(folderFullPath, exist_ok=True)
    print("Created folder: %s" % folderFullPath)

def saveDataToFile(fullFilename, binaryData):
    """save binary data into file"""
    with open(fullFilename, 'wb') as fp:
        fp.write(binaryData)
    print("Complete save file %s" % fullFilename)

def saveJsonToFile(fullFilename, jsonValue):
    """save json dict into file"""
    with codecs.open(fullFilename, 'w', encoding="utf-8") as jsonFp:
        json.dump(jsonValue, jsonFp, indent=2, ensure_ascii=False)
        print("Complete save json %s" % fullFilename)

def loadJsonFromFile(fullFilename):
    """load and parse json dict from file"""
    with codecs.open(fullFilename, 'r', encoding="utf-8") as jsonFp:
        jsonDict = json.load(jsonFp)
        print("Complete load json from %s" % fullFilename)
        return jsonDict

######################################################################
# Project Specific Functions
######################################################################

def getSeriesFolder(seriesId):
    return os.path.abspath(os.path.join(OutputFolder, "series", str(seriesId)))

def getSeriesAudioPackagesFolder(seriesId):
    return os.path.abspath(os.path.join(getSeriesFolder(seriesId), "AudioPackages"))

def getSeriesBooksFolder(seriesId):
    return os.path.abspath(os.path.join(getSeriesFolder(seriesId), "Books"))

def getSingleAudioPackageFolder(seriesId, audioPackageId):
    return os.path.abspath(os.path.join(getSeriesAudioPackagesFolder(seriesId), str(audioPackageId)))

def getSingleAudioFolder(seriesId, audioPackageId, audioId):
    return os.path.abspath(os.path.join(getSingleAudioPackageFolder(seriesId, audioPackageId), str(audioId)))

def getSingleBookFolder(seriesId, bookId):
    return os.path.abspath(os.path.join(getSeriesBooksFolder(seriesId), str(bookId)))

######################################################################
# Main
######################################################################

class Handler(BaseHandler):
    crawl_config = {
    }

    #----------------------------------------
    # Util Functions
    #----------------------------------------

    def downloadFileCallback(self, response):
        fileInfo = response.save
        print("fileInfo=%s" % fileInfo)
        binData = response.content
        fileFullPath = os.path.join(fileInfo["saveFolder"], fileInfo["filename"])
        print("fileFullPath=%s" % fileFullPath)
        saveDataToFile(fileFullPath, binData)

    def downloadFile(self, fileInfo):
        urlToDownload = fileInfo["fileUrl"]
        print("urlToDownload=%s" % urlToDownload)
        self.crawl(urlToDownload, callback=self.downloadFileCallback, save=fileInfo)

    def generateSignature(self, timestampInt, jValueOrUrlEndpoint):
        """md5 over userId + timestamp + (J payload string or url endpoint) + userToken + secretKey"""
        userId = gUserId
        timestamp = "%s" % timestampInt
        userToken = gUserToken
        secretKey = gSecretKey
        strToCalc = userId + timestamp + jValueOrUrlEndpoint + userToken + secretKey
        encodedStr = strToCalc.encode()
        md5Result = md5(encodedStr)
        md5Hexdigest = md5Result.hexdigest()  # eg: c687d5dfa015246e6bdc6b3c27c2afea
        return md5Hexdigest

    def extractResponseData(self, respJson):
        """
        decode the real data from a response like:
        {
            "C": 2,
            "J": "H4sIAA.......AA=",
            "M": "1001",
            "ST": null
        }
        J is base64-encoded, gzip-compressed JSON
        """
        respM = respJson["M"]
        if respM != RESPONSE_OK:
            return None
        encodedStr = respJson["J"]
        decodedStr = base64.b64decode(encodedStr)
        decompressedStr = gzip.decompress(decodedStr)
        decompressedStrUnicode = decompressedStr.decode("UTF-8")
        decompressedJson = json.loads(decompressedStrUnicode)
        respDataDict = decompressedJson
        return respDataDict

    def generateCurrentHeaders(self, jValueOrUrlEndpoint):
        """build per-request headers: common headers + current timestamp + signature"""
        curHeaders = copy.deepcopy(gHeaders)
        curTimestampInt = getCurTimestamp()
        curTimestampStr = str(curTimestampInt)
        curHeaders["timestamp"] = curTimestampStr
        curSignature = self.generateSignature(curTimestampInt, jValueOrUrlEndpoint)
        curHeaders["signature"] = curSignature
        return curHeaders

    def dictValueStrToJson(self, originDict):
        """
        auto detect whether a field name is xxxJson or xxxArrayJson,
        then convert that field's json string value to dict/list, recursively
        """
        processedDict = originDict
        if isinstance(processedDict, dict):
            firstLevelKeys = processedDict.keys()
            for eachFieldName in firstLevelKeys:
                isArrayJson = re.match(r"\w+ArrayJson$", eachFieldName)
                isJson = re.match(r"\w+Json$", eachFieldName)
                if isArrayJson or isJson:
                    fieldValueJsonStr = processedDict[eachFieldName]
                    if fieldValueJsonStr:
                        fieldValueDict = json.loads(fieldValueJsonStr)
                    else:
                        fieldValueDict = None
                    fieldValueDict = self.dictValueStrToJson(fieldValueDict)
                    processedDict[eachFieldName] = fieldValueDict
        elif isinstance(originDict, list):
            newList = []
            for eachItem in originDict:
                processedItem = self.dictValueStrToJson(eachItem)
                newList.append(processedItem)
            processedDict = newList
        return processedDict

    #----------------------------------------
    # Crawl Logic
    #----------------------------------------

    def on_start(self):
        jValueTemplateSelfReading = "{\"userId\":\"%s\",\"fieldName\":\"\",\"fieldValue\":\"全部类别\",\"grades\":\"\",\"levels\":\"\",\"supportingResources\":\"有音频\",\"offset\":%d,\"limit\":%d}"
        jValueTemplateParentChildReading = "{\"userId\":\"%s\",\"fieldName\":\"\",\"fieldValue\":\"全部类别\",\"theStageOfTheChild\":\"\",\"parentalEnglishLevel\":\"\",\"supportingResources\":\"有音频\",\"offset\":%d,\"limit\":%d}"

        paramDictSelfReading = {
            "curUrl": SelfReadingUrl,
            "offset": 0,
            "limit": DefaultPageSize,
            "jValueTemplate": jValueTemplateSelfReading
        }
        self.getBookQuery2(paramDictSelfReading)

        paramDictParentChildReading = {
            "curUrl": ParentChildReadingUrl,
            "offset": 0,
            "limit": DefaultPageSize,
            "jValueTemplate": jValueTemplateParentChildReading
        }
        self.getBookQuery2(paramDictParentChildReading)

    def getBookQuery2(self, curParamDict):
        print("getBookQuery2: curParamDict=%s" % curParamDict)
        curUrl = curParamDict["curUrl"]
        jValueTemplate = curParamDict["jValueTemplate"]
        offset = curParamDict["offset"]
        limit = curParamDict["limit"]
        jValueStr = jValueTemplate % (gUserId, offset, limit)
        jcJsonDict = {
            "J": jValueStr,
            "C": 0
        }
        jcJsonDictStr = json.dumps(jcJsonDict)
        curParamDict["jValueStr"] = jValueStr
        curParamDict["jcJsonDict"] = jcJsonDict
        curParamDict["jcJsonDictStr"] = jcJsonDictStr
        curHeaders = self.generateCurrentHeaders(jValueStr)
        # add hash value to url to force re-crawl when the POST url itself does not change
        timestampStr = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        curUrlWithHash = curUrl + "#" + timestampStr
        fakeItagForceRecrawl = "%s_%s_%s" % (timestampStr, offset, limit)
        self.crawl(
            curUrlWithHash,
            itag=fakeItagForceRecrawl,  # to force re-crawl for next page
            method="POST",
            data=jcJsonDictStr,
            callback=self.getBookQuery2Callback,
            headers=curHeaders,
            save=curParamDict
        )

    def getBookQuery2Callback(self, response):
        respUrl = response.url
        print("respUrl=%s" % respUrl)
        prevParaDict = response.save
        print("prevParaDict=%s" % prevParaDict)
        respJson = response.json
        print("respJson=%s" % respJson)
        respData = self.extractResponseData(respJson)
        print("respData=%s" % respData)
        if respData:
            # request next page
            newOffset = prevParaDict["offset"] + prevParaDict["limit"]
            prevParaDict["offset"] = newOffset
            self.getBookQuery2(prevParaDict)

            # process current page of book series
            bookSeriesList = respData
            for eachBookSerie in bookSeriesList:
                print("eachBookSerie=%s" % eachBookSerie)
                self.getStorybookDetail(eachBookSerie)
        else:
            print("!!! %s return no more data: %s" % (response.url, respJson))

    def getStorybookDetail(self, bookSerieDict):
        print("getStorybookDetail: bookSerieDict=%s" % bookSerieDict)
        seriePrimayKey = bookSerieDict["pk"]
        urlEndpoint = "/Reading.svc/viewEnglishSeries2/%s/%s" % (gUserId, seriePrimayKey)
        fullUrl = "%s%s" % (gServerPort, urlEndpoint)
        # eg: http://www.xiaohuasheng.cn:83/Reading.svc/viewEnglishSeries2/1134723/31
        print("urlEndpoint=%s, fullUrl=%s" % (urlEndpoint, fullUrl))
        curHeaders = self.generateCurrentHeaders(urlEndpoint)
        self.crawl(
            fullUrl,
            method="GET",
            callback=self.getSerieDetailCallback,
            headers=curHeaders,
        )

    def getSerieDetailCallback(self, response):
        respUrl = response.url
        print("respUrl=%s" % respUrl)
        respJson = response.json
        print("respJson=%s" % respJson)
        respData = self.extractResponseData(respJson)
        print("respData=%s" % respData)
        respDict = respData[0]
        bookSeriesDict = respDict
        seriesId = bookSeriesDict["pk"]
        self.saveSeriesInfo(bookSeriesDict)

        # get audio
        audioPackagesParamDict = {
            "seriesId": seriesId,
            "level": gParamLevelAll,
            "fixParam1": gFixParam1,
            "offset": 0,
            "limit": DefaultPageSize
        }
        self.getSeriesAudioPackages(audioPackagesParamDict)

        # get book info
        bookParamDict = {
            "seriesId": seriesId,
            "level": gParamLevelAll,
            "offset": 0,
            "limit": DefaultPageSize
        }
        self.getSeriesBook(bookParamDict)

    def getSeriesBook(self, paramDict):
        urlEndpoint = "/Reading.svc/queryEnglishSeriesBook/%s/%s/%s/%s/%s" % \
            (gUserId, paramDict["seriesId"], paramDict["level"], paramDict["offset"], paramDict["limit"])
        print("urlEndpoint=%s" % urlEndpoint)
        fullUrl = "%s%s" % (gServerPort, urlEndpoint)
        # eg: http://www.xiaohuasheng.cn:83/Reading.svc/queryEnglishSeriesBook/1134723/31/-1/0/10
        curHeaders = self.generateCurrentHeaders(urlEndpoint)
        self.crawl(
            fullUrl,
            method="GET",
            callback=self.getSeriesBookCallback,
            headers=curHeaders,
            save=paramDict,
        )

    def getSeriesBookCallback(self, response):
        respUrl = response.url
        print("respUrl=%s" % respUrl)
        respJson = response.json
        print("respJson=%s" % respJson)
        respData = self.extractResponseData(respJson)
        print("respData=%s" % respData)
        if respData:
            # request next page of books
            prevParamDict = response.save
            curParamDict = prevParamDict
            curParamDict["offset"] += curParamDict["limit"]
            self.getSeriesBook(curParamDict)

            seriesId = curParamDict["seriesId"]
            seriesBookList = respData
            print("seriesBookList=%s" % seriesBookList)
            for eachBookDict in seriesBookList:
                print("eachBookDict=%s" % eachBookDict)
                curBookId = eachBookDict["pk"]
                self.getSingleBookInfo(seriesId, curBookId)
        else:
            print("!!! %s return no more data: %s" % (response.url, respJson))

    def getSingleBookInfo(self, seriesId, curBookId):
        urlEndpoint = "/Reading.svc/getServerBookInfo17/%s/%s/%s/%s/%s" % \
            (gUserId, gLongitude, gLatitude, curBookId, gFixParam2)
        print("urlEndpoint=%s" % urlEndpoint)
        fullUrl = "%s%s" % (gServerPort, urlEndpoint)
        # eg: http://www.xiaohuasheng.cn:83/Reading.svc/getServerBookInfo17/1134723/120.136174/28.997280/109512/10
        curHeaders = self.generateCurrentHeaders(urlEndpoint)
        self.crawl(
            fullUrl,
            method="GET",
            callback=self.getSingleBookInfoCallback,
            headers=curHeaders,
            save=seriesId,
        )

    def getSingleBookInfoCallback(self, response):
        seriesId = response.save
        print("seriesId=%s" % seriesId)
        respUrl = response.url
        print("respUrl=%s" % respUrl)
        respJson = response.json
        print("respJson=%s" % respJson)
        respData = self.extractResponseData(respJson)
        print("respData=%s" % respData)
        bookInfoDict = respData[0]
        print("bookInfoDict=%s" % bookInfoDict)
        self.saveSingleBookInfo(seriesId, bookInfoDict)

    def saveSingleBookInfo(self, seriesId, bookInfoDict):
        bookId = bookInfoDict["pk"]
        singleBooksFolder = getSingleBookFolder(seriesId, bookId)
        print("singleBooksFolder=%s" % singleBooksFolder)
        createFolder(singleBooksFolder)
        singleBookFilename = "series_%s_Books_%s_info.json" % (seriesId, bookId)
        singleBookFullPath = os.path.abspath(os.path.join(singleBooksFolder, singleBookFilename))
        bookInfoDict = self.dictValueStrToJson(bookInfoDict)
        saveJsonToFile(singleBookFullPath, bookInfoDict)

        # download and save: frontCover
        # eg: "frontCover": "149/Book/20160930171033.png",
        coverImageUrlTail = bookInfoDict["frontCover"]
        if coverImageUrlTail:
            coverImageFilename = ("Books_%s_" % bookId) + coverImageUrlTail.replace("/", "_")
            imageFileInfo = {
                "fileUrl": gResourcesRoot + "/" + coverImageUrlTail,
                "filename": coverImageFilename,
                "saveFolder": singleBooksFolder,
            }
            self.downloadFile(imageFileInfo)

    def saveSeriesInfo(self, bookSeriesDict):
        seriesId = bookSeriesDict["pk"]
        curSeriesFolder = getSeriesFolder(seriesId)
        print("curSeriesFolder=%s" % curSeriesFolder)
        createFolder(curSeriesFolder)
        filenamePrefix = "series_%s" % seriesId
        seriesFilename = "%s_info.json" % filenamePrefix
        seriesFullPath = os.path.abspath(os.path.join(curSeriesFolder, seriesFilename))
        bookSeriesDict = self.dictValueStrToJson(bookSeriesDict)
        saveJsonToFile(seriesFullPath, bookSeriesDict)

        # download series cover image
        """
        /series/623/series_623_info.json
            {
              "pk": 623,
              "englishTitle": "Peppa Pig",
              "chineseTitle": "小猪佩奇绘本集",
              "picture": "System/EnglishSeriesPicture/20190114112209525.jpg",
              ...
        /series/158/series_158_info.json
            {
              "pk": 158,
              "englishTitle": "An Elephant and Piggie Book",
              "chineseTitle": "小猪小象绘本系列",
              ...
              "picture": "",
              "lessonPlanFirstPictureUrl": "https://img.xiaohuasheng.cn/20180911145347266_80f5f443a43bb430663a71b381cde40e.jpg",
        """
        fileUrl = None
        coverImageUrlTail = bookSeriesDict["picture"]
        lessonPlanFirstPictureUrl = bookSeriesDict["lessonPlanFirstPictureUrl"]
        if coverImageUrlTail:
            coverImageFilename = filenamePrefix + coverImageUrlTail.replace("/", "_")
            fileUrl = gResourcesRoot + "/" + coverImageUrlTail
        elif lessonPlanFirstPictureUrl:
            coverImageFilename = filenamePrefix + "_" + lessonPlanFirstPictureUrl.split("/")[-1]
            fileUrl = lessonPlanFirstPictureUrl
        if fileUrl:
            imageFileInfo = {
                "fileUrl": fileUrl,
                "filename": coverImageFilename,
                "saveFolder": curSeriesFolder,
            }
            self.downloadFile(imageFileInfo)

    def getSeriesAudioPackages(self, paramDict):
        urlEndpoint = "/Reading.svc/queryEnglishSeriesAudio/%s/%s/%s/%s/%s/%s" % \
            (gUserId, paramDict["seriesId"], paramDict["level"], paramDict["fixParam1"], paramDict["offset"], paramDict["limit"])
        print("urlEndpoint=%s" % urlEndpoint)
        fullUrl = "%s%s" % (gServerPort, urlEndpoint)
        # eg: http://www.xiaohuasheng.cn:83/Reading.svc/queryEnglishSeriesAudio/1134723/31/-1/1/0/10
        """
        http://www.xiaohuasheng.cn:83/Reading.svc/getLevelForQueryEnglishSeriesAudio/1134723/31
        returns english series levels:
        [
            { "pk": -1, "name": "全部" },
            { "pk": 79, "name": "Level 1" },
            { "pk": 80, "name": "Level 2" },
            { "pk": 81, "name": "Level 3" }
        ]
        """
        curHeaders = self.generateCurrentHeaders(urlEndpoint)
        self.crawl(
            fullUrl,
            method="GET",
            callback=self.getSeriesAudioPackagesCallback,
            headers=curHeaders,
            save=paramDict,
        )

    def saveSeriesAudioPackagesInfo(self, seriesAudioPackagesInfo):
        print("saveSeriesAudioPackagesInfo: seriesAudioPackagesInfo=%s" % seriesAudioPackagesInfo)
        seriesId = seriesAudioPackagesInfo["seriesId"]
        curAudioPackagesFolder = getSeriesAudioPackagesFolder(seriesId)
        print("curAudioPackagesFolder=%s" % curAudioPackagesFolder)
        if not os.path.exists(curAudioPackagesFolder):
            createFolder(curAudioPackagesFolder)
        audioPackagesFilename = "series_%s_AudioPackages_info.json" % seriesId
        print("audioPackagesFilename=%s" % audioPackagesFilename)
        audioPackagesFullPath = os.path.abspath(os.path.join(curAudioPackagesFolder, audioPackagesFilename))
        if os.path.exists(audioPackagesFullPath):
            print("already existed %s" % audioPackagesFullPath)
            # append to existing info file
            prevAudioPackagesInfo = loadJsonFromFile(audioPackagesFullPath)
            prevSeriesId = prevAudioPackagesInfo["seriesId"]
            if prevSeriesId != seriesId:
                print("!!! Unexpected not same id for saving series audio info, old=%s, new=%s" % (prevSeriesId, seriesId))
            else:
                newAudioPackagesInfo = prevAudioPackagesInfo
                newAudioPackagesInfo["AudioPackages"].extend(seriesAudioPackagesInfo["AudioPackages"])
                saveJsonToFile(audioPackagesFullPath, newAudioPackagesInfo)
        else:
            print("not existed %s" % audioPackagesFullPath)
            # write new info file
            saveJsonToFile(audioPackagesFullPath, seriesAudioPackagesInfo)

    def getSeriesAudioPackagesCallback(self, response):
        respUrl = response.url
        print("respUrl=%s" % respUrl)
        respJson = response.json
        print("respJson=%s" % respJson)
        respData = self.extractResponseData(respJson)
        print("respData=%s" % respData)
        if respData:
            # request next page of audio packages
            prevParamDict = response.save
            curParamDict = prevParamDict
            curParamDict["offset"] += curParamDict["limit"]
            self.getSeriesAudioPackages(curParamDict)

            seriesAudioPackagesList = respData
            seriesId = curParamDict["seriesId"]
            seriesAudioPackagesInfo = {
                "seriesId": seriesId,
                "AudioPackages": seriesAudioPackagesList
            }
            self.saveSeriesAudioPackagesInfo(seriesAudioPackagesInfo)
            print("seriesAudioPackagesList=%s" % seriesAudioPackagesList)
            for eachAudioPackageDict in seriesAudioPackagesList:
                print("eachAudioPackageDict=%s" % eachAudioPackageDict)
                audioPackageId = eachAudioPackageDict["pk"]
                self.getAudioPackage(seriesId, audioPackageId)
        else:
            print("!!! %s return no more data: %s" % (response.url, respJson))

    def getAudioPackage(self, seriesId, audioPackageId):
        urlEndpoint = "/Reading.svc/viewAudioPackage/%s/%s/%s" % (gUserId, audioPackageId, gFixParam1)
        fullUrl = "%s%s" % (gServerPort, urlEndpoint)
        # eg: http://www.xiaohuasheng.cn:83/Reading.svc/viewAudioPackage/1134723/1808/1
        print("urlEndpoint=%s, fullUrl=%s" % (urlEndpoint, fullUrl))
        curHeaders = self.generateCurrentHeaders(urlEndpoint)
        self.crawl(
            fullUrl,
            method="GET",
            callback=self.getAudioPackageCallback,
            headers=curHeaders,
            save=seriesId
        )

    def getAudioPackageCallback(self, response):
        seriesId = response.save
        print("seriesId=%s" % seriesId)
        respUrl = response.url
        print("respUrl=%s" % respUrl)
        respJson = response.json
        print("respJson=%s" % respJson)
        respData = self.extractResponseData(respJson)
        print("respData=%s" % respData)
        audioPackageDict = respData[0]
        print("audioPackageDict=%s" % audioPackageDict)
        self.saveSingleAudioPackageInfo(seriesId, audioPackageDict)

        audioArrayJsonStr = audioPackageDict["audioArrayJson"]
        print("audioArrayJsonStr=%s" % audioArrayJsonStr)
        audioPackageId = audioPackageDict["pk"]
        # already converted from json string to list by dictValueStrToJson inside saveSingleAudioPackageInfo
        audioArrayDictList = audioArrayJsonStr
        print("audioArrayDictList=%s" % audioArrayDictList)
        for singleAudioDict in audioArrayDictList:
            print("singleAudioDict=%s" % singleAudioDict)
            singleAudioDict["seriesId"] = seriesId
            singleAudioDict["audioPackageId"] = audioPackageId
            self.saveSingleAudio(singleAudioDict)

    def saveSingleAudioPackageInfo(self, seriesId, audioPackageInfo):
        audioPackageId = audioPackageInfo["pk"]
        curSingleAudioPackageFolder = getSingleAudioPackageFolder(seriesId, audioPackageId)
        print("curSingleAudioPackageFolder=%s" % curSingleAudioPackageFolder)
        createFolder(curSingleAudioPackageFolder)
        filenamePrefix = "series_%s_AudioPackages_%s" % (seriesId, audioPackageId)
        singleAudioPackageFilename = "%s_info.json" % (filenamePrefix)
        singleAudioPackageFullPath = os.path.abspath(os.path.join(curSingleAudioPackageFolder, singleAudioPackageFilename))
        audioPackageInfo = self.dictValueStrToJson(audioPackageInfo)
        saveJsonToFile(singleAudioPackageFullPath, audioPackageInfo)

        # download cover image
        # case 1:
        #   "bookSeriesPicture": "EnglishLevelFrontCoverOrInnerPage/79/封面.jpg",
        # case 2:
        #   "picture": "attached/image/20190114/20190114103636_2075.jpg",
        #   "bookSeriesPicture": "",
        coverImageUrlTail = audioPackageInfo["picture"]
        print("coverImageUrlTail=%s" % coverImageUrlTail)
        if coverImageUrlTail:
            imageSuffix = coverImageUrlTail.split(".")[-1]
            imageFileInfo = {
                "fileUrl": gResourcesRoot + "/" + coverImageUrlTail,
                "filename": "%s_coverImage.%s" % (filenamePrefix, imageSuffix),
                "saveFolder": curSingleAudioPackageFolder,
            }
            self.downloadFile(imageFileInfo)

    def saveSingleAudio(self, singleAudioDict):
        seriesId = singleAudioDict["seriesId"]
        audioPackageId = singleAudioDict["audioPackageId"]
        audioId = singleAudioDict["pk"]
        curSingleAudioFolder = getSingleAudioFolder(seriesId, audioPackageId, audioId)
        print("curSingleAudioFolder=%s" % curSingleAudioFolder)
        createFolder(curSingleAudioFolder)
        filenamePrefix = "series_%s_AudioPackages_%s_audio_%s" % (seriesId, audioPackageId, audioId)
        singleAudioFilename = "%s_info.json" % (filenamePrefix)
        singleAudioFullPath = os.path.abspath(os.path.join(curSingleAudioFolder, singleAudioFilename))
        saveJsonToFile(singleAudioFullPath, singleAudioDict)
        """
        example singleAudioDict:
        {
            "pk": 6497,
            "picture": "EnglishLevelFrontCoverOrInnerPage/79/封面.jpg",
            "path": "Audio/1808/20180911222508831.mp3",
            "extension": ".mp3",
            "title": "1. Bear Hugs-Listen and Repeat",
            "size": 1735488,
            "duration": 433,
            "sizeString": "1.7M",
            "durationString": "07:13",
            "packageName": "Bear Hugs ",
            "seriesId": 31,
            "audioPackageId": 1808
        }
        """
        # download audio file
        # eg: "path": "Audio/1808/20180911222516379.mp3",
        audioFileUrlTail = singleAudioDict["path"]
        print("audioFileUrlTail=%s" % audioFileUrlTail)
        if audioFileUrlTail:
            audioFileInfo = {
                "fileUrl": gResourcesRoot + "/" + audioFileUrlTail,
                "filename": ("Audios_%s_" % audioId) + audioFileUrlTail.replace("/", "_"),
                "saveFolder": curSingleAudioFolder,
            }
            self.downloadFile(audioFileInfo)
When debugging locally, it was able to download all of the files I needed.
Then I ran it as a full batch download, and it failed with:
[Solved] PySpider reports an error during batch download runs: HTTP 599 Operation timed out after milliseconds with out of bytes received
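The details of that fix are in the post above; in general, the PySpider knobs for this kind of HTTP 599 timeout are the fetcher timeouts and retry count, which can be raised globally through crawl_config. A sketch, where the concrete numbers are assumptions rather than the exact values used:

from pyspider.libs.base_handler import *

class Handler(BaseHandler):
    crawl_config = {
        "connect_timeout": 60,  # default is 20 seconds for establishing the connection
        "timeout": 600,         # default is 120 seconds for the whole fetch; large mp3 files need more
        "retries": 5,           # default is 3
    }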