折腾:
【记录】爬虫 爬数据 义务教育教科书 义教教科书
期间,继续去看看另外2本电子书:
去chrome中打开,调试看看。
这个是封面:
之后是其他几张图片:
最开始的几张图片,都预先加载了。
去搜索这些值怎么出来的。
搜封面图片的:
6956499_4689C78AA4AF40C6DFC8ACA2AABAB849
找到:
Request URL: https://biz.bookln.cn/ebookpageservices/queryAllPageByEbookId.do
返回的json,太长,此处拷贝出来,再去格式化后是:
摘录其中一部分:
{ "data": { "data": [{ "ebookId": 52365, "gmtCreate": 1582279296000, "gmtModified": 1582279296000, "id": 2578316, "imgurl": "https://yuntisyscdn.bookln.cn/server/ebook/pdf/bookln/52365/6956499_4689C78AA4AF40C6DFC8ACA2AABAB849.png", "isDelete": 0, "pageNo": 1, "status": 1, "userId": 6956499, "userName": "荣德基教育:马强" }, { "ebookId": 52365, "gmtCreate": 1582279296000, "gmtModified": 1582279296000, "id": 2578317, "imgurl": "https://yuntisyscdn.bookln.cn/server/ebook/pdf/bookln/52365/6956499_079BF84EAA676ED226D5B60E57B3B6BC.png", "isDelete": 0, "pageNo": 2, "status": 1, "userId": 6956499, "userName": "荣德基教育:马强" }, { "ebookId": 52365, "gmtCreate": 1582279296000, "gmtModified": 1582279296000, "id": 2578318, "imgurl": "https://yuntisyscdn.bookln.cn/server/ebook/pdf/bookln/52365/6956499_75158EE94FCFEB6AA46FB6E326D4EA1C.png", "isDelete": 0, "pageNo": 3, "status": 1, "userId": 6956499, "userName": "荣德基教育:马强" }, ... { "ebookId": 52365, "gmtCreate": 1582279297000, "gmtModified": 1582279297000, "id": 2578614, "imgurl": "https://yuntisyscdn.bookln.cn/server/ebook/pdf/bookln/52365/6956499_4D7D4F4BBBE81A0EE2A808D42CD676AF.png", "isDelete": 0, "pageNo": 299, "status": 1, "userId": 6956499, "userName": "荣德基教育:马强" }], "ebookConf": { "ebookId": 52365, "gmtCreate": xxx68000, "gmtModified": xxx68000, "id": 32364, "pageVoice": 1 }, "onlineStatus": 1, "description": "2020春 初中点拨 八年级英语(R版)", "thumbnails": "http://cdn11.bookln.cn/6956499_BFC33BDB77F6643454E86B8318EA281E.jpeg", "bookName": "2020春 初中点拨 八年级英语(R版)", "userId": 6956499 }, "success": true }
所以,去获取json,解析后,再挨个下载图片,同时保存图片名为pageNo的值
【未解决】模拟mp.codeup.cn中调用queryAllPageByEbookId.do返回json数据
暂时没把js代码转python。
所以只能是:
直接把Chrome调试得到json去处理和下载
# Function: # 电子样书 点拨 八年级英语下 # http://mp.codeup.cn/book/sample2.htm?id=52365&shelfId=4824&share_=6765370&sh=sh&vt_=1583111113754&_logined=1 # # 电子样书 点拔训练 八年级英语下 # http://mp.codeup.cn/book/sample2.htm?id=52489&shelfId=4822&share_=6765370&sh=sh&vt_=1583111131475 # 的图片 # Author: Crifan Li # Update: 20200303 import os import json # import copy import codecs import requests gBookIdList = [ "52365", "52489", ] UserAgent_Mac_Chrome = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.122 Safari/537.36" gHeaders = { "User-Agent": UserAgent_Mac_Chrome, "origin": "http://mp.codeup.cn", } gSaveFolder = os.path.join("output", "mp.codeup.cn") gInputFolder = os.path.join("input", "mp.codeup.cn", "queryAllPageByEbookId_resp") def createFolder(folderFullPath): """ create folder, even if already existed Note: for Python 3.2+ """ os.makedirs(folderFullPath, exist_ok=True) createFolder(gSaveFolder) # for eachBookId in gBookIdList: # getAllPageUrl = "https://biz.bookln.cn/ebookpageservices/queryAllPageByEbookId.do" # curHeaders = copy.deepcopy(gHeaders) # curHeaders["Content-Type"] = "application/x-www-form-urlencoded" # curHeaders["Accept"] = "application/json, text/javascript, */*; q=0.01" # curHeaders["referer"] = "http://mp.codeup.cn/book/sample2.htm?id=%s" % eachBookId # curHeaders["sec-fetch-dest"] = "empty" # curHeaders["sec-fetch-mode"] = "cors" # curHeaders["sec-fetch-site"] = "cross-site" # postDict = { # "ebookId": eachBookId # } # resp = requests.post(getAllPageUrl, headers=curHeaders, data=postDict) # print("resp=%s" % resp) def loadJsonFromFile(fullFilename, fileEncoding="utf-8"): """load and parse json dict from file""" with codecs.open(fullFilename, 'r', encoding=fileEncoding) as jsonFp: jsonDict = json.load(jsonFp) # logging.debug("Complete load json from %s", fullFilename) return jsonDict for eachBookId in gBookIdList: print("%s bookId=%s %s" % ('-'*30, eachBookId, '-'*30)) curOutputFolder = os.path.join(gSaveFolder, eachBookId) createFolder(curOutputFolder) curJsonFile = "%s.json" % eachBookId curJsonFullPath = os.path.join(gInputFolder, curJsonFile) curBookJsonDict = loadJsonFromFile(curJsonFullPath) dataDict = curBookJsonDict["data"] bookName = dataDict["bookName"] ebookConf = dataDict["ebookConf"] pageDictList = dataDict["data"] for eachPageDict in pageDictList: """ { "ebookId": 52365, "gmtCreate": 1582279297000, "gmtModified": 1582279297000, "id": 2578614, "imgurl": "https://yuntisyscdn.bookln.cn/server/ebook/pdf/bookln/52365/6956499_4D7D4F4BBBE81A0EE2A808D42CD676AF.png", "isDelete": 0, "pageNo": 299, "status": 1, "userId": 6956499, "userName": "荣德基教育:马强" } """ ebookId = eachPageDict["ebookId"] imgurl = eachPageDict["imgurl"] print("imgurl=%s" % imgurl) imgSuffix = imgurl.split(".")[-1] pageNo = eachPageDict["pageNo"] saveFilename = "%s_%03d.%s" % (ebookId, pageNo, imgSuffix) saveFullPath = os.path.join(curOutputFolder, saveFilename) if os.path.exists(saveFullPath): print("existed: %s" % saveFullPath) else: resp = requests.get(imgurl, headers=gHeaders) if resp.ok: with open(saveFullPath, 'wb') as saveFp: saveFp.write(resp.content) print(" Saved to %s" % saveFullPath)
即可下载到:
... imgurl=https://yuntisyscdn.bookln.cn/server/ebook/pdf/bookln/52489/6956499_AFD4EC4CD78BAA02A1179DEC7F9BEC27.png existed: output/mp.codeup.cn/52489/52489_181.png imgurl=https://yuntisyscdn.bookln.cn/server/ebook/pdf/bookln/52489/6956499_40DE1350C9DA75781C7F9E694A55FE15.png existed: output/mp.codeup.cn/52489/52489_182.png
一堆图片:
转载请注明:在路上 » 【已解决】爬取mp.codeup.cn中的英语教材电子书资源