Need to crawl the car series and car model data for all brands on Autohome:
https://www.autohome.com.cn/car/
Now to write the Python crawler.
I already knew of several crawler frameworks; I have used Scrapy before, and it felt a bit complicated.
Heard that PySpider is quite good, so let's give it a try; what I mainly like is that you write the script in its web UI.
First, some research and comparison:
pyspider vs scrapy
What are the pros and cons of pyspider compared with scrapy? – Zhihu
First experience with the Python crawler framework pyspider – CSDN blog
pyspider crawler tutorial (1): HTML and CSS selectors | Binuxの杂货铺
The crawler framework Sasila: a bare-bones scrapy + webmagic + pyspider – Backend – Juejin
Let's just go ahead and use it:
pyspider
pyspider is a powerful open-source Python crawler system – pyspider Chinese site
More references:
https://cuiqingcai.com/2652.html
Let's try it.
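For reference: pyspider is installed with pip install pyspider and started with pyspider all, which by default serves the dashboard at http://localhost:5000. Creating a project there gives you a handler skeleton roughly like this (a sketch of the default template, written from memory):
<code>#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    crawl_config = {
    }

    @every(minutes=24 * 60)
    def on_start(self):
        # seed request; re-issued every 24 hours by @every
        self.crawl('http://example.com/', callback=self.index_page)

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        # follow every absolute link found on the page
        for each in response.doc('a[href^="http"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    def detail_page(self, response):
        # whatever a callback returns is treated as the result
        return {
            "url": response.url,
            "title": response.doc('title').text(),
        }
</code>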
Then, after clicking Run, click the Run button on detail_page:
Clicking web shows a blank page:
But clicking html does show the HTML:
Reference:
Keep digging into it.
Clicking back to the dashboard page:
Then go study how to use it:
Secondly, ran into another problem:
[Resolved] How to load the "view more" content of an Autohome page in pyspider
Later I switched to the Chrome browser and crawled the page again; clicking web now showed the page:
But compared with the normal content, obviously
content is still missing
-> Presumably the JS was not loaded?
Later, the web page content seemed to load normally after all.
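If a page really is rendered by JS, pyspider can render it with PhantomJS by passing fetch_type='js' to self.crawl. A minimal sketch (this needs a phantomjs binary on the PATH; the callback name here is just illustrative):
<code>    def on_start(self):
        # fetch_type='js' has pyspider render the page in PhantomJS first,
        # so JS-generated DOM nodes are visible to response.doc in the callback
        self.crawl("https://www.autohome.com.cn/car/",
                   fetch_type="js",
                   callback=self.index_page)
</code>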
Keep debugging.
The code I am using now:
<code>#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-04-22 15:14:42
# Project: autohomeCarData

from pyspider.libs.base_handler import *
import string


class Handler(BaseHandler):
    crawl_config = {
    }

    # @every(minutes=24 * 60)
    def on_start(self):
        # one brand index page per letter: a.html ... z.html
        for eachLetter in list(string.ascii_lowercase):
            self.crawl("https://www.autohome.com.cn/grade/carhtml/%s.html" % eachLetter,
                       callback=self.gradCarHtmlPage)

    def gradCarHtmlPage(self, response):
        picSeriesItemList = response.doc('.rank-list-ul li div a[href*="/pic/series"]').items()
        # print("len(picSeriesItemList)=%s" % (len(picSeriesItemList)))
        for each in picSeriesItemList:
            self.crawl(each.attr.href, callback=self.detail_page)

    # note: currently unused -- duplicates the crawl done in gradCarHtmlPage
    # @config(age=10 * 24 * 60 * 60)
    def picSeriesPage(self, response):
        # for each in response.doc('.rank-list-ul li div a[href^="//car.autohome.com.cn/pic/series/"]').items():
        for each in response.doc('.rank-list-ul li div a[href*="/pic/series"]').items():
            self.crawl(each.attr.href, callback=self.detail_page)

    @config(priority=2)
    def detail_page(self, response):
        # <a href="/pic/series-t/66.html">查看停产车型&nbsp;&gt;</a>
        # <a class="ckmore" href="/pic/series/588.html">查看在售车型&nbsp;&gt;</a>
        # <span class="fn-right">&nbsp;</span>
        fnRightPicSeries = response.doc('.search-pic-tbar .fn-right a[href*="/pic/series"]')
        print("fnRightPicSeries=", fnRightPicSeries)
        if fnRightPicSeries:
            # hrefValue = fnRightPicSeries.attr.href
            # fullPicSeriesUrl = "https://car.autohome.com.cn" + hrefValue
            fullPicSeriesUrl = fnRightPicSeries.attr.href
            print("fullPicSeriesUrl=", fullPicSeriesUrl)
            self.crawl(fullPicSeriesUrl, callback=self.detail_page)

        # continue to parse brand data
        aDictList = []
        # for eachA in response.doc('.breadnav a[href^="/"]').items():
        for eachA in response.doc('.breadnav a[href*="/pic/"]').items():
            eachADict = {
                "text": eachA.text(),
                "href": eachA.attr.href
            }
            print("eachADict=", eachADict)
            aDictList.append(eachADict)
        print("aDictList=", aDictList)

        mainBrandDict = aDictList[-1]
        subBrandDict = aDictList[-2]
        brandSerieDict = aDictList[-3]
        print("mainBrandDict=%s, subBrandDict=%s, brandSerieDict=%s" % (mainBrandDict, subBrandDict, brandSerieDict))

        dtTextList = []
        for eachDt in response.doc("dl.search-pic-cardl dt").items():
            dtTextList.append(eachDt.text())
        print("dtTextList=", dtTextList)
        groupCount = len(dtTextList)
        print("groupCount=", groupCount)

        ddUlEltList = []
        for eachDdUlElt in response.doc("dl.search-pic-cardl dd ul").items():
            ddUlEltList.append(eachDdUlElt)
        print("ddUlEltList=", ddUlEltList)

        fullModelNameList = []
        for curIdx in range(groupCount):
            curGroupTitle = dtTextList[curIdx]
            print("------[%d] %s" % (curIdx, curGroupTitle))
            for eachLiAElt in ddUlEltList[curIdx].items("li a"):
                # curModelName = eachLiAElt.text()
                curModelName = eachLiAElt.contents()[0]
                print("curModelName=", curModelName)
                curFullModelName = curGroupTitle + " " + curModelName
                print("curFullModelName=", curFullModelName)
                fullModelNameList.append(curFullModelName)
        print("fullModelNameList=", fullModelNameList)

        allSerieDictList = []
        for eachFullModelName in fullModelNameList:
            curSerieDict = {
                "品牌": mainBrandDict["text"],
                "子品牌": subBrandDict["text"],
                "车系": brandSerieDict["text"],
                "车型": eachFullModelName
            }
            allSerieDictList.append(curSerieDict)
        print("allSerieDictList=", allSerieDictList)
        return allSerieDictList
</code>
It can now return the JSON objects I need:
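Each element of the returned allSerieDictList is shaped like this (the field values here are made up, purely to illustrate the structure):
<code>{
    "品牌": "奥迪",
    "子品牌": "一汽-大众奥迪",
    "车系": "奥迪A4L",
    "车型": "2016款 45 TFSI quattro 运动型"
}
</code>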
Next up:
how to save the results to csv or excel
[Resolved] How to save the json result data to a csv or excel file in PySpider
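The gist of that post: override on_result in the handler and append each result row to a csv file yourself. A rough sketch (the file name and column list here are my own assumptions):
<code>from pyspider.libs.base_handler import *
import csv
import os

CSV_FILENAME = "autohome_car_data.csv"  # assumed output path
FIELD_NAMES = ["品牌", "子品牌", "车系", "车型"]  # assumed column order


class Handler(BaseHandler):
    # ... crawl callbacks as above ...

    def on_result(self, result):
        # on_result is invoked for every result a callback returns
        super(Handler, self).on_result(result)  # keep pyspider's default resultdb handling
        if not result:
            return
        isNewFile = not os.path.exists(CSV_FILENAME)
        with open(CSV_FILENAME, "a", encoding="utf-8", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=FIELD_NAMES, extrasaction="ignore")
            if isNewFile:
                writer.writeheader()  # header only once, for a fresh file
            writer.writerow(result)
</code>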
[Resolved] How to clear previously run data and currently running tasks in PySpider
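(The short version, as far as I found: the dashboard has no delete button; you set the project's group to delete and its status to STOP, and the scheduler then removes the project and its data after roughly 24 hours.)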
Then, since debug mode was on just now, sure enough:
when one of the requests hit an error:
it obviously failed:
<code>track.process 0.53ms
Document is empty
ibs/base_handler.py", line 155, in _run_func
    ret = function(*arguments[:len(args) - 1])
  File "<autohomeBrandData>", line 19, in gradCarHtmlPage
  File "/Users/crifan/.local/share/virtualenvs/AutocarData-xI-iqIq4/lib/python3.6/site-packages/pyspider/libs/response.py", line 144, in doc
    elements = self.etree
  File "/Users/crifan/.local/share/virtualenvs/AutocarData-xI-iqIq4/lib/python3.6/site-packages/pyspider/libs/response.py", line 160, in etree
    self._elements = lxml.html.fromstring(self.content)
  File "/Users/crifan/.local/share/virtualenvs/AutocarData-xI-iqIq4/lib/python3.6/site-packages/lxml/html/__init__.py", line 876, in fromstring
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
  File "/Users/crifan/.local/share/virtualenvs/AutocarData-xI-iqIq4/lib/python3.6/site-packages/lxml/html/__init__.py", line 765, in document_fromstring
    "Document is empty")
lxml.etree.ParserError: Document is empty

{
  "exception": "Document is empty",
  "follows": 0,
  "logs": "ibs/base_handler.py\", line 155, in _run_func\n ... lxml.etree.ParserError: Document is empty\n",
  "ok": false,
  "result": null,
  "time": 0.0005340576171875
}
</code>
Here, debug mode caused the run not to continue afterwards.
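One way to keep a single empty response from stalling things is to bail out of the callback before response.doc hands the body to lxml. A sketch (note @catch_status_code_error, the pyspider decorator that lets the callback run even when the fetch failed):
<code>    @catch_status_code_error
    def gradCarHtmlPage(self, response):
        # an empty body makes lxml raise "Document is empty", so skip it early
        if not response.content or not response.content.strip():
            print("skipping empty response for url=", response.url)
            return
        for each in response.doc('.rank-list-ul li div a[href*="/pic/series"]').items():
            self.crawl(each.attr.href, callback=self.picSeriesPage)
</code>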
Later I debugged the code some more, and am now using the following code:
<code>#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-04-27 21:53:02
# Project: autohomeBrandData

from pyspider.libs.base_handler import *
import string
import re


class Handler(BaseHandler):
    crawl_config = {
    }

    # @every(minutes=24 * 60)
    def on_start(self):
        for eachLetter in list(string.ascii_lowercase):
            self.crawl("https://www.autohome.com.cn/grade/carhtml/%s.html" % eachLetter,
                       callback=self.gradCarHtmlPage)

    @catch_status_code_error
    def gradCarHtmlPage(self, response):
        print("gradCarHtmlPage: response=", response)
        picSeriesItemList = response.doc('.rank-list-ul li div a[href*="/pic/series"]').items()
        print("picSeriesItemList=", picSeriesItemList)
        # print("len(picSeriesItemList)=%s" % (len(picSeriesItemList)))
        for each in picSeriesItemList:
            self.crawl(each.attr.href, callback=self.picSeriesPage)

    @config(priority=2)
    def picSeriesPage(self, response):
        # <a href="/pic/series-t/66.html">查看停产车型&nbsp;&gt;</a>
        # <a class="ckmore" href="/pic/series/588.html">查看在售车型&nbsp;&gt;</a>
        # <span class="fn-right">&nbsp;</span>
        fnRightPicSeries = response.doc('.search-pic-tbar .fn-right a[href*="/pic/series"]')
        print("fnRightPicSeries=", fnRightPicSeries)
        if fnRightPicSeries:
            fullPicSeriesUrl = fnRightPicSeries.attr.href
            print("fullPicSeriesUrl=", fullPicSeriesUrl)
            self.crawl(fullPicSeriesUrl, callback=self.picSeriesPage)

        # continue to parse brand data
        aDictList = []
        for eachA in response.doc('.breadnav a[href*="/pic/"]').items():
            eachADict = {
                "text": eachA.text(),
                "href": eachA.attr.href
            }
            print("eachADict=", eachADict)
            aDictList.append(eachADict)
        print("aDictList=", aDictList)

        # breadcrumb order: ... > brand > sub-brand > series
        mainBrandDict = aDictList[-3]
        subBrandDict = aDictList[-2]
        brandSerieDict = aDictList[-1]
        print("mainBrandDict=%s, subBrandDict=%s, brandSerieDict=%s" % (mainBrandDict, subBrandDict, brandSerieDict))

        dtTextList = []
        for eachDt in response.doc("dl.search-pic-cardl dt").items():
            dtTextList.append(eachDt.text())
        print("dtTextList=", dtTextList)
        groupCount = len(dtTextList)
        print("groupCount=", groupCount)

        ddUlEltList = []
        for eachDdUlElt in response.doc("dl.search-pic-cardl dd ul").items():
            ddUlEltList.append(eachDdUlElt)
        print("ddUlEltList=", ddUlEltList)

        modelDetailDictList = []
        for curIdx in range(groupCount):
            curGroupTitle = dtTextList[curIdx]
            print("------[%d] %s" % (curIdx, curGroupTitle))
            for eachLiAElt in ddUlEltList[curIdx].items("li a"):
                # 1. model name
                # curModelName = eachLiAElt.text()
                curModelName = eachLiAElt.contents()[0]
                curModelName = curModelName.strip()
                print("curModelName=", curModelName)
                curFullModelName = curGroupTitle + " " + curModelName
                print("curFullModelName=", curFullModelName)

                # 2. model id + carSeriesId + spec url
                curModelId = ""
                curSeriesId = ""
                curModelSpecUrl = ""
                modelSpecUrlTemplate = "https://www.autohome.com.cn/spec/%s/#pvareaid=2042128"
                curModelPicUrl = eachLiAElt.attr.href
                print("curModelPicUrl=", curModelPicUrl)
                # e.g. https://car.autohome.com.cn/pic/series-s32708/3457.html#pvareaid=2042220
                foundModelSeriesId = re.search(r"pic/series-s(?P<curModelId>\d+)/(?P<curSeriesId>\d+)\.html", curModelPicUrl)
                print("foundModelSeriesId=", foundModelSeriesId)
                if foundModelSeriesId:
                    curModelId = foundModelSeriesId.group("curModelId")
                    curSeriesId = foundModelSeriesId.group("curSeriesId")
                    print("curModelId=%s, curSeriesId=%s" % (curModelId, curSeriesId))
                    curModelSpecUrl = modelSpecUrlTemplate % curModelId
                    print("curModelSpecUrl=", curModelSpecUrl)

                # 3. model status
                modelStatus = "在售"
                foundStopSale = eachLiAElt.find('i[class*="icon-stopsale"]')
                if foundStopSale:
                    modelStatus = "停售"
                else:
                    foundWseason = eachLiAElt.find('i[class*="icon-wseason"]')
                    if foundWseason:
                        modelStatus = "未上市"

                modelDetailDictList.append({
                    "url": curModelSpecUrl,
                    "车系ID": curSeriesId,
                    "车型ID": curModelId,
                    "车型": curFullModelName,
                    "状态": modelStatus
                })
        print("modelDetailDictList=", modelDetailDictList)

        allSerieDictList = []
        for curIdx, eachModelDetailDict in enumerate(modelDetailDictList):
            curSerieDict = {
                "品牌": mainBrandDict["text"],
                "子品牌": subBrandDict["text"],
                "车系": brandSerieDict["text"],
                "车系ID": eachModelDetailDict["车系ID"],
                "车型": eachModelDetailDict["车型"],
                "车型ID": eachModelDetailDict["车型ID"],
                "状态": eachModelDetailDict["状态"]
            }
            allSerieDictList.append(curSerieDict)
            # print("before send_message: [%d] curSerieDict=%s" % (curIdx, curSerieDict))
            # self.send_message(self.project_name, curSerieDict, url=eachModelDetailDict["url"])
            print("[%d] curSerieDict=%s" % (curIdx, curSerieDict))
            self.crawl(eachModelDetailDict["url"], callback=self.carModelSpecPage, save=curSerieDict)
        # print("allSerieDictList=", allSerieDictList)
        # return allSerieDictList

    # def on_message(self, project, msg):
    #     print("on_message: msg=", msg)
    #     return msg

    @catch_status_code_error
    def carModelSpecPage(self, response):
        print("carModelSpecPage: response=", response)
        # https://www.autohome.com.cn/spec/32708/#pvareaid=2042128
        curSerieDict = response.save
        print("curSerieDict", curSerieDict)

        # cityDealerPriceInt = 0
        # cityDealerPriceElt = response.doc('.cardetail-infor-price #cityDealerPrice span span[class*="price"]')
        # print("cityDealerPriceElt=%s" % cityDealerPriceElt)
        # if cityDealerPriceElt:
        #     cityDealerPriceFloatStr = cityDealerPriceElt.text()
        #     cityDealerPriceFloat = float(cityDealerPriceFloatStr)
        #     cityDealerPriceInt = int(cityDealerPriceFloat * 10000)

        msrpPriceInt = 0
        # body > div.content > div.row > div.column.grid-16 > div.cardetail.fn-clear > div.cardetail-infor > div.cardetail-infor-price.fn-clear > ul > li.li-price.fn-clear > span
        # 厂商指导价 = 厂商建议零售价格 = MSRP = Manufacturer's Suggested Retail Price
        msrpPriceElt = response.doc('.cardetail-infor-price li[class*="li-price"] span[data-price]')
        print("msrpPriceElt=", msrpPriceElt)
        if msrpPriceElt:
            msrpPriceStr = msrpPriceElt.attr("data-price")
            print("msrpPriceStr=", msrpPriceStr)
            foundMsrpPrice = re.search(r"(?P<msrpPrice>[\d\.]+)万元", msrpPriceStr)
            print("foundMsrpPrice=", foundMsrpPrice)
            if foundMsrpPrice:
                msrpPrice = foundMsrpPrice.group("msrpPrice")
                print("msrpPrice=", msrpPrice)
                msrpPriceFloat = float(msrpPrice)
                print("msrpPriceFloat=", msrpPriceFloat)
                # the page quotes prices in units of 万元 (10,000 CNY)
                msrpPriceInt = int(msrpPriceFloat * 10000)
                print("msrpPriceInt=", msrpPriceInt)

        # curSerieDict["经销商参考价"] = cityDealerPriceInt
        curSerieDict["厂商指导价"] = msrpPriceInt
        return curSerieDict
</code>
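Note the hand-off pattern above: picSeriesPage attaches the partially built record to the request with save=curSerieDict, and carModelSpecPage gets it back as response.save, so one output row is assembled across two pages before being returned as the final result.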
After running it, click the exported csv:
and it saves the following result: