折腾:
【记录】用PySpider去爬取scholastic的绘本书籍数据
期间,已经写好代码,爬取出来要的数据了:
但是需要去保存到MongoDB中。
先去折腾:
【已解决】根据参数自动生成MongoDB的MongoClient的Uri参数
然后再去:
【已解决】Mac中保存json数据到本地MongoDB
然后就可以集成到PySpider中,且不需要额外用ResultWorker了
【总结】
然后用代码:
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-10-10 14:50:08
# Project: ScholasticStorybook
"""PySpider project that stores every crawled result into a local MongoDB.

The PySpider ``Handler`` overrides ``on_result`` so that each result is first
written to MongoDB through ``ResultMongo`` and then handed to PySpider's
default result handling.
"""

from pyspider.libs.base_handler import *

import re
import json
# import html
import lxml
from bs4 import BeautifulSoup
from urllib.parse import quote_plus
from pymongo import MongoClient

HostUrl = "https://www.scholastic.com"
# RowsPerPage = 24
RowsPerPage = 72

MONGODB_DB_NAME = "Scholastic"
MONGODB_COLLECTION_NAME = "Storybook"

######################################################################
# Common Util Functions
######################################################################


def generateMongoUri(host=None,
                     port=None,
                     isUseAuth=False,
                     username=None,
                     password=None,
                     authSource=None,
                     authMechanism=None):
    """Generate a MongoDB connection URI.

    Args:
        host: server host name; falls back to ``"localhost"`` when falsy.
        port: server port; falls back to ``27017`` when falsy.
        isUseAuth: when true, embed ``username``/``password`` in the URI and
            honour ``authSource``/``authMechanism``.
        username: user name, percent-encoded with ``quote_plus``.
        password: password, percent-encoded with ``quote_plus``.
        authSource: database appended as the URI path (auth only).
        authMechanism: value of the ``authMechanism`` query option (auth only).

    Returns:
        A URI such as ``mongodb://localhost:27017`` or
        ``mongodb://user:quoted_password@host:port/authSource?authMechanism=...``.

    Raises:
        TypeError: if ``isUseAuth`` is true but ``username`` or ``password``
            is ``None``.
    """
    if not host:
        # host = "127.0.0.0"
        host = "localhost"

    if not port:
        port = 27017

    mongodbUri = "mongodb://%s:%s" % (host, port)
    # 'mongodb://localhost:27017'
    # 'mongodb://xxx:27017'

    # NOTE(review): the original paste lost its indentation; the
    # authSource/authMechanism handling is assumed to belong to the auth branch.
    if isUseAuth:
        if username is None or password is None:
            # quote_plus(None) would fail with an opaque TypeError; say why.
            raise TypeError("username and password are required when isUseAuth is True")

        mongodbUri = "mongodb://%s:%s@%s:%s" % (
            quote_plus(username),
            quote_plus(password),
            host,
            port,
        )
        # NOTE(review): the prints below expose the password in the log output.
        print(mongodbUri)

        if authSource:
            mongodbUri = mongodbUri + ("/%s" % authSource)
            print("mongodbUri=%s" % mongodbUri)

        if authMechanism:
            if not authSource:
                # The URI format requires a "/" between host:port and "?options".
                mongodbUri = mongodbUri + "/"
            mongodbUri = mongodbUri + ("?authMechanism=%s" % authMechanism)
            print("mongodbUri=%s" % mongodbUri)

    print("return mongodbUri=%s" % mongodbUri)
    # mongodb://username:quoted_password@host:port/authSource?authMechanism=authMechanism
    # mongodb://localhost:27017

    return mongodbUri


def createMongoClient():
    """Create a ``MongoClient`` for the default URI built by ``generateMongoUri``."""
    mongoUri = generateMongoUri()
    print("mongoUri=%s" % mongoUri)
    client = MongoClient(mongoUri)
    print("client=%s" % client)
    # client=MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True)
    return client


# ... (further utility functions were elided in the original post)

######################################################################
# Project Related
######################################################################


class ResultMongo(object):
    """Thin wrapper that writes crawl results into one MongoDB collection."""

    def __init__(self):
        """Connect and select ``MONGODB_DB_NAME`` / ``MONGODB_COLLECTION_NAME``."""
        print("ResultMongo __init__")
        self.client = createMongoClient()
        print("self.client=%s" % self.client)
        self.db = self.client[MONGODB_DB_NAME]
        print("self.db=%s" % self.db)
        # self.db=Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'Scholastic')
        self.collection = self.db[MONGODB_COLLECTION_NAME]
        print("self.collection=%s" % self.collection)
        # self.collection=Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'Scholastic'), 'Storybook')

    def __del__(self):
        """Close the client connection, if one was ever created."""
        print("ResultMongo __del__")
        # __init__ may have failed before self.client was assigned.
        client = getattr(self, "client", None)
        if client is not None:
            client.close()

    def on_result(self, result):
        """Save result to MongoDB.

        Args:
            result: a single document (dict) or a list of documents.
                Falsy results are ignored.

        Returns:
            The inserted ``_id`` for a single document, a list of ``_id``
            values for a list of documents, or ``None`` if nothing was saved.
        """
        print("ResultMongo on_result: result=%s" % (result,))

        respResult = None
        if result:
            # Collection.insert() is deprecated (removed in PyMongo 4); use
            # insert_one/insert_many. Shallow copies are inserted because
            # PyMongo adds an "_id" ObjectId to the document it is given, and
            # the caller forwards the same object to PySpider afterwards,
            # where an ObjectId is not JSON-serializable.
            if isinstance(result, list):
                documents = [dict(doc) for doc in result]
                respResult = self.collection.insert_many(documents).inserted_ids
            else:
                document = dict(result) if isinstance(result, dict) else result
                respResult = self.collection.insert_one(document).inserted_id
            print("respResult=%s" % (respResult,))
            # respResult=5bc45fad7f4d3847b78e8c69

        return respResult


def debugMongoResult():
    """Insert one hard-coded sample book to check the MongoDB connection."""
    mongo = ResultMongo()
    print("mongo=%s" % mongo)
    # mongo=<__main__.ResultMongo object at 0x10b24dcf8>
    dataDict = {
        'authors': ['Sharon Creech'],
        'coverImgUrl': 'https://www.scholastic.com/content5/media/products/66/9780439569866_mres.jpg',
        'description': (
            "I guess it does\nlook like a poem\nwhen you see it\ntyped up\nlike that.\n\n"
            "Jack hates poetry. Only girls write it and every time he tries to, "
            "his brain feels empty. But his teacher, Ms. Stretchberry, won't stop "
            "giving her class poetry assignments, and Jack can't avoid them. But "
            "then something amazing happens. The more he writes, the more he "
            "learns he does have something to say.\n\n"
            "With a fresh and deceptively simple style, acclaimed author Sharon "
            "Creech tells a story with enormous heart. "
            "Written as a series of free-verse poems from Jack's point of view, "
            "Love That Dog shows how one boy finds his own voice with the help of "
            "a teacher, a writer, a pencil, some yellow paper, and of course, a dog."
        ),
        'draLevel': '50',
        'genre': 'Fiction',
        'gradeLevelEquivalent': '',
        'grades': ['6-8'],
        'guidedReading': 'T',
        'illustrators': [],
        'isbn13': '9780439569866',
        'lexileMeasure': '1010L',
        'originUrl': 'https://www.scholastic.com/content/scholastic/books2/love-that-dog-by-sharon-creech',
        'pages': 112,
        'recommendations': [
            {
                'title': "Girls' Life Ultimate Guide to Surviving Middle School",
                'url': 'https://www.scholastic.com/content/scholastic/books2/girls-rsquo-life-ultimate-guide-to-surviving-middle-school-by-b',
            },
            {
                'title': "Girls' Life Ultimate Guide To Surviving Middle School",
                'url': 'https://www.scholastic.com/content/scholastic/books2/girls-rsquo-life-ultimate-guide-to-surviving-middle-school-by-b',
            },
            {
                'title': 'The Date to Save',
                'url': 'https://www.scholastic.com/content/scholastic/books2/date-to-save-the-by-stephanie-kate-strohm',
            },
        ],
        'seriesName': '',
        'seriesNumber': 0,
        'tags': ['Poetry Writing', 'School Life'],
        'title': 'Love That Dog',
        'url': 'https://www.scholastic.com/teachers/books/love-that-dog-by-sharon-creech/',
    }
    respResult = mongo.on_result(dataDict)
    print("respResult=%s" % respResult)


if __name__ == "__main__":
    debugMongoResult()

######################################################################
# PySpider Main Part
######################################################################


class Handler(BaseHandler):
    """PySpider handler that additionally stores each result in MongoDB."""

    # Created once, when the class body is executed; used by on_result below.
    mongo = ResultMongo()
    print("mongo=%s" % mongo)

    crawl_config = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
        # ... (remaining settings were elided in the original post)
    }

    # @every(minutes=24 * 60)
    def on_start(self):
        ...  # body elided in the original post

    # @config(priority=2)
    def singleBookCallback(self, response):
        """Build the result dict for one book page and return it."""
        # ... (the extraction of the fields used below was elided in the
        # original post)
        respDict = {
            "originUrl": originUrl,
            "url": url,
            "title": title,
            "coverImgUrl": coverImgUrl,
            "authors": authors,
            "illustrators": illustrators,
            "seriesName": seriesName,
            "seriesNumber": seriesNumber,
            "grades": grades,
            "guidedReading": guidedReading,
            "lexileMeasure": lexileMeasure,
            "gradeLevelEquivalent": gradeLevelEquivalent,
            "draLevel": draLevel,
            "genre": genre,
            "description": description,
            "pages": pages,
            "isbn13": isbn13,
            "tags": tags,
            "recommendations": recommendations,
        }
        print("respDict=%s" % respDict)
        return respDict

    def on_result(self, result):
        """Store ``result`` in MongoDB, then run the inherited ``on_result``."""
        print("PySpider on_result: result=%s" % (result,))
        self.mongo.on_result(result)  # insert the data into MongoDB
        super(Handler, self).on_result(result)  # call the original result storage
然后运行到最后return的部分和on_result的部分,是可以输出对应的信息:
respDict={'originUrl': ... PySpider on_result: result={'originUrl':... ResultMongo on_result: result={'originUrl': ... respResult=5bc464a1bfaa444ec0607173 {'_id': ObjectId('5bc464a1bfaa444ec0607173'), 'authors': ['Cynthia Lord'], 'coverImgUrl': 'https://www.scholastic.com/content5/media/products/38/9780439443838_mres.jpg', 'description': 'This 2007 Newbery Honor Book is a humorous and heartwarming debut about feeling different and finding acceptance.\nTwelve-year-old Catherine just wants a normal life. Which is near impossible when you have a brother with autism and a family that revolves around his disability. She\'s spent years trying to teach David the rules - from "a peach is not a funny-looking apple" to "keep your pants on in public" - in order to stop his embarrassing behaviors. But the summer Catherine meets Jason, a paraplegic boy, and Kristi, the next-door friend she\'s always wished for, it\'s her own shocking behavior that turns everything upside down and forces her to ask: What is normal?\n"A heartwarming first novel" - Booklist', 'draLevel': '40', 'genre': 'Fiction', 'gradeLevelEquivalent': '', 'grades': ['3-5', '6-8'], 'guidedReading': 'R', 'illustrators': [], 'isbn13': '9780439443838', 'lexileMeasure': '670L', 'originUrl': 'https://www.scholastic.com/content/scholastic/books2/rules-by-cynthia-lord', 'pages': 224, 'recommendations': [{'title': 'Braced', 'url': 'https://www.scholastic.com/content/scholastic/books2/braced-by-alyson-gerber'}, {'title': 'Chasing the Milky Way', 'url': 'https://www.scholastic.com/content/scholastic/books2/chasing-the-milky-way-by-erin-e-moulton'}, {'title': 'I Even Funnier', 'url': 'https://www.scholastic.com/content/scholastic/books2/i-even-funnier-by-chris-grabenstein'}], 'seriesName': '', 'seriesNumber': 0, 'tags': ['Autism', 'Friends and Friendship', 'Living with Illness and Disabilities', 'Physical Challenges', 'Siblings', 'Tolerance and Acceptance'], 'title': 'Rules', 'url': 
'https://www.scholastic.com/teachers/books/rules-by-cynthia-lord/'}
且成功把对应的数据保存到了MongoDB中:
转载请注明:在路上 » 【已解决】PySpider中把结果保存到MongoDB数据库中