折腾:
【记录】用PySpider去爬取scholastic的绘本书籍数据
期间,已经写好代码,爬取出来要的数据了:

但是需要去保存到MongoDB中。
先去折腾:
【已解决】根据参数自动生成MongoDB的MongoClient的Uri参数
然后再去:
【已解决】Mac中保存json数据到本地MongoDB
然后就可以集成到PySpider中,且不需要额外用ResultWorker了
【总结】
然后用代码:
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# Created on 2018-10-10 14:50:08
# Project: ScholasticStorybook

from pyspider.libs.base_handler import *

import re
import json
# import html
import lxml
from bs4 import BeautifulSoup
from urllib.parse import quote_plus
from pymongo import MongoClient

HostUrl = "https://www.scholastic.com"

# RowsPerPage = 24
RowsPerPage = 72

MONGODB_DB_NAME = "Scholastic"
MONGODB_COLLECTION_NAME = "Storybook"

######################################################################
# Common Util Functions
######################################################################

def generateMongoUri(host=None,
                     port=None,
                     isUseAuth=False,
                     username=None,
                     password=None,
                     authSource=None,
                     authMechanism=None):
    """Generate a MongoDB connection URI from the given parameters.

    Args:
        host: server host name; falls back to "localhost" when empty.
        port: server port; falls back to 27017 when empty.
        isUseAuth: when true, embed username and password in the URI.
        username: user name, required when isUseAuth is true; it is
            escaped with quote_plus.
        password: password, required when isUseAuth is true; it is
            escaped with quote_plus.
        authSource: optional path segment appended as "/<authSource>".
        authMechanism: optional value appended as
            "?authMechanism=<authMechanism>".

    Returns:
        The URI string, e.g. "mongodb://localhost:27017" or
        "mongodb://username:quoted_password@host:port/authSource?authMechanism=authMechanism".
    """
    if not host:
        # host = "127.0.0.0"
        host = "localhost"

    if not port:
        port = 27017

    mongodbUri = "mongodb://%s:%s" % (host, port)
    # 'mongodb://localhost:27017'
    # 'mongodb://xxx:27017'

    if isUseAuth:
        mongodbUri = "mongodb://%s:%s@%s:%s" % (
            quote_plus(username),
            quote_plus(password),
            host,
            port,
        )
        print(mongodbUri)

    if authSource:
        mongodbUri = mongodbUri + ("/%s" % authSource)
        print("mongodbUri=%s" % mongodbUri)

    if authMechanism:
        if not authSource:
            # The connection-string format expects a "/" between the host
            # list and the "?options" part, even with no database segment.
            mongodbUri = mongodbUri + "/"
        mongodbUri = mongodbUri + ("?authMechanism=%s" % authMechanism)
        print("mongodbUri=%s" % mongodbUri)

    print("return mongodbUri=%s" % mongodbUri)
    # mongodb://username:quoted_password@host:port/authSource?authMechanism=authMechanism
    # mongodb://localhost:27017

    return mongodbUri


def createMongoClient():
    """Create a MongoClient for the default URI built by generateMongoUri()."""
    mongoUri = generateMongoUri()
    print("mongoUri=%s" % mongoUri)
    client = MongoClient(mongoUri)
    print("client=%s" % client)
    # client=MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True)
    return client

# ... (further helper functions were omitted in the original post)

######################################################################
# Project Related
######################################################################

class ResultMongo(object):
    """Save crawl results into the MongoDB collection
    MONGODB_DB_NAME / MONGODB_COLLECTION_NAME."""

    def __init__(self):
        print("ResultMongo __init__")
        self.client = createMongoClient()
        print("self.client=%s" % self.client)
        self.db = self.client[MONGODB_DB_NAME]
        print("self.db=%s" % self.db)
        # self.db=Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'Scholastic')
        self.collection = self.db[MONGODB_COLLECTION_NAME]
        print("self.collection=%s" % self.collection)
        # self.collection=Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'Scholastic'), 'Storybook')

    def __del__(self):
        print("ResultMongo __del__")
        # __init__ may have failed before self.client was assigned.
        client = getattr(self, "client", None)
        if client is not None:
            client.close()

    def on_result(self, result):
        """Save result to MongoDB.

        Args:
            result: a document (dict) or a list of documents; an empty or
                None result is skipped.

        Returns:
            The inserted _id (a list of _ids for a list input), or None
            when nothing was inserted.
        """
        print("ResultMongo on_result: result=%s" % result)

        respResult = None
        if result:
            # Collection.insert() is deprecated; use the insert_one /
            # insert_many replacements, which expose the same ids.
            if isinstance(result, list):
                respResult = self.collection.insert_many(result).inserted_ids
            else:
                respResult = self.collection.insert_one(result).inserted_id
            print("respResult=%s" % respResult)
            # respResult=5bc45fad7f4d3847b78e8c69

        return respResult


def debugMongoResult():
    """Insert one hard-coded sample book record to check the MongoDB saving."""
    mongo = ResultMongo()
    print("mongo=%s" % mongo)
    # mongo=<__main__.ResultMongo object at 0x10b24dcf8>
    dataDict = {
        'authors': ['Sharon Creech'],
        'coverImgUrl': 'https://www.scholastic.com/content5/media/products/66/9780439569866_mres.jpg',
        'description': "I guess it does\nlook like a poem\nwhen you see it\ntyped up\nlike that.\n\nJack hates poetry. Only girls write it and every time he tries to, his brain feels empty. But his teacher, Ms. Stretchberry, won't stop giving her class poetry assignments, and Jack can't avoid them. But then something amazing happens. The more he writes, the more he learns he does have something to say.\n\nWith a fresh and deceptively simple style, acclaimed author Sharon Creech tells a story with enormous heart. Written as a series of free-verse poems from Jack's point of view, Love That Dog shows how one boy finds his own voice with the help of a teacher, a writer, a pencil, some yellow paper, and of course, a dog.",
        'draLevel': '50',
        'genre': 'Fiction',
        'gradeLevelEquivalent': '',
        'grades': ['6-8'],
        'guidedReading': 'T',
        'illustrators': [],
        'isbn13': '9780439569866',
        'lexileMeasure': '1010L',
        'originUrl': 'https://www.scholastic.com/content/scholastic/books2/love-that-dog-by-sharon-creech',
        'pages': 112,
        'recommendations': [
            {
                'title': "Girls' Life Ultimate Guide to Surviving Middle School",
                'url': 'https://www.scholastic.com/content/scholastic/books2/girls-rsquo-life-ultimate-guide-to-surviving-middle-school-by-b'
            },
            {
                'title': "Girls' Life Ultimate Guide To Surviving Middle School",
                'url': 'https://www.scholastic.com/content/scholastic/books2/girls-rsquo-life-ultimate-guide-to-surviving-middle-school-by-b'
            },
            {
                'title': 'The Date to Save',
                'url': 'https://www.scholastic.com/content/scholastic/books2/date-to-save-the-by-stephanie-kate-strohm'
            }
        ],
        'seriesName': '',
        'seriesNumber': 0,
        'tags': ['Poetry Writing', 'School Life'],
        'title': 'Love That Dog',
        'url': 'https://www.scholastic.com/teachers/books/love-that-dog-by-sharon-creech/'
    }
    respResult = mongo.on_result(dataDict)
    print("respResult=%s" % respResult)


if __name__ == "__main__":
    debugMongoResult()

######################################################################
# PySpider Main Part
######################################################################

class Handler(BaseHandler):
    """PySpider handler that also stores every result in MongoDB."""

    # One shared ResultMongo, created when the class body is executed.
    mongo = ResultMongo()
    print("mongo=%s" % mongo)

    crawl_config = {
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36",
        # ... (remaining settings were omitted in the original post)
    }

    # @every(minutes=24 * 60)
    def on_start(self):
        ...  # body omitted in the original post

    # @config(priority=2)
    def singleBookCallback(self, response):
        ...  # extraction of the fields used below was omitted in the original post

        respDict = {
            "originUrl": originUrl,
            "url": url,
            "title": title,
            "coverImgUrl": coverImgUrl,
            "authors": authors,
            "illustrators": illustrators,
            "seriesName": seriesName,
            "seriesNumber": seriesNumber,
            "grades": grades,
            "guidedReading": guidedReading,
            "lexileMeasure": lexileMeasure,
            "gradeLevelEquivalent": gradeLevelEquivalent,
            "draLevel": draLevel,
            "genre": genre,
            "description": description,
            "pages": pages,
            "isbn13": isbn13,
            "tags": tags,
            "recommendations": recommendations,
        }
        print("respDict=%s" % respDict)
        return respDict

    def on_result(self, result):
        """Save result to MongoDB, then hand it to PySpider's own storage."""
        print("PySpider on_result: result=%s" % result)
        # pymongo adds an "_id" ObjectId key to the inserted dict in place;
        # insert a shallow copy so the result passed on below stays unchanged.
        # NOTE(review): an ObjectId is not JSON-serializable, which presumably
        # matters for PySpider's own result storage - confirm.
        mongoDoc = dict(result) if isinstance(result, dict) else result
        self.mongo.on_result(mongoDoc)  # perform the MongoDB insertion
        super(Handler, self).on_result(result)  # invoke the original result storage
然后运行到最后return的部分和on_result的部分,是可以输出对应的信息:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 | respDict={ 'originUrl' : ... PySpider on_result: result={ 'originUrl' :... ResultMongo on_result: result={ 'originUrl' : ... respResult=5bc464a1bfaa444ec0607173 { '_id' : ObjectId( '5bc464a1bfaa444ec0607173' ), 'authors' : [ 'Cynthia Lord' ], 'description' : 'This 2007 Newbery Honor Book is a humorous and heartwarming debut about feeling different and finding acceptance.\nTwelve-year-old Catherine just wants a normal life. Which is near impossible when you have a brother with autism and a family that revolves around his disability. She\'s spent years trying to teach David the rules - from "a peach is not a funny-looking apple" to "keep your pants on in public" - in order to stop his embarrassing behaviors. But the summer Catherine meets Jason, a paraplegic boy, and Kristi, the next-door friend she\'s always wished for, it\'s her own shocking behavior that turns everything upside down and forces her to ask: What is normal?\n"A heartwarming first novel" - Booklist' , 'draLevel' : '40' , 'genre' : 'Fiction' , 'gradeLevelEquivalent' : '' , 'grades' : [ '3-5' , '6-8' ], 'guidedReading' : 'R' , 'illustrators' : [], 'isbn13' : '9780439443838' , 'lexileMeasure' : '670L' , 'pages' : 224, 'recommendations' : [{ 'title' : 'Braced' , { 'title' : 'Chasing the Milky Way' , 'url' : 'https://www.scholastic.com/content/scholastic/books2/chasing-the-milky-way-by-erin-e-moulton' }, { 'title' : 'I Even Funnier' , 'url' : 'https://www.scholastic.com/content/scholastic/books2/i-even-funnier-by-chris-grabenstein' }], 'seriesName' : '' , 'seriesNumber' : 0, 'tags' : [ 'Autism' , 'Friends and Friendship' , 'Living with Illness and Disabilities' , 'Physical Challenges' , 'Siblings' , 'Tolerance and Acceptance' ], 'title' : 'Rules' , |

且对应的数据也成功保存到了MongoDB中:

转载请注明:在路上 » 【已解决】PySpider中把结果保存到MongoDB数据库中