之前已经弄完了基本的PySpider的项目,把数据直接return返回,在运行后,点击results,导出csv了:
现在想要去把数据保存到mysql中。
先去看官网教程:
Working with Results – pyspider
难道是需要
先去本地mysql数据库中,先建立好对应的数据库和表结构?
然后才能保存?
不过好像是去利用ResultWorker保存数据的
pyspider save to mysql
Pyspider把抓取的结果存入mysql数据库 | 拈花古佛
Pyspider实例之抓取数据并保存到MySQL数据库 – CSDN博客
如何把采集结果存入mysql – 使用教程 – pyspider中文网
前面都是重写on_result,好像是旧的做法。
实现Pyspider爬虫结果的自定义ResultWorker – 简书
这个重写ResultWorker,才是新的做法
参考:
实现Pyspider爬虫结果的自定义ResultWorker – 简书
和
然后同时去运行mysql server:
<code>➜ ~ /usr/local/mysql/support-files/mysql.server status ERROR! MySQL is not running ➜ ~ /usr/local/mysql/support-files/mysql.server start Starting MySQL . SUCCESS! ➜ ~ /usr/local/mysql/support-files/mysql.server status SUCCESS! MySQL running (61419) </code>
然后再去创建对应的mysql数据库:
【记录】Sequel中新建mysql数据库并新建表和相应字段
然后去写代码操作之前,需要先去搞清楚:
再去写配置和自己的worker
先去新建配置文件:
config.json
<code>{ "taskdb": "mysql+taskdb://root:[email protected]:3306/AutohomeTaskdb", "projectdb": "mysql+projectdb://root:[email protected]:3306/AutohomeProjectdb", "resultdb": "mysql+resultdb://root:[email protected]:3306/AutohomeResultdb", "result_worker":{ "result_cls": "AutohomeResultWorker.AutohomeResultWorker" } } </code>
然后去运行试试
结果出错:
【已解决】pyspider中运行result_worker出错:ModuleNotFoundError No module named mysql
最后知道了,运行要用:
<code>pyspider -c config.json </code>
才能确保:运行了webui
-》浏览器能打开:
然后没了mysql问题,但是出现别的问题:
【已解决】pyspider中出错:TypeError __init__() got an unexpected keyword argument resultdb
然后解决了后,代码是:
<code>#!/usr/bin/env python # -*- encoding: utf-8 -*- # Project: autohomeBrandData # Function: implement custom result worker for autohome car data # Author: Crifan Li # Date: 20180512 # Note: # If you want to modify to your mysql and table, you need: # (1) change change MysqlDb config to your mysql config # (2) change CurrentTableName to your table name # (3) change CreateTableSqlTemplate to your sql to create new mysql table fields # (4) before use this ResultWorker, run py file to execute testMysqlDb, to init db and create table # (5) if your table field contain more type, edit insert to add more type for "TODO: add more type formatting if necessary" import pymysql import pymysql.cursors from pyspider.result import ResultWorker CurrentTableName = "tbl_autohome_car_info" CreateTableSqlTemplate = """CREATE TABLE IF NOT EXISTS `%s` ( `id` int(11) unsigned NOT NULL AUTO_INCREMENT COMMENT '自增,主键', `cityDealerPrice` int(11) unsigned NOT NULL DEFAULT '0' COMMENT '经销商参考价', `msrpPrice` int(11) unsigned NOT NULL DEFAULT '0' COMMENT '厂商指导价', `mainBrand` char(20) NOT NULL DEFAULT '' COMMENT '品牌', `subBrand` varchar(20) NOT NULL DEFAULT '' COMMENT '子品牌', `brandSerie` varchar(20) NOT NULL DEFAULT '' COMMENT '车系', `brandSerieId` varchar(15) NOT NULL DEFAULT '' COMMENT '车系ID', `model` varchar(50) NOT NULL DEFAULT '' COMMENT '车型', `modelId` varchar(15) NOT NULL DEFAULT '' COMMENT '车型ID', `modelStatus` char(5) NOT NULL DEFAULT '' COMMENT '车型状态', `url` varchar(200) NOT NULL DEFAULT '' COMMENT '车型url', PRIMARY KEY (`id`) ) ENGINE=InnoDB DEFAULT CHARSET=utf8;""" class AutohomeResultWorker(ResultWorker): def __init__(self, resultdb, inqueue): """init mysql db""" print("AutohomeResultWorker init: resultdb=%s, inqueue=%s" % (resultdb, inqueue)) ResultWorker.__init__(self, resultdb, inqueue) self.mysqlDb = MysqlDb() print("self.mysqlDb=%s" % self.mysqlDb) def on_result(self, task, result): """override pyspider on_result to save data into mysql""" # assert task['taskid'] # assert task['project'] # assert task['url'] # assert result print("AutohomeResultWorker on_result: task=%s, result=%s" % (task, result)) insertOk = self.mysqlDb.insert(result) print("insertOk=%s" % insertOk) class MysqlDb: config = { 'host': '127.0.0.1', 'port': 3306, 'user': 'root', 'password': 'crifan_mysql', 'database': 'AutohomeResultdb', 'charset': "utf8" } defaultTableName = CurrentTableName connection = None def __init__(self): """init mysql""" # 1. connect db first if self.connection is None: isConnected = self.connect() print("Connect mysql return %s" % isConnected) # 2. create table for db createTableOk = self.createTable(self.defaultTableName) print("Create table %s return %s" %(self.defaultTableName, createTableOk)) def connect(self): try: self.connection = pymysql.connect(**self.config, cursorclass=pymysql.cursors.DictCursor) print("connect mysql ok, self.connection=", self.connection) return True except pymysql.Error as err: print("Connect mysql with config=", self.config, " error=", err) return False def quoteIdentifier(self, identifier): """ for mysql, it better to quote identifier xxx using backticks to `xxx` in case, identifier: contain special char, such as space or same with system reserved words, like select """ quotedIdentifier = "`%s`" % identifier # print("quotedIdentifier=", quotedIdentifier) return quotedIdentifier def executeSql(self, sqlStr, actionDescription=""): print("executeSql: sqlStr=%s, actionDescription=%s" % (sqlStr, actionDescription)) if self.connection is None: print("Please connect mysql first before %s" % actionDescription) return False cursor = self.connection.cursor() print("cursor=", cursor) try: cursor.execute(sqlStr) self.connection.commit() return True except pymysql.Error as err: print("Execute sql %s occur error %s for %s" % (sqlStr, err, actionDescription)) return False def createTable(self, newTablename): print("createTable: newTablename=", newTablename) createTableSql = CreateTableSqlTemplate % (newTablename) print("createTableSql=", createTableSql) return self.executeSql(sqlStr=createTableSql, actionDescription=("Create table %s" % newTablename)) def dropTable(self, existedTablename): print("dropTable: existedTablename=", existedTablename) dropTableSql = "DROP TABLE IF EXISTS %s" % (existedTablename) print("dropTableSql=", dropTableSql) return self.executeSql(sqlStr=dropTableSql, actionDescription=("Drop table %s" % existedTablename)) # def insert(self, **valueDict): def insert(self, valueDict, tablename=defaultTableName): """ inset dict value into mysql table makesure the value is dict, and its keys is the key in the table """ print("insert: valueDict=%s, tablename=%s" % (valueDict, tablename)) dictKeyList = valueDict.keys() dictValueList = valueDict.values() print("dictKeyList=", dictKeyList, "dictValueList=", dictValueList) keyListSql = ", ".join(self.quoteIdentifier(eachKey) for eachKey in dictKeyList) print("keyListSql=", keyListSql) # valueListSql = ", ".join(eachValue for eachValue in dictValueList) valueListSql = "" formattedDictValueList = [] for eachValue in dictValueList: # print("eachValue=", eachValue) eachValueInSql = "" valueType = type(eachValue) # print("valueType=", valueType) if valueType is str: eachValueInSql = '"%s"' % eachValue elif valueType is int: eachValueInSql = '%d' % eachValue # TODO: add more type formatting if necessary print("eachValueInSql=", eachValueInSql) formattedDictValueList.append(eachValueInSql) valueListSql = ", ".join(eachValue for eachValue in formattedDictValueList) print("valueListSql=", valueListSql) insertSql = """INSERT INTO %s (%s) VALUES (%s)""" % (tablename, keyListSql, valueListSql) print("insertSql=", insertSql) # INSERT INTO tbl_car_info_test (`url`, `mainBrand`, `subBrand`, `brandSerie`, `brandSerieId`, `model`, `modelId`, `modelStatus`, `cityDealerPrice`, `msrpPrice`) VALUES ("https://www.autohome.com.cn/spec/5872/#pvareaid=2042128", "宝马", "华晨宝马", "宝马3系", "66", "2010款 320i 豪华型", "5872", "停售", 325000, 375000) return self.executeSql(sqlStr=insertSql, actionDescription=("Insert value to table %s" % tablename)) def delete(self, modelId, tablename=defaultTableName): """ delete item from car model id for existing table of autohome car info """ print("delete: modelId=%s, tablename=%s" % (modelId, tablename)) deleteSql = """DELETE FROM %s WHERE modelId = %s""" % (tablename, modelId) print("deleteSql=", deleteSql) return self.executeSql(sqlStr=deleteSql, actionDescription=("Delete value from table %s by model id %s" % (tablename, modelId))) def testMysqlDb(): """test mysql""" testDropTable = True testCreateTable = True testInsertValue = True testDeleteValue = True # 1.test connect mysql mysqlObj = MysqlDb() print("mysqlObj=", mysqlObj) # testTablename = "autohome_car_info" # testTablename = "tbl_car_info_test" testTablename = CurrentTableName print("testTablename=", testTablename) if testDropTable: # 2. test drop table dropTableOk = mysqlObj.dropTable(testTablename) print("dropTable", testTablename, "return", dropTableOk) if testCreateTable: # 3. test create table createTableOk = mysqlObj.createTable(testTablename) print("createTable", testTablename, "return", createTableOk) if testInsertValue: # 4. test insert value dict valueDict = { "url": "https://www.autohome.com.cn/spec/5872/#pvareaid=2042128", #车型url "mainBrand": "宝马", #品牌 "subBrand": "华晨宝马", #子品牌 "brandSerie": "宝马3系", #车系 "brandSerieId": "66", #车系ID "model": "2010款 320i 豪华型", #车型 "modelId": "5872", #车型ID "modelStatus": "停售", #车型状态 "cityDealerPrice": 325000, #经销商参考价 "msrpPrice": 375000 # 厂商指导价 } print("valueDict=", valueDict) insertOk = mysqlObj.insert(valueDict=valueDict, tablename=testTablename) print("insertOk=", insertOk) if testDeleteValue: toDeleteModelId = "5872" deleteOk = mysqlObj.delete(modelId=toDeleteModelId, tablename=testTablename) print("deleteOk=", deleteOk) def testAutohomeResultWorker(): """just test for create mysql db is ok or not""" autohomeResultWorker = AutohomeResultWorker(None, None) print("autohomeResultWorker=%s" % autohomeResultWorker) if __name__ == '__main__': testMysqlDb() # testAutohomeResultWorker() </code>
去运行:
<code>pyspider -c config.json </code>
但是运行了很长时间之后,出错:
【未解决】pyspider运行出错:FETCH_ERROR HTTP 599 Connection timed out after milliseconds
以及期间发现:
【已解决】pyspider中pymysql中insert失败且except部分代码没有执行
转载请注明:在路上 » 【已解决】PySpider中保存数据到mysql