【背景】
之前写的,前后共写了两个版本的:
Python 2.x版本
和
Python 3.x版本
去抓取chaosgroup.com中的联系人信息,并保存为excel文件
【scrape_chaosgroup_contact 代码分享】
1.截图:
(1)运行效果:
(2)保存为excel文件:
2.Python项目代码下载:
scrape_chaosgroup_contact_py2.7z
scrape_chaosgroup_contact_py3.7z
3.代码分享:
(1)Python 2.x版本的:scrape_chaosgroup_contact_py2.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------------------------------------
Collect all data from a webpage
https://www.elance.com/j/collect-all-data-from-webpage/34563264/

Fetches the chaosgroup.com purchase page, extracts every element whose class
is "countryInfo" (country, name, phone, fax, email, V-Ray|Max link, address)
and writes all of them into one Excel sheet.

Version:    2012-10-25
Author:     Crifan Li
Contact:    https://www.crifan.com/about/me/

[NOTE]
This script is for Python 2.x (it relies on ``unicode`` and BeautifulSoup 3).
-------------------------------------------------------------------------------
"""

#---------------------------------import---------------------------------------
import os
import re
import sys
# crifanLib / BeautifulSoup / xlwt are looked up in these local folders, so the
# search path must be extended before they are imported below.
sys.path.append("libs/crifan")
sys.path.append("libs/thirdparty")
import math
import time
import codecs
import logging
import urllib
from datetime import datetime, timedelta
from optparse import OptionParser
from string import Template, replace
import xml
from xml.sax import saxutils
import crifanLib
from BeautifulSoup import BeautifulSoup, Tag, CData
import xlwt

#--------------------------------const values-----------------------------------
__VERSION__ = "v0.1"

gConst = {
}

#----------------------------------global values--------------------------------
gVal = {
}

#--------------------------configurable values---------------------------------
gCfg = {
}

# Page that lists all contacts, and the Excel file the result is saved to.
_MAIN_URL = "http://www.chaosgroup.com/en/2/purchase.html?g=0&pID=1"
_EXCEL_FILENAME = "allExtractedWebsiteData.xls"

# Sample inputs for _PHONE_FAX_RE:
# case 1:
#   <p class="phone"><strong>phone:</strong> 800.206.7886<br />
#   <strong>fax:</strong> 503-295-6533</p>
# case 2:
#   <p class="phone"><strong>phone:</strong> +1 800 854 4496 or outside US +1 407 833 0600<br />
#   <strong>fax:</strong> +1 813 283 4906
#   </p>
# case 3:
#   <p class="phone"><strong>phone:</strong> 604 682 6639 x105 <br /><strong>phone:</strong> toll-free 1 800 682 6639 x105<br />
#   <strong>fax:</strong> </p>
_PHONE_FAX_RE = re.compile(
    r"<strong>phone:</strong> (?P<phone>.+)<br />\s*?<strong>fax:</strong> (?P<fax>.*)</p>",
    re.S)

# Sample input for _EMAIL_INFO_RE:
#   <p class="web"><strong>e-mail:</strong> <a href="#">[email protected]</a><br />
#   <strong>V-Ray|Max link:</strong> <a target="_blank" href="http://www.cinesysinc.com/page3/page20/page20.html">Cinesys</a>
#   </p>
_EMAIL_INFO_RE = re.compile(
    r'<strong>e-mail:</strong> <a href="\#">(?P<email>.+)</a><br />\s*'
    r'<strong>V-Ray\|Max link:</strong> <a target="_blank" href="(?P<maxLink>.+)">(?P<vRay>.+)</a>')

# Sample input for _ADDRESS_RE:
#   <p class="addr">
#   <strong>address:</strong> 740 SW 21st Ave, Suite #310<br />
#   Portland 97205 Oregon;<br />
#   USA </p>
_ADDRESS_RE = re.compile(
    r'<p class="addr">\s*<strong>address:</strong> (?P<address>.+)</p>',
    re.S)

# (column title, itemDict key) for every Excel column, in output order.
_EXCEL_COLUMNS = (
    ("Country", 'country'),
    ("Name", 'name'),
    ("Phone", 'phone'),
    ("Fax", 'fax'),
    ("Email", 'email'),
    ("Vray", 'vRay'),
    ("MaxLink", 'maxLink'),
    ("Address", 'address'),
)

#--------------------------functions--------------------------------------------

def _exitWithError(message):
    """Log `message` as an error and terminate the script with exit code 2."""
    logging.error(message)
    sys.exit(2)


def _findByClass(itemSoup, className):
    """Return the first element below `itemSoup` whose class is `className`.

    Exits the script (code 2) when no such element exists.
    """
    found = itemSoup.find(attrs={"class": className})
    logging.debug("found[%s]=%s", className, found)
    if found is None:
        _exitWithError("Can not find %s" % className)
    return found


def _searchOrExit(pattern, html, whatIsMissing):
    """Return the match of compiled `pattern` in `html`.

    Exits the script (code 2) with "Can not find <whatIsMissing>" on no match.
    """
    matched = pattern.search(html)
    logging.debug("html=%s, matched=%s", html, matched)
    if not matched:
        _exitWithError("Can not find %s" % whatIsMissing)
    return matched


def _parseContact(itemSoup):
    """Extract one contact record (a dict) from a single "countryInfo" element."""
    country = itemSoup.h3.string
    logging.debug("itemDict['country']=%s", country)

    name = _findByClass(itemSoup, "name").string
    logging.debug("itemDict['name']=%s", name)

    # The regexes work on the serialized HTML of each <p>, not on its text.
    phoneHtml = unicode(_findByClass(itemSoup, "phone"))
    foundPhoneFax = _searchOrExit(_PHONE_FAX_RE, phoneHtml, "phone and fax")

    webHtml = unicode(_findByClass(itemSoup, "web"))
    foundEmailInfo = _searchOrExit(_EMAIL_INFO_RE, webHtml, "email info")

    addrHtml = unicode(_findByClass(itemSoup, "addr"))
    foundAddress = _searchOrExit(_ADDRESS_RE, addrHtml, "address")

    return {
        'country': country,
        'name': name,
        'phone': foundPhoneFax.group("phone").strip(),
        'fax': foundPhoneFax.group("fax").strip(),
        'email': foundEmailInfo.group("email").strip(),
        'vRay': foundEmailInfo.group("vRay").strip(),
        'maxLink': foundEmailInfo.group("maxLink").strip(),
        # the address spans several lines separated by <br />; drop the tags
        'address': foundAddress.group("address").replace("<br />", "").strip(),
    }


def _saveToExcel(allItemsDictList, excelFilename):
    """Write a bold red header row plus one row per contact, then save the workbook."""
    styleBoldRed = xlwt.easyxf('font: name Times New Roman, color-index red, bold on')

    wb = xlwt.Workbook()
    ws = wb.add_sheet('AllContactInfo')

    for col, (title, _key) in enumerate(_EXCEL_COLUMNS):
        ws.write(0, col, title, styleBoldRed)

    for idx, eachItemDict in enumerate(allItemsDictList):
        row = idx + 1  # row 0 holds the header
        for col, (_title, key) in enumerate(_EXCEL_COLUMNS):
            ws.write(row, col, eachItemDict[key])

    logging.info("Now save all data info excel file: %s", excelFilename)
    wb.save(excelFilename)


#------------------------------------------------------------------------------
def main():
    """Download the contact page, parse every contact and save them to Excel.

    Exits with code 2 (via sys.exit) as soon as one contact lacks an expected
    element or does not match the expected HTML layout.
    """
    logging.debug("mainUrl=%s", _MAIN_URL)
    respHtml = crifanLib.getUrlRespHtml(_MAIN_URL)
    logging.debug("respHtml=%s", respHtml)

    soup = BeautifulSoup(respHtml)
    foundAllItems = soup.findAll(attrs={"class": "countryInfo"})
    logging.debug("foundAllItems=%s", foundAllItems)
    logging.info("Total found %d contact info", len(foundAllItems))

    allItemsDictList = []
    for i, eachItemSoup in enumerate(foundAllItems):
        itemDict = _parseContact(eachItemSoup)
        logging.debug("----------------- parse [%d] OK: %s", i, itemDict)
        # i is 0-based; report the number of contacts handled so far
        logging.info("Successfully processed %d contact info", i + 1)
        allItemsDictList.append(itemDict)

    #output into excel
    _saveToExcel(allItemsDictList, _EXCEL_FILENAME)


###############################################################################
if __name__ == "__main__":
    scriptSelfName = crifanLib.extractFilename(sys.argv[0])

    # everything (DEBUG and above) goes into <script name>.log
    logging.basicConfig(
        level=logging.DEBUG,
        format='LINE %(lineno)-4d %(levelname)-8s %(message)s',
        datefmt='%m-%d %H:%M',
        filename=scriptSelfName + ".log",
        filemode='w')
    # define a Handler which writes INFO messages or higher to the sys.stderr
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    # set a format which is simpler for console use
    formatter = logging.Formatter('LINE %(lineno)-4d : %(levelname)-8s %(message)s')
    # tell the handler to use this format
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)

    try:
        main()
    except Exception:
        # Exception (not a bare except) so that the deliberate sys.exit(2)
        # calls are not reported as "Unknown Error".
        logging.exception("Unknown Error !")
        raise
(2)Python 3.x版本的:scrape_chaosgroup_contact_py3.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
r"""
-------------------------------------------------------------------------------
[Function]
Collect all data from a webpage
https://www.elance.com/j/collect-all-data-from-webpage/34563264/

Fetches the chaosgroup.com purchase page, extracts every element whose class
is "countryInfo" (country, name, phone, fax, email, V-Ray|Max link, address)
and writes all of them into one Excel sheet.

Version:    2012-10-25
Author:     Crifan Li
Contact:    https://www.crifan.com/about/me/

[NOTE]
This script is for Python 3.x
before you can use this script, should do:
1. install bs4 (BeautifulSoup version 4)
   http://www.crummy.com/software/BeautifulSoup/bs4/download/beautifulsoup4-4.1.3.tar.gz
   -> setup.py install
2. install xlwt3
   http://pypi.python.org/pypi/xlwt3/0.1.0
   ->
   http://pypi.python.org/packages/source/x/xlwt3/xlwt3-0.1.0.tar.gz
   -> setup.py install
3. modify installed xlwt3
   after install, change
   Python32\Lib\site-packages\ -> xlwt3\BIFFRecords.py -> WriteAccessRecord -> __init__
   from :
   self._rec_data = pack('%ds%ds' % (uowner_len, 0x70 - uowner_len), uowner, b' '*(0x70 - uowner_len)) # (to_py3): added b'...'
   to:
   self._rec_data = pack('%ds%ds' % (uowner_len, 0x70 - uowner_len), uowner.encode("utf-8"), b' '*(0x70 - uowner_len)) # (to_py3): added b'...'
-------------------------------------------------------------------------------
"""

#---------------------------------import---------------------------------------
import os
import re
import sys
# crifanLib / bs4 / xlwt3 may be looked up in these local folders, so the
# search path must be extended before they are imported below.
sys.path.append("libs/crifan")
sys.path.append("libs/thirdparty")
import math
import time
import codecs
import logging
import urllib.request
import urllib.parse
import urllib.error
from datetime import datetime, timedelta
from optparse import OptionParser
import xml
from xml.sax import saxutils
import crifanLib
# The Python 2 version of this script used "BeautifulSoup" and "xlwt" instead.
from bs4 import BeautifulSoup, Tag, CData
import xlwt3 as xlwt

#--------------------------------const values-----------------------------------
__VERSION__ = "v0.1"

gConst = {
}

#----------------------------------global values--------------------------------
gVal = {
}

#--------------------------configurable values---------------------------------
gCfg = {
}

# Page that lists all contacts, and the Excel file the result is saved to.
_MAIN_URL = "http://www.chaosgroup.com/en/2/purchase.html?g=0&pID=1"
_EXCEL_FILENAME = "allExtractedWebsiteData.xls"

# The patterns below run on str(tag). Dumps of that text, UTF-8 encoded, look like:
#   b'<p class="phone"><strong>phone:</strong>\xc2\xa0800.206.7886<br/>\n<strong>fax:</strong>\xc2\xa0503-295-6533</p>'
#   b'<p class="web"><strong>e-mail:</strong>\xc2\xa0<a href="#">[email protected]</a><br/>\n<strong>V-Ray|Max link:</strong>\xc2\xa0<a href="http://www.3dv.com/#/Rendering_Solutions/Chaos_Group/VRay/" target="_blank">3DV Corporation</a>\n</p>'
#   b'<p class="addr">\n<strong>address:</strong>\xc2\xa0Kiacheli, 26<br/>\r\n\t\t\t\t\tTbilisi 0108 ;<br/>\r\n\t\t\t\t\tGeorgia\t\t\t\t</p>'
# i.e. </strong> is followed by a non-breaking space rather than a plain one,
# <br> is written "<br/>", and href comes before target. Hence the patterns
# require no literal space after </strong> and accept "<br\s*/>".
_PHONE_FAX_RE = re.compile(
    r"<strong>phone:</strong>(?P<phone>.+)<br\s*/>\s*?<strong>fax:</strong>(?P<fax>.*)</p>",
    re.S)

_EMAIL_INFO_RE = re.compile(
    r'<strong>e-mail:</strong>.*?<a href="\#">(?P<email>.+)</a><br\s*/>\s*'
    r'<strong>V-Ray\|Max link:</strong>.*?<a href="(?P<maxLink>.+)" target="_blank">(?P<vRay>.+)</a>')

_ADDRESS_RE = re.compile(
    r'<p class="addr">\s*<strong>address:</strong>(?P<address>.+)</p>',
    re.S)

_BR_TAG_RE = re.compile(r"<br\s*/>")

# (column title, itemDict key) for every Excel column, in output order.
_EXCEL_COLUMNS = (
    ("Country", 'country'),
    ("Name", 'name'),
    ("Phone", 'phone'),
    ("Fax", 'fax'),
    ("Email", 'email'),
    ("Vray", 'vRay'),
    ("MaxLink", 'maxLink'),
    ("Address", 'address'),
)

#--------------------------functions--------------------------------------------

def _exitWithError(message):
    """Log `message` as an error and terminate the script with exit code 2."""
    logging.error(message)
    sys.exit(2)


def _findByClass(itemSoup, className):
    """Return the first element below `itemSoup` whose class is `className`.

    Exits the script (code 2) when no such element exists.
    """
    found = itemSoup.find(attrs={"class": className})
    if found is None:
        _exitWithError("Can not find %s" % className)
    return found


def _searchOrExit(pattern, html, whatIsMissing):
    """Return the match of compiled `pattern` in `html`.

    Exits the script (code 2) with "Can not find <whatIsMissing>" on no match.
    """
    matched = pattern.search(html)
    if not matched:
        _exitWithError("Can not find %s" % whatIsMissing)
    return matched


def _parseContact(itemSoup):
    """Extract one contact record (a dict) from a single "countryInfo" element."""
    country = itemSoup.h3.string
    name = _findByClass(itemSoup, "name").string

    # The regexes work on the serialized HTML of each <p>, not on its text.
    phoneHtml = str(_findByClass(itemSoup, "phone"))
    foundPhoneFax = _searchOrExit(_PHONE_FAX_RE, phoneHtml, "phone and fax")

    webHtml = str(_findByClass(itemSoup, "web"))
    foundEmailInfo = _searchOrExit(_EMAIL_INFO_RE, webHtml, "email info")

    addrHtml = str(_findByClass(itemSoup, "addr"))
    foundAddress = _searchOrExit(_ADDRESS_RE, addrHtml, "address")

    # the address spans several lines separated by <br/>; drop the tags
    address = _BR_TAG_RE.sub("", foundAddress.group("address"))

    return {
        'country': country,
        'name': name,
        'phone': foundPhoneFax.group("phone").strip(),
        'fax': foundPhoneFax.group("fax").strip(),
        'email': foundEmailInfo.group("email").strip(),
        'vRay': foundEmailInfo.group("vRay").strip(),
        'maxLink': foundEmailInfo.group("maxLink").strip(),
        'address': address.strip(),
    }


def _saveToExcel(allItemsDictList, excelFilename):
    """Write a bold red header row plus one row per contact, then save the workbook."""
    styleBoldRed = xlwt.easyxf('font: name Times New Roman, color-index red, bold on')

    wb = xlwt.Workbook()
    ws = wb.add_sheet('AllContactInfo')

    for col, (title, _key) in enumerate(_EXCEL_COLUMNS):
        ws.write(0, col, title, styleBoldRed)

    # start=1: row 0 holds the header
    for row, eachItemDict in enumerate(allItemsDictList, 1):
        for col, (_title, key) in enumerate(_EXCEL_COLUMNS):
            ws.write(row, col, eachItemDict[key])

    logging.info("Now save all data info excel file: %s", excelFilename)
    wb.save(excelFilename)


#------------------------------------------------------------------------------
def main():
    """Download the contact page, parse every contact and save them to Excel.

    Exits with code 2 (via sys.exit) as soon as one contact lacks an expected
    element or does not match the expected HTML layout.
    """
    logging.debug("mainUrl=%s", _MAIN_URL)
    respHtml = crifanLib.getUrlRespHtml(_MAIN_URL)

    soup = BeautifulSoup(respHtml, from_encoding="UTF-8")
    foundAllItems = soup.findAll(attrs={"class": "countryInfo"})
    logging.info("Total found %d contact info", len(foundAllItems))

    allItemsDictList = []
    for i, eachItemSoup in enumerate(foundAllItems):
        allItemsDictList.append(_parseContact(eachItemSoup))
        # i is 0-based; report the number of contacts handled so far
        logging.info("Successfully processed %d contact info", i + 1)

    #output into excel
    _saveToExcel(allItemsDictList, _EXCEL_FILENAME)


###############################################################################
if __name__ == "__main__":
    scriptSelfName = crifanLib.extractFilename(sys.argv[0])

    # everything (DEBUG and above) goes into <script name>.log
    logging.basicConfig(
        level=logging.DEBUG,
        format='LINE %(lineno)-4d %(levelname)-8s %(message)s',
        datefmt='%m-%d %H:%M',
        filename=scriptSelfName + ".log",
        filemode='w')
    # define a Handler which writes INFO messages or higher to the sys.stderr
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    # set a format which is simpler for console use
    formatter = logging.Formatter('LINE %(lineno)-4d : %(levelname)-8s %(message)s')
    # tell the handler to use this format
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)

    try:
        main()
    except Exception:
        # Exception (not a bare except) so that the deliberate sys.exit(2)
        # calls are not reported as "Unknown Error".
        logging.exception("Unknown Error !")
        raise
【总结】
转载请注明:在路上 » 【代码分享】Python代码:scrape_chaosgroup_contact(Python 2.x版本和Python 3.x版本) – 抓取chaosgroup.com中的联系人信息保存为excel