【背景】
这是之前写的一段Python代码,用于处理本地已有的一个html文件:
先从中提取出所需的信息,再导出为各种形式的json字符串。
【scrape_html_to_json代码分享】
1.截图:
(1)运行效果:
(2)输出的各种json字符串:
A。无格式化,无缩进:
[{"yearMonth": {"month": {"string": "November", "value": "11"}, "year": {"string": "2012", "value": "2012"}}, "reservedMonthList": ["2", "3", "8", "9", "10", "11", "12", "13", "17", "18", "19", "20", "21", "22", "23"]}, {"yearMonth": {"month": {"string": "December", "value": "12"}, "year": {"string": "2012", "value": "2012"}}, "reservedMonthList": ["7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "21", "22", "23", "24", "25", "26", "27", "28", "30", "31"]}]
B。普通的(按需求手工拼接的字符串,日期和status都没有加引号,所以并不是合法的json):
$calendar = {"listing_id1":{"1": {"start_date": 2/11/2012, "end_date": 3/11/2012, "status": reserved },"2": {"start_date": 8/11/2012, "end_date": 13/11/2012, "status": reserved },"3": {"start_date": 17/11/2012, "end_date": 23/11/2012, "status": reserved },}, "listing_id2":{"1": {"start_date": 7/12/2012, "end_date": 16/12/2012, "status": reserved },"2": {"start_date": 21/12/2012, "end_date": 28/12/2012, "status": reserved },"3": {"start_date": 30/12/2012, "end_date": 31/12/2012, "status": reserved },}, "listing_id3":{"1": {"start_date": 1/1/2013, "end_date": 10/1/2013, "status": reserved },}, "listing_id4":{"1": {"start_date": 1/2/2013, "end_date": 27/2/2013, "status": reserved },}, "listing_id5":{}, "listing_id6":{"1": {"start_date": 2/4/2013, "end_date": 30/4/2013, "status": reserved },}, "listing_id7":{"1": {"start_date": 1/5/2013, "end_date": 31/5/2013, "status": reserved },}, "listing_id8":{"1": {"start_date": 1/6/2013, "end_date": 30/6/2013, "status": reserved },}, "listing_id9":{"1": {"start_date": 1/7/2013, "end_date": 31/7/2013, "status": reserved },}, "listing_id10":{"1": {"start_date": 1/8/2013, "end_date": 31/8/2013, "status": reserved },}, "listing_id11":{"1": {"start_date": 1/9/2013, "end_date": 30/9/2013, "status": reserved },}, "listing_id12":{"1": {"start_date": 1/10/2013, "end_date": 31/10/2013, "status": reserved },}, }
C。带缩进的格式化的json:
[ { "yearMonth": { "month": { "string": "November", "value": "11" }, "year": { "string": "2012", "value": "2012" } }, "reservedMonthList": [ "2", "3", "8", "9", "10", "11", "12", "13", "17", "18", "19", "20", "21", "22", "23" ] }, { "yearMonth": { "month": { "string": "December", "value": "12" }, "year": { "string": "2012", "value": "2012" } }, "reservedMonthList": [ "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "21", "22", "23", "24", "25", "26", "27", "28", "30", "31" ] } ]
注:以上三种输出的内容并不完全相同,只是为了演示各自的显示效果。
2.Python项目代码下载:
scrape_html_to_json_2012-11-08.7z
3.代码分享:
(1)scrape_html_to_json.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------------------------------------
Function:
Web Scraper
https://www.elance.com/j/web-scraper/35025238/

Version:    2012-11-08
Author:     Crifan Li
Contact:    https://www.crifan.com/about/me/

-------------------------------------------------------------------------------
"""

#---------------------------------import---------------------------------------
import re
import sys
# Must run before the BeautifulSoup import below: the module is looked up
# in the local "libs" directory.
sys.path.append("libs")
#import urllib
import codecs
from string import Template, replace
import json
from BeautifulSoup import BeautifulSoup, Tag, CData

#------------------------------------------------------------------------------
def generateDurationDictList(monthList):
    """Group reserved day numbers into runs of consecutive days.

    Example:
        ['2','3','8','9','10','11','12','13','17','18','19','20','21','22','23']
    becomes
        [
            {'startDay': 2,  'endDay': 3},
            {'startDay': 8,  'endDay': 13},
            {'startDay': 17, 'endDay': 23},
        ]

    Args:
        monthList: day-of-month values accepted by int() (e.g. the strings
            '2', '3'). Order does not matter; duplicated values are counted
            once. None or an empty list is allowed.

    Returns:
        A list of {'startDay': int, 'endDay': int} dicts sorted by start day.
        A day with no reserved neighbour becomes a one-day run, i.e.
        startDay == endDay. The list is empty when there is no input.

    Raises:
        ValueError / TypeError: propagated from int() when an item cannot be
            converted.
    """
    durationMonthDictList = []

    # Sort and de-duplicate up front so the scan below only ever sees strictly
    # increasing day numbers.
    dayIntList = sorted(set(int(eachDayStr) for eachDayStr in (monthList or [])))
    if not dayIntList:
        return durationMonthDictList

    curStartDay = dayIntList[0]
    curEndDay = curStartDay
    for currentDayInt in dayIntList[1:]:
        if currentDayInt == curEndDay + 1:
            # Still inside the same run of consecutive days.
            curEndDay = currentDayInt
            continue
        # Gap found: close the current run and start a new one.
        durationMonthDictList.append({
            'startDay': curStartDay,
            'endDay': curEndDay,
        })
        curStartDay = currentDayInt
        curEndDay = currentDayInt

    # Always flush the last run. The earlier implementation only did this when
    # the last day extended a run, so a trailing isolated day (or a month with
    # a single reserved day) was silently lost.
    durationMonthDictList.append({
        'startDay': curStartDay,
        'endDay': curEndDay,
    })

    return durationMonthDictList

#------------------------------------------------------------------------------
def generateOutputCalendar(MonthDictList):
    """Build the custom "$calendar = {...}" string for all months.

    The produced text intentionally follows the requested layout, which is
    NOT valid JSON (dates and the status are unquoted and every entry is
    followed by a comma), e.g.:

        "1": {"start_date": 2/11/2012, "end_date": 3/11/2012, "status": reserved },

    Args:
        MonthDictList: list of dicts, each providing
            ['yearMonth']['month']['value'], ['yearMonth']['year']['value']
            and ['reservedMonthList'] (the reserved days of that month).

    Returns:
        The whole calendar string. Month number N (1-based position in
        MonthDictList) is emitted as "listing_idN" and is followed by two
        CRLF line breaks.
    """
    # Created once; the same template is reused for every duration.
    singleDurationT = Template(""""${number}": {"start_date": ${startDay}/${startMonth}/${startYear}, "end_date": ${endDay}/${endMonth}/${endYear}, "status": reserved },""")

    allMonthStrList = []
    for index, eachMonthDict in enumerate(MonthDictList):
        monthValue = eachMonthDict['yearMonth']['month']['value']
        yearValue = eachMonthDict['yearMonth']['year']['value']

        durationInfoDictList = generateDurationDictList(eachMonthDict['reservedMonthList'])
        durationStrList = []
        for durationIdx, eachDurationDict in enumerate(durationInfoDictList):
            # A duration never crosses a month border, so start and end share
            # the same month and year.
            durationStrList.append(singleDurationT.substitute({
                'number': durationIdx + 1,
                'startDay': eachDurationDict['startDay'],
                'startMonth': monthValue,
                'startYear': yearValue,
                'endDay': eachDurationDict['endDay'],
                'endMonth': monthValue,
                'endYear': yearValue,
            }))

        monthHeaderStr = '"listing_id' + str(index + 1) + '":{'
        monthTailStr = "},"
        allMonthStrList.append(
            monthHeaderStr + "".join(durationStrList) + monthTailStr + "\r\n\r\n")

    return "$calendar = {" + "".join(allMonthStrList) + "}"
#------------------------------------------------------------------------------
def generateOutputCalendarJsonNoIndent(MonthDictList):
    """Serialize MonthDictList to a compact, single-line JSON string.

    Args:
        MonthDictList: any object json.dumps can serialize (here: the list of
            per-month dicts).

    Returns:
        The JSON text produced by json.dumps with default settings.
    """
    return json.dumps(MonthDictList)

#------------------------------------------------------------------------------
def generateOutputCalendarJsonIndent(MonthDictList):
    """Serialize MonthDictList to a pretty-printed JSON string.

    Args:
        MonthDictList: any object json.dumps can serialize (here: the list of
            per-month dicts).

    Returns:
        The JSON text produced by json.dumps with indent=1.
    """
    return json.dumps(MonthDictList, indent=1)

#------------------------------------------------------------------------------
# ids of the hidden <input> elements that carry the numeric month / year, e.g.
# <input type="Hidden" id="cboMonth1" name="cboMonth1" value="11">
# <input type="Hidden" id="cboYear1" name="cboYear1" value="2012">
_CBO_MONTH_ID_PATTERN = re.compile(r"cboMonth\d+")
_CBO_YEAR_ID_PATTERN = re.compile(r"cboYear\d+")

def _writeUtf8TextFile(fileName, text):
    """Write text into fileName as UTF-8; the file is closed even on error."""
    with codecs.open(fileName, 'w', 'utf-8') as outputFile:
        outputFile.write(text)

def _extractSingleMonthDict(eachMonthHeader, eachMonthContent):
    """Build the info dict of one month from its header and content tables.

    Args:
        eachMonthHeader: parsed <table class="text"> holding the month / year
            labels and hidden inputs.
        eachMonthContent: parsed <table class="CalendarCellActive"> holding
            the day cells of the same month.

    Returns:
        {'yearMonth': {'year': {'value', 'string'},
                       'month': {'value', 'string'}},
         'reservedMonthList': [text of each reserved day cell]}
    """
    # Note:
    # the simplest way to get the labels would be to take the first two
    # <label> found in the header as month and year, but that is not robust.
    # So locate the hidden input by its id and read the <label> that lives in
    # the same <td>:
    # <td style="padding-left:0" width="60%"><label>November</label>
    # <input type="Hidden" id="cboMonth1" name="cboMonth1" value="11">
    # </td><td style="padding-right:0;" width="40%">
    # <label>2012</label>
    # <input type="Hidden" id="cboYear1" name="cboYear1" value="2012">
    # </td>
    foundCboMonth = eachMonthHeader.find("input", {"id": _CBO_MONTH_ID_PATTERN})
    monthValue = foundCboMonth['value']
    monthStr = foundCboMonth.parent.label.string

    foundCboYear = eachMonthHeader.find("input", {"id": _CBO_YEAR_ID_PATTERN})
    yearValue = foundCboYear['value']
    yearStr = foundCboYear.parent.label.string

    # extract the necessary content: the reserved days
    #<td align="center" class="CalendarCellReserved" id="dd1">2</td>
    foundAllReservedCell = eachMonthContent.findAll("td", {"class": "CalendarCellReserved"})
    reservedMonthList = [eachReservedCell.string for eachReservedCell in foundAllReservedCell]

    return {
        'yearMonth': {
            'year': {
                'value': yearValue,
                'string': yearStr,
            },
            'month': {
                'value': monthValue,
                'string': monthStr,
            },
        },
        'reservedMonthList': reservedMonthList,  # each one is a single day string
    }

#------------------------------------------------------------------------------
def main():
    """Scrape the local test html file and export the reserved-day calendar.

    Reads testfiles/test_scrape.htm, collects the reserved days of every
    month, then writes:
        generatedCalerdarString.txt               custom "$calendar" string
        treeLikeWithIndentJsonString.txt          indented JSON (first 2 months)
        generatedCalerdarString_json_indent.txt   indented JSON (first 2 months)
        generatedCalerdarString_json_noIndent.txt compact JSON (first 2 months)
    Progress messages are printed to stdout.
    """
    # Every print below passes exactly one pre-formatted string in
    # parentheses, so the output is the same as the former print statements.
    testEntryUrl = "http://testingsite.com/CalendarViewPublic.asp?HouseID=39"
    foundSingleAttrFromUrl = re.search(r"http:.+?\?(?P<singleAttr>\w+)=.*?", testEntryUrl)
    if foundSingleAttrFromUrl:
        singleAttr = foundSingleAttrFromUrl.group("singleAttr")
        print("Extract singleAttr=%s from testEntryUrl=%s" % (singleAttr, testEntryUrl))

    testFilename = "testfiles/test_scrape.htm"
    # "with" guarantees the input file is closed (it never was before).
    with codecs.open(testFilename, 'r', "UTF-8") as htmlFile:
        testHtml = htmlFile.read()

    soup = BeautifulSoup(testHtml)
    #<table border="0" cellpadding="2" cellspacing="0" class="text" width="100%">
    foundAllMonthHeader = soup.findAll(name="table", attrs={"class": "text"})
    #<table border="1" class="CalendarCellActive" cellpadding="2" cellspacing="0" style=" border: 1px solid navy; table-layout:fixed" width="100%">
    foundAllMonthContent = soup.findAll(name="table", attrs={"class": "CalendarCellActive"})
    monthContentLen = len(foundAllMonthContent)
    print("Total found %d month's info of reserved days" % (monthContentLen))

    MonthDictList = []
    for i, eachMonthHeader in enumerate(foundAllMonthHeader):
        # The i-th header table is paired with the i-th content table.
        singleMonthDict = _extractSingleMonthDict(eachMonthHeader, foundAllMonthContent[i])
        MonthDictList.append(singleMonthDict)
        print("Processed %d month's info" % (i + 1))

    # generate output string
    generatedCalendarStr = generateOutputCalendar(MonthDictList)
    outputFileName = "generatedCalerdarString.txt"
    print("Exporting generated calendar string into %s" % (outputFileName))
    _writeUtf8TextFile(outputFileName, generatedCalendarStr)
    print("Has exported calendar string into %s" % (outputFileName))

    # Note:
    # only when the expected output is something like:
    # {"start_date": "11/7/2012",
    # "end_date": "11/9/2012",
    # "status": "reserved"
    # },
    # not :
    # {"start_date": 11/7/2012,
    # "end_date": 11/9/2012,
    # "status": reserved
    # },
    # can json be used to output a PRETTY-PRINTED dict string

    # json output demo: only the first two months
    demoDictList = MonthDictList[0:2]
    jsonDumpsIndentStr = json.dumps(demoDictList, indent=1)
    _writeUtf8TextFile("treeLikeWithIndentJsonString.txt", jsonDumpsIndentStr)

    generatedCalendarJsonIndentStr = generateOutputCalendarJsonIndent(demoDictList)
    print("type(generatedCalendarJsonIndentStr)= %s" % (type(generatedCalendarJsonIndentStr),))
    outputFileName_json_indent = "generatedCalerdarString_json_indent.txt"
    print("Exporting generated calendar string json indent into %s" % (outputFileName_json_indent))
    _writeUtf8TextFile(outputFileName_json_indent, generatedCalendarJsonIndentStr)

    generatedCalendarJsonNoIndentStr = generateOutputCalendarJsonNoIndent(demoDictList)
    print("type(generatedCalendarJsonNoIndentStr)= %s" % (type(generatedCalendarJsonNoIndentStr),))
    outputFileName_json_noIndent = "generatedCalerdarString_json_noIndent.txt"
    print("Exporting generated calendar string json no indent into %s" % (outputFileName_json_noIndent))
    _writeUtf8TextFile(outputFileName_json_noIndent, generatedCalendarJsonNoIndentStr)

###############################################################################
if __name__=="__main__":
    main()
【总结】
转载请注明:在路上 » 【代码分享】Python代码:scrape_html_to_json – 从本地html中抓取信息导出为各种形式的json字符串