【背景】
之前写了一段Python代码,用于处理:
http://www.gcgis.org/webmappub/titleWF.aspx
http://www.greenvillecounty.org/vrealpr24/clRealProp.ASP?WCI=tplRealSearch&WCE=Form1&WCU=
等地址,
寻找到匹配的地图图片,然后提取相关域的信息,保存为excel文件。
【download_gcgis_map_pic代码分享】
1.截图:
(1)运行效果:
(2)保存信息为excel文件:
(3)下载的地图图片:
2.Python项目代码下载:
download_gcgis_map_pic_2012-11-13.7z
3.代码分享:
(1)download_gcgis_map_pic.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------------------------------------
Function:
Web Scrape 11-10-12
https://www.elance.com/j/web-scrape/35102090/

For every map ID in the list inside main():
  1. query the greenvillecounty.org real property search for year 2012 and
     extract the property fields from the result page;
  2. repeat the query for 2010 and 2011 to collect the "Amount Outstanding"
     value of each year;
  3. write all collected rows to an Excel file;
  4. download the map picture for the property location from gcgis.org.

Version:    2012-11-13
Author:     Crifan Li
Contact:    https://www.crifan.com/about/me/
-------------------------------------------------------------------------------
"""

#---------------------------------import---------------------------------------
import logging
import re
import sys

# The original script extends the path before importing the modules below,
# presumably because (some of) them are shipped in the local "libs" directory.
sys.path.append("libs")
from BeautifulSoup import BeautifulSoup, Tag, CData
import crifanLib
import xlwt

#--------------------------------constants-------------------------------------
_SEARCH_PROPERTY_URL = "http://www.greenvillecounty.org/vrealpr24/clRealProp.ASP?WCI=tplRealSearch&WCE=Form1&WCU="
_TITLE_WF_URL = "http://www.gcgis.org/webmappub/titleWF.aspx"
_POST_BACK_URL = "http://www.gcgis.org/webmappub/PostBack_WebForm.aspx"
_FIND_LOC_URL = "http://www.gcgis.org/webmappub/find.aspx?govUser=false&validUser=true"

# (key in the returned info dict, label shown on the result page).
# The trailing comments are sample values captured for map ID 0230000400502.
_COMMON_INFO_FIELDS = (
    ('Mail Addr',       "Mail Addr"),             # 224 STONE LAKE DR
    ('Mail City',       "City"),                  # GREENVILLE
    ('Owner1',          "Owner 1"),               # HOOPER JOAN KIRKSEY
    ('Owner2',          "Owner 2"),               # (empty)
    ('Mail Zip',        "Zip"),                   # 29609
    ('Desc',            "Desc"),                  # 1,PT2,7
    ('Loc',             "Loc"),                   # 707 GORDON ST EXT
    ('Acreage',         "Acreage"),               # 0.49
    ('SqFt',            "Sq Footage"),            # 0
    ('Deed Date',       "Deed Date"),             # 05/11/2007
    ('LandUse',         "Land Use"),              # 6800
    ('Fair Market Val', "Fair Market Value"),     # (empty)
    ('Sale Price',      "Sales Price"),           # (empty)
    ('Tax Val',         "Taxable Market Value"),  # $24,500
    ('Bath',            "Num Bathrooms"),         # 0
    ('Bed',             "Num Bedrooms"),          # 0
    ('RollBack',        "Total Rollback"),        # $0.00
    ('Half Bath',       "Num Half Baths"),        # 0
    ('Assmt Class',     "Assmt Class"),           # OT
)

# Excel columns 1..N (column 0 is the running "Sequence" number).
_EXCEL_COLUMNS = (
    "MapID", "Owner1", "Owner2", "Acreage", "Mail Addr", "Mail City",
    "Mail Zip", "Desc", "Loc", "Deed Date", "Sale Price", "LandUse",
    "Bath", "Bed", "Half Bath", "SqFt", "Fair Market Val", "Tax Val",
    "RollBack", "Assmt Class",
    "2012 Outstanding", "2011 Outstanding", "2010 Outstanding",
)

# Matches the three header cells (Years Outstanding / Account No / Amount
# Outstanding) and then the three value cells of the following table, e.g.
#   <td width='33%' bgcolor='#f5f5f5' align="center"> 2011 </td>
#   <td width='33%' bgcolor='#f5f5f5' align="center"> 201100011448477001 </td>
#   <td width='34%' bgcolor='#f5f5f5' align="right"> $1,438.60 </td>
_OUTSTANDING_PATTERN = re.compile(
    r'<td .+?> Years\s*?Outstanding.+?</td>\s*?'
    r'<td .+?> Account\s*?No.+?</td>\s*?'
    r'<td .+?> Amount\s*?Outstanding.+?</td>'
    r'.+?'
    r'<td .+?>\s*?(?P<yearsOutstanding>\d+)\s*</td>\s*?'
    r'<td .+?>\s*?(?P<accountNo>\d+)\s*</td>\s*?'
    r'<td .+?>\s*?(?P<amountOutstanding>[$,\.\d]+?)\s*</td>',
    re.S)


def searchFromTwoTd(htmlToSearch, keyName, doHtmlDecode=True):
    """Return the text of the <td> that directly follows the label cell keyName.

    Expected html shape (the style attribute of the value cell is optional):
        <td style="...">Mail Addr</td>
        <td style="width:35%">224 STONE LAKE DR </td>

    htmlToSearch -- html text of the search result page
    keyName      -- label text of the first cell; matched literally
    doHtmlDecode -- when true, a non-empty value is passed through
                    crifanLib.decodeHtmlEntity(..., decodedEncoding="GBK")

    Returns the stripped value, or "" when the label is not found or the
    value cell is empty.
    """
    # keyName is escaped so that labels containing regex metacharacters are
    # still matched literally; '.*?' lets an empty value cell yield "" instead
    # of running on to the next '</td>' of the same line.
    pattern = (r'<td\s+?style=".+?">' + re.escape(str(keyName)) + r'</td>\s*?'
               r'<td(\s+?style=".+?")?>(?P<foundValue>.*?)</td>')
    foundTwoTd = re.search(pattern, htmlToSearch)
    if not foundTwoTd:
        return ""

    foundValue = foundTwoTd.group("foundValue").strip()
    if foundValue and doHtmlDecode:
        # http://fredericiana.com/2010/10/08/decoding-html-entities-to-text-in-python/
        foundValue = crifanLib.decodeHtmlEntity(foundValue, decodedEncoding="GBK")
    return foundValue


def extractCommonInfo(searchRespHtml):
    """Extract the property fields from a search result page.

    Returns a dict with one entry per row of _COMMON_INFO_FIELDS; a field
    that is missing or empty on the page maps to "".
    """
    commonInfoDict = {}
    for infoKey, pageLabel in _COMMON_INFO_FIELDS:
        commonInfoDict[infoKey] = searchFromTwoTd(searchRespHtml, pageLabel)
    return commonInfoDict


def extractOutstandingInfo(searchRespHtml):
    """Extract the outstanding-amount row from a search result page.

    Returns (yearsOutstanding, accountNo, amountOutstanding) as strings,
    e.g. ("2011", "201100011448477001", "$1,438.60"); all three are ""
    when the page does not contain such a row.
    """
    foundYearAccountAmount = _OUTSTANDING_PATTERN.search(searchRespHtml)
    logging.debug("foundYearAccountAmount=%s", foundYearAccountAmount)
    if not foundYearAccountAmount:
        return ("", "", "")

    yearsOutstanding = foundYearAccountAmount.group("yearsOutstanding")
    accountNo = foundYearAccountAmount.group("accountNo")
    amountOutstanding = foundYearAccountAmount.group("amountOutstanding")
    logging.info("yearsOutstanding=%s, accountNo=%s, amountOutstanding=%s",
                 yearsOutstanding, accountNo, amountOutstanding)
    return (yearsOutstanding, accountNo, amountOutstanding)


def getHtmlByMapIdAndYear(mapId, year):
    """Submit the real property search form for mapId and year.

    Captured form submission this mirrors:
        SelectYear=2012&txt_Name=&txt_Street=&txt_MapNo=0230000400502
        &txt_Subdiv=&B1=Submit&txt_Voided_MApNo=&SelectSalesYear=ALL
        &txt_Sales_SheetNo=

    Returns whatever crifanLib.getUrlRespHtml returns for that request
    (used by the callers as the html text of the result page).
    """
    postData = {
        'SelectYear'        : str(year),
        'txt_Name'          : "",
        'txt_Street'        : "",
        'txt_MapNo'         : str(mapId),
        'txt_Subdiv'        : "",
        'B1'                : "Submit",
        'txt_Voided_MApNo'  : "",
        'SelectSalesYear'   : "ALL",
        'txt_Sales_SheetNo' : "",
    }
    return crifanLib.getUrlRespHtml(_SEARCH_PROPERTY_URL, postData)


def processEachMapId(mapId):
    """Collect all information for one map ID.

    The 2012 result page supplies the property fields and the 2012
    outstanding amount; the 2010 and 2011 pages are fetched only for their
    outstanding amount.

    Returns a dict holding every key listed in _EXCEL_COLUMNS.
    """
    searchRespHtml = getHtmlByMapIdAndYear(mapId, 2012)
    crifanLib.printCurrentCookies()

    commonInfoDict = extractCommonInfo(searchRespHtml)
    amountOutstanding = extractOutstandingInfo(searchRespHtml)[2]
    commonInfoDict['2012 Outstanding'] = amountOutstanding

    for eachYear in [2010, 2011]:
        logging.info("eachYear=%s", eachYear)
        eachYearRespHtml = getHtmlByMapIdAndYear(mapId, eachYear)
        amountOutstanding = extractOutstandingInfo(eachYearRespHtml)[2]
        crifanLib.printCurrentCookies()
        commonInfoDict[str(eachYear) + ' Outstanding'] = amountOutstanding

    # finally add the mapId
    commonInfoDict["MapID"] = str(mapId)

    logging.debug("commonInfoDict=%s", commonInfoDict)
    return commonInfoDict


def _findHiddenInputValue(html, inputName):
    """Return the value of the hidden <input> whose name and id are inputName.

    Expected html shape:
        <input type="hidden" name="__VIEWSTATE" id="__VIEWSTATE" value="..." />

    Returns "" when no such input (with a non-empty value) is found.
    """
    escapedName = re.escape(inputName)
    foundInput = re.search(
        r'<input\s*?type="hidden"\s*?name="' + escapedName
        + r'"\s*?id="' + escapedName
        + r'"\s*?value="(?P<value>.+?)"\s*?/>',
        html)
    if foundInput:
        return foundInput.group("value")
    return ""


def downloadMap(mapId, loc):
    """Download the gcgis.org map picture for the location loc.

    The request sequence replays a captured browser session (the [n/77]
    markers are the request numbers within that capture). Two files are
    written via crifanLib.manuallyDownloadFile: the start-extent picture
    under its own name, and the picture fetched after the zoom request
    under the same name prefixed with "real_".

    mapId -- only used for logging
    loc   -- location text typed into the site's search box,
             e.g. "707 GORDON ST EXT"

    Returns None. When a required form token or the picture url cannot be
    found in a response, an error is logged and the download is skipped.
    """
    logging.info("mapId=%s, loc=%s", mapId, loc)

    # 1. get cookie: ASP.NET_SessionId
    crifanLib.getUrlRespHtml(_TITLE_WF_URL)
    crifanLib.printCurrentCookies()

    # 2. [11/77] get __VIEWSTATE/__EVENTVALIDATION of the post-back form
    postBackFormHtml = crifanLib.getUrlRespHtml(_POST_BACK_URL)
    eventValidationForGetPic = _findHiddenInputValue(postBackFormHtml, "__EVENTVALIDATION")
    viewStateForGetPic = _findHiddenInputValue(postBackFormHtml, "__VIEWSTATE")
    logging.debug("eventValidationForGetPic=%s", eventValidationForGetPic)
    logging.debug("viewStateForGetPic=%s", viewStateForGetPic)
    if not (eventValidationForGetPic and viewStateForGetPic):
        logging.error("Can not find __VIEWSTATE/__EVENTVALIDATION in %s, skip map of %s",
                      _POST_BACK_URL, mapId)
        return

    # 3. [59/77] get pic url: the response carries it in the hidden input
    #    mapImageSrc_hidden, e.g.
    #    http://www.gcgis.org/output/webmappub_zs-gisims130202756373.jpg
    postData = {
        '__VIEWSTATE'           : viewStateForGetPic,
        '__EVENTVALIDATION'     : eventValidationForGetPic,
        'govUser_hid'           : "false",
        'validUser_hid'         : "true",
        'command_hidden'        : "zoomToStartExtent",
        'mapImageHeight_hidden' : "431",
        'mapImageWidth_hidden'  : "1088",
        'xMinMap_hidden'        : "0",
        'yMinMap_hidden'        : "0",
        'xMaxMap_hidden'        : "0",
        'yMaxMap_hidden'        : "0",
        'x1_hidden'             : "0",
        'y1_hidden'             : "0",
        'x2_hidden'             : "0",
        'y2_hidden'             : "0",
        'vmlLayerID_hid'        : "0",
        'vmlObjectID_hid'       : "0",
        'numSelect_hid'         : "0",
        'sBuffer_hid'           : "0",
    }
    postBackUrlRespHtml = crifanLib.getUrlRespHtml(_POST_BACK_URL, postData)
    logging.debug("postBackUrlRespHtml=%s", postBackUrlRespHtml)

    gisImgUrl = _findHiddenInputValue(postBackUrlRespHtml, "mapImageSrc_hidden")
    logging.info("gisImgUrl=%s", gisImgUrl)
    if not gisImgUrl:
        # nothing to download; the remaining requests only exist to fetch it
        logging.error("Can not find map picture url, skip map of %s", mapId)
        return

    getImgName = gisImgUrl.split("/")[-1]
    logging.info("getImgName=%s", getImgName)
    crifanLib.manuallyDownloadFile(gisImgUrl, getImgName)

    # 4. [61/77] find __EVENTVALIDATION and __VIEWSTATE of the find form
    getFindLocUrlRespHtml = crifanLib.getUrlRespHtml(_FIND_LOC_URL)
    eventValidation = _findHiddenInputValue(getFindLocUrlRespHtml, "__EVENTVALIDATION")
    viewState = _findHiddenInputValue(getFindLocUrlRespHtml, "__VIEWSTATE")
    logging.debug("eventValidation=%s", eventValidation)
    logging.debug("viewState=%s", viewState)
    if not (eventValidation and viewState):
        logging.error("Can not find __VIEWSTATE/__EVENTVALIDATION in %s, skip map of %s",
                      _FIND_LOC_URL, mapId)
        return

    # 5. [66/77] do search
    #    loc is passed as-is (the original kept urllib.quote_plus disabled);
    #    presumably crifanLib.getUrlRespHtml url-encodes the post data itself.
    findTextBox = loc
    logging.info("findTextBox=%s", findTextBox)
    postData = {
        '__VIEWSTATE'       : viewState,
        '__EVENTVALIDATION' : eventValidation,
        'find_TextBox'      : findTextBox,  # captured: 707++GORDON+ST+EXT
        'find_Button'       : "Search",
        'govUser_hid'       : "false",
        'validUser_hid'     : "true",
        'activeLayerId_hid' : "25",
    }
    headerDict = {
        "Referer" : _FIND_LOC_URL,
    }
    postFindLocUrlRespHtml = crifanLib.getUrlRespHtml(_FIND_LOC_URL, postData, headerDict)
    logging.debug("postFindLocUrlRespHtml=%s", postFindLocUrlRespHtml)

    # 6. [76/77] get real pic
    # NOTE(review): the map extent and the x1/y1/vmlLayerID/vmlObjectID values
    # below are copied verbatim from the captured session of one property
    # (707 GORDON ST EXT). They are not derived from the search response of
    # step 5, so other locations may zoom to the wrong feature -- confirm.
    postData = {
        '__VIEWSTATE'           : viewStateForGetPic,
        '__EVENTVALIDATION'     : eventValidationForGetPic,
        'govUser_hid'           : "false",
        'validUser_hid'         : "true",
        'command_hidden'        : "zoomToFeature",
        'mapImageHeight_hidden' : "431",
        'mapImageWidth_hidden'  : "1088",
        'mapImageSrc_hidden'    : gisImgUrl,
        'xMinMap_hidden'        : "1210744.77958237",
        'yMinMap_hidden'        : "958500",
        'xMaxMap_hidden'        : "1911255.22041763",
        'yMaxMap_hidden'        : "1236000",
        'x1_hidden'             : "2000",
        'y1_hidden'             : "19661",
        'x2_hidden'             : "0",
        'y2_hidden'             : "0",
        'activeLayerId_hid'     : "25",
        'vmlLayerID_hid'        : "2000",
        'vmlObjectID_hid'       : "19661",
        'numSelect_hid'         : "0",
        'sBuffer_hid'           : "0",
        'addBuffer_hid'         : "false",
        'compsMapList_hid'      : "undefined",
    }
    postBackUrlForGetPicRespHtml = crifanLib.getUrlRespHtml(_POST_BACK_URL, postData)
    logging.debug("postBackUrlForGetPicRespHtml=%s", postBackUrlForGetPicRespHtml)

    # Download the same url a second time: per the original author, after the
    # zoom request this url serves the picture that is actually wanted.
    realImgName = "real_" + getImgName
    logging.info("realImgName=%s", realImgName)
    crifanLib.manuallyDownloadFile(gisImgUrl, realImgName)
    logging.info("Download real pic OK")
    return


def outputInfoDictList(allInfoDictList):
    """Write all info dicts to an Excel file, then download each map picture.

    allInfoDictList -- list of dicts as returned by processEachMapId

    The workbook "extractedRealPropertyInfo.xls" gets one header row and one
    row per dict; column 0 holds the 1-based sequence number as text. The
    workbook is saved before any map download starts, so a failing download
    cannot lose the extracted data.
    """
    excelFilename = "extractedRealPropertyInfo.xls"

    # background colour via easyxf, see
    # https://groups.google.com/forum/?fromgroups=#!topic/python-excel/8kCUw2y8PrU
    styleBlueBkg = xlwt.easyxf('pattern: pattern solid, fore_colour sky_blue;')
    styleBold = xlwt.easyxf('font: bold on')

    wb = xlwt.Workbook()
    ws = wb.add_sheet('realPropertyInfo')

    # write header: "Sequence" and "MapID" are highlighted, the rest is bold
    ws.write(0, 0, "Sequence", styleBlueBkg)
    for colOffset, colName in enumerate(_EXCEL_COLUMNS):
        if colName == "MapID":
            headerStyle = styleBlueBkg
        else:
            headerStyle = styleBold
        ws.write(0, colOffset + 1, colName, headerStyle)

    # output extracted info
    logging.info("Outputing extracted info to excel file %s", excelFilename)
    for index, eachInfoDict in enumerate(allInfoDictList):
        number = index + 1
        ws.write(number, 0, str(number))
        for colOffset, colName in enumerate(_EXCEL_COLUMNS):
            ws.write(number, colOffset + 1, eachInfoDict[colName])
    wb.save(excelFilename)

    # fetch map
    for eachInfoDict in allInfoDictList:
        downloadMap(eachInfoDict['MapID'], eachInfoDict['Loc'])
    return


def main():
    """Process the hard-coded map IDs and output the collected information."""
    crifanLib.initAutoHandleCookies()

    mapIdList = [
        "0230000400502",
        "0230000300400",
        "0230000509400",
    ]
    allInfoDictList = [processEachMapId(eachMapId) for eachMapId in mapIdList]

    outputInfoDictList(allInfoDictList)


###############################################################################
if __name__ == "__main__":
    scriptSelfName = crifanLib.extractFilename(sys.argv[0])

    logging.basicConfig(
        level    = logging.DEBUG,
        format   = 'LINE %(lineno)-4d  %(levelname)-8s %(message)s',
        datefmt  = '%m-%d %H:%M',
        filename = scriptSelfName + ".log",
        filemode = 'w')
    # define a Handler which writes INFO messages or higher to the sys.stderr
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    # set a format which is simpler for console use
    formatter = logging.Formatter('LINE %(lineno)-4d : %(levelname)-8s %(message)s')
    # tell the handler to use this format
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)
    try:
        main()
    except Exception:
        # record the traceback in the log file, then let the error propagate
        logging.exception("Unknown Error !")
        raise
【总结】
转载请注明:在路上 » 【代码分享】Python代码:download_gcgis_map_pic – 从gcgis.org中的地图图片中提取信息并保存到excel文件