#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
Parse <ProductDetail .../> entries from a config file and split them into
three text files.

Author:
    www.crifan.com

Purpose:
    Implements the request described at
    http://topic.csdn.net/u/20120520/05/a029e241-22a1-4acf-aa58-b3e306e333f3.html

Usage:
    Save content such as the following as a UTF-8 encoded file named
    productConfig.xml in the working directory, then run this script:

    <?xml version="1.0" encoding="utf-8"?>
    <config>
    <ProductDetail brand="品胜" category="首页|电脑整机首页|电脑整机|笔记本配件" id="106537" model="S195B90" price="199.0" productID="189709" siteName="360buy" title="品胜笔记本电脑电源适配器S195B90 适用于索尼笔记本电脑" url="http://www.360buy.com/product/189709.html"/>
    <ProductDetail brand="维氏" category="首页|日用百货首页|礼品箱包|瑞士军刀" color="黄色" id="106538" model="救生员0.8623.MWN(荧光刀柄带尼龙刀套)" price="409.0" productID="189704" siteName="360buy" title="瑞士军刀救生员0.8623.MWN(荧光刀柄带尼龙刀套)" url="http://www.360buy.com/product/189704.html"/>
    <ProductDetail brand="" category="首页|日用百货首页|礼品箱包|中国军刀" color="黑色" id="106539" model="救生员0.9999.MWN(荧光刀柄带尼龙刀套)" price="499.0" productID="189705" siteName="360buy" title="瑞士军刀救生员0.9999.MWN(荧光刀柄带尼龙刀套)" url="http://www.360buy.com/product/189705.html"/>
    </config>

Output:
    Three files are written. Each is opened in append mode, so running the
    script again adds to the existing content. Every line ends with CRLF.

    brands.txt                    brand of each product with a non-empty brand
    titles.txt                    title of those same products, line for line
    emptyBrandTitlesFileName.txt  title of each product whose brand is empty
                                  or absent
"""

import os
import re
import codecs
from collections import OrderedDict

# Maps product id -> singleProductDict; filled in by main().
productListDict = OrderedDict()

configFileName = "productConfig.xml"
brandFileName = "brands.txt"
titlesFileName = "titles.txt"
emptyBrandTitlesFileName = "emptyBrandTitlesFileName.txt"

# First pass: find every complete <ProductDetail .../> element in the text.
# '.' does not match a newline, so an element must sit on a single line.
_PRODUCT_ELEMENT_RE = re.compile(
    r'<ProductDetail.*?category=".+?".+?id="\d+" model=".+?" '
    r'price="\d+?\.\d+?" productID="\d+" siteName=".+?" title=".+?" '
    r'url=".+?"/>'
)

# Second pass: pull the individual attributes out of one element.
# The brand and color attributes are optional; the attribute order is fixed.
_PRODUCT_DETAIL_RE = re.compile(
    r'<ProductDetail( brand="(?P<brand>.*?)")? category="(?P<category>.+?)"'
    r'( color="(?P<color>.+?)")? id="(?P<id>\d+)" model="(?P<model>.+?)" '
    r'price="(?P<price>\d+?\.\d+?)" productID="(?P<productID>\d+)" '
    r'siteName="(?P<siteName>.+?)" title="(?P<title>.+?)" '
    r'url="(?P<url>.+?)"/>'
)

# Attribute names in the order they are echoed to stdout.
_FIELD_NAMES = (
    "brand", "category", "color", "id", "model",
    "price", "productID", "siteName", "title", "url",
)


def _parseProducts(cfgUni):
    """Return an OrderedDict of product id -> attribute dict found in cfgUni.

    Each attribute dict holds the ten named attributes (optional ones that
    are absent are None) plus 'wholeItemStr', the full matched element text.
    Entries keep the order in which ids first appear; a repeated id
    overwrites the earlier entry's data.
    """
    products = OrderedDict()
    foundProductList = _PRODUCT_ELEMENT_RE.findall(cfgUni)
    print("foundProductList= %s" % (foundProductList,))
    print("len(foundProductList)= %s" % len(foundProductList))

    for eachProduct in foundProductList:
        foundProductInfo = _PRODUCT_DETAIL_RE.search(eachProduct)
        print("foundProductInfo= %s" % (foundProductInfo,))
        if not foundProductInfo:
            # Element matched the loose pattern but not the strict one; skip.
            continue

        fields = foundProductInfo.groupdict()
        print("brand=%s,category=%s,color=%s,id=%s,model=%s,price=%s,"
              "productID=%s,siteName=%s,title=%s,url=%s,"
              % tuple(fields[name] for name in _FIELD_NAMES))

        singleProductDict = dict(fields,
                                 wholeItemStr=foundProductInfo.group(0))
        products[fields["id"]] = singleProductDict

    return products


def _writeProductFiles(products, brandPath, titlesPath, emptyBrandTitlesPath):
    """Append each product's brand/title to the appropriate output file.

    Products with a non-empty brand contribute one line to the brand file
    and one line to the titles file; all others contribute their title to
    the empty-brand file.
    """
    # 'with' guarantees all three handles are closed even if a write fails.
    with codecs.open(brandPath, 'a+', 'utf-8') as brandFile, \
            codecs.open(titlesPath, 'a+', 'utf-8') as productFile, \
            codecs.open(emptyBrandTitlesPath, 'a+', 'utf-8') \
            as emptyBrandTitlesFile:
        for i, productId in enumerate(products):
            print("---[%d]---" % i)
            singleProductDict = products[productId]
            brand = singleProductDict['brand']
            print("brand= %s" % brand)
            if brand:
                brandFile.write(brand)
                brandFile.write("\r\n")
                productFile.write(singleProductDict['title'])
                productFile.write("\r\n")
            else:
                emptyBrandTitlesFile.write(singleProductDict['title'])
                emptyBrandTitlesFile.write("\r\n")


def main(configPath=configFileName,
         brandPath=brandFileName,
         titlesPath=titlesFileName,
         emptyBrandTitlesPath=emptyBrandTitlesFileName):
    """Read the config file, parse its products and write the output files.

    Args:
        configPath: UTF-8 encoded input file containing ProductDetail elements.
        brandPath: file that receives the brands.
        titlesPath: file that receives the titles of branded products.
        emptyBrandTitlesPath: file that receives the titles of products
            without a brand.

    Returns:
        OrderedDict mapping product id to its attribute dict. The same
        entries are also merged into the module-level productListDict.

    Raises:
        IOError/OSError: if the config file cannot be read or an output
            file cannot be opened.
    """
    print("input config file name is %s" % configPath)
    with codecs.open(configPath, 'r', 'utf-8') as cfgFile:
        cfgUni = cfgFile.read()
    print("cfgUni= %s" % cfgUni)
    print("type(cfgUni)= %s" % type(cfgUni))

    products = _parseProducts(cfgUni)
    productListDict.update(products)
    _writeProductFiles(products, brandPath, titlesPath, emptyBrandTitlesPath)
    return products


if __name__ == "__main__":
    main()
# 转载请注明:在路上 » 随便写点python代码,实现产品信息解析并输出