你的位置：在路上 > 工作和技术 > 随便写点python代码，实现产品信息解析并输出

随便写点python代码，实现产品信息解析并输出

工作和技术 crifan 13年前 (2012-07-24) 2174浏览 0评论

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
【作者】 www.crifan.com
【功能】
实现：
http://topic.csdn.net/u/20120520/05/a029e241-22a1-4acf-aa58-b3e306e333f3.html
中的需求。
【说明】
请将如下内容:
<?xml version="1.0" encoding="utf-8"?>
 
<config>
    <ProductDetail brand="品胜" category="首页|电脑整机首页|电脑整机|笔记本配件" id="106537" model="S195B90" price="199.0" productID="189709" siteName="360buy" title="品胜笔记本电脑电源适配器S195B90 适用于索尼笔记本电脑" url="http://www.360buy.com/product/189709.html"/>
     
    <ProductDetail brand="维氏" category="首页|日用百货首页|礼品箱包|瑞士军刀" color="黄色" id="106538" model="救生员0.8623.MWN(荧光刀柄带尼龙刀套)" price="409.0" productID="189704" siteName="360buy" title="瑞士军刀救生员0.8623.MWN(荧光刀柄带尼龙刀套)" url="http://www.360buy.com/product/189704.html"/>
     
    <ProductDetail brand="" category="首页|日用百货首页|礼品箱包|中国军刀" color="黑色" id="106539" model="救生员0.9999.MWN(荧光刀柄带尼龙刀套)" price="499.0" productID="189705" siteName="360buy" title="瑞士军刀救生员0.9999.MWN(荧光刀柄带尼龙刀套)" url="http://www.360buy.com/product/189705.html"/>
</config>
 
存为UTF-8格式的productConfig.xml，即可实现你的需求，输出三个文件了
"""
 
import os;
import re;
import codecs;
 
# Parsed products keyed by their "id" attribute (id -> singleProductDict).
# Filled in by main().
productListDict = {}

configFileName = "productConfig.xml"

brandFileName = "brands.txt"
titlesFileName = "titles.txt"
emptyBrandTitlesFileName = "emptyBrandTitlesFileName.txt"

# Loose pattern: locates every <ProductDetail .../> element in the raw text.
# "." does not match newlines here, so an element must sit on a single line.
productItemPattern = re.compile(
    r'<ProductDetail.*?category=".+?".+?id="\d+" model=".+?"'
    r' price="\d+?\.\d+?" productID="\d+" siteName=".+?"'
    r' title=".+?" url=".+?"/>'
)

# Strict pattern: extracts the individual attributes of one element.
# "brand" and "color" are optional; every other attribute is required and
# must appear in exactly this order.
productFieldsPattern = re.compile(
    r'<ProductDetail( brand="(?P<brand>.*?)")?'
    r' category="(?P<category>.+?)"'
    r'( color="(?P<color>.+?)")?'
    r' id="(?P<id>\d+)"'
    r' model="(?P<model>.+?)"'
    r' price="(?P<price>\d+?\.\d+?)"'
    r' productID="(?P<productID>\d+)"'
    r' siteName="(?P<siteName>.+?)"'
    r' title="(?P<title>.+?)"'
    r' url="(?P<url>.+?)"/>'
)


def parseProductItem(productItemStr):
    """Parse one ``<ProductDetail .../>`` string into a product dict.

    Returns a dict with the keys ``brand``, ``category``, ``color``, ``id``,
    ``model``, ``price``, ``productID``, ``siteName``, ``title``, ``url`` and
    ``wholeItemStr`` (the matched element text).  Optional attributes that
    are absent are stored as ``None``.  Returns ``None`` when the string does
    not match the strict attribute pattern.
    """
    foundProductInfo = productFieldsPattern.search(productItemStr)
    print("foundProductInfo= %s" % (foundProductInfo,))
    if not foundProductInfo:
        return None

    # groupdict() yields exactly the ten named attributes of the pattern.
    singleProductDict = foundProductInfo.groupdict()
    print(
        "brand=%(brand)s,category=%(category)s,color=%(color)s,id=%(id)s,"
        "model=%(model)s,price=%(price)s,productID=%(productID)s,"
        "siteName=%(siteName)s,title=%(title)s,url=%(url)s,"
        % singleProductDict
    )
    singleProductDict['wholeItemStr'] = foundProductInfo.group(0)
    return singleProductDict


def parseProductItems(productItemList):
    """Parse element strings into product dicts, in first-seen order.

    Items that do not match the strict pattern are skipped.  When two items
    share the same ``id`` the later one replaces the earlier one but keeps
    the earlier position, so the result is deterministic on every Python
    version instead of depending on dict iteration order.
    """
    productsById = {}
    idOrder = []
    for eachProduct in productItemList:
        singleProductDict = parseProductItem(eachProduct)
        if singleProductDict is None:
            continue
        productId = singleProductDict['id']
        if productId not in productsById:
            idOrder.append(productId)
        productsById[productId] = singleProductDict
    return [productsById[productId] for productId in idOrder]


def writeProductFiles(products,
                      brandPath=brandFileName,
                      titlesPath=titlesFileName,
                      emptyBrandTitlesPath=emptyBrandTitlesFileName):
    """Append brands and titles of *products* to the three output files.

    For a product with a non-empty brand, the brand goes to *brandPath* and
    the title to *titlesPath*.  For a product whose brand is empty or
    missing, the title goes to *emptyBrandTitlesPath*.  Each entry is
    terminated with CRLF.  The files are opened in append mode, so existing
    content is kept, and they are closed even if a write fails.
    """
    with codecs.open(brandPath, 'a+', 'utf-8') as brandFile, \
            codecs.open(titlesPath, 'a+', 'utf-8') as productFile, \
            codecs.open(emptyBrandTitlesPath, 'a+', 'utf-8') as emptyBrandTitlesFile:
        for i, singleProductDict in enumerate(products):
            print("---[%d]---" % i)
            brand = singleProductDict['brand']
            print("brand= %s" % brand)
            if brand:
                brandFile.write(brand)
                brandFile.write("\r\n")

                productFile.write(singleProductDict['title'])
                productFile.write("\r\n")
            else:
                emptyBrandTitlesFile.write(singleProductDict['title'])
                emptyBrandTitlesFile.write("\r\n")


def main():
    """Read the config file, parse its products and write the output files."""
    print("input config file name is %s" % configFileName)
    # The context manager closes the config file, which the script
    # previously left open.
    with codecs.open(configFileName, 'r', 'utf-8') as cfgFile:
        cfgUni = cfgFile.read()
    print("cfgUni= %s" % cfgUni)
    print("type(cfgUni)= %s" % type(cfgUni))

    foundProductList = productItemPattern.findall(cfgUni)
    print("foundProductList= %s" % (foundProductList,))
    print("len(foundProductList)= %d" % len(foundProductList))
    if not foundProductList:
        # Nothing looked like a product: do not create the output files.
        return

    orderedProducts = parseProductItems(foundProductList)
    for singleProductDict in orderedProducts:
        productListDict[singleProductDict['id']] = singleProductDict

    writeProductFiles(orderedProducts)


if __name__ == "__main__":
    main()

转载请注明：在路上 » 随便写点python代码，实现产品信息解析并输出

Post Views: 899

与本文相关的文章

分类目录

82 queries in 0.488 seconds, using 22.18MB memory