【记录】写了个Python脚本去从qisuu网站下载电子书

全部内容已移至：
DownloadQisuuFile – 下载奇书网(qisuu.com)的电子书文件 v1.0
本来是去帮别人下载电子书的，需要手动一个个的点，烦死了，而且广告一堆。
索性，自己写了个脚本，去下载对应的电子书。
后来又反复修补bug，支持更多类型的下载地址等等，目前内容如下：
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------------------------------------
Function:
download txt ebook from:
Type1: 奇书网 → 女频言情 → 穿越架空 → 电子书列表
http://www.qisuu.com/soft/sort03/sort039/list39_2.html
such as:
http://www.qisuu.com/Shtml27341.html
->
http://dzs.qisuu.com/2012121606.rar
also rename to its title: 《所遇非淑》全集.rar
 
Type2: 奇书网 → 武侠仙侠 → 电子书列表
http://www.qisuu.com/soft/sort02/list2_1.html
 
such as:
http://www.qisuu.com/Shtml27681.html
->
http://dzs.qisuu.com/2013020206.rar
also rename to its title: 《洪荒之证道不朽》全集.rar
 
TODO:
totalPageNum -> should extract out
settings support:
typeStartUrl
startPageNum
downloadFolderName -> WuXiaXianXia, ChuanYueJiaKong
 
eg:
download_qisuu_ebook.py -s http://www.qisuu.com/soft/sort03/sort039/list39_1.html -n 12 -d ChuanYueJiaKong
download_qisuu_ebook.py -s http://www.qisuu.com/soft/sort02/list2_1.html -d WuXiaXianXia
 
 
 
 
Version:    2013-02-04
Author:     Crifan Li
Contact:    admin@crifan.com
 
-------------------------------------------------------------------------------
"""
#--------------------------------const values-----------------------------------
 
gConst = {
 
};
 
gCfg = {
    'downloadFolder'    : None,
};
 
gVal = {
    'mainPreUrl'    : None,
};
 
#---------------------------------import---------------------------------------
import os;
import re;
import sys;
sys.path.append("libs");
from BeautifulSoup import BeautifulSoup,Tag,CData;
import crifanLib;
import logging;
import argparse;
 
# import urllib;
# import json;
# import csv;
# import codecs;
 
     
def main():
    newParser = argparse.ArgumentParser(description="Download (ebook) file from qisuu");
    newParser.add_argument("-s", "--startTypeUrl", dest="startTypeUrl", help="start url of type. eg: http://www.qisuu.com/soft/sort03/sort039/list39_1.html, http://www.qisuu.com/soft/sort02/list2_1.html");
    newParser.add_argument("-n", "--startPageNum", dest="startPageNum", type=int, default=1, help="start page number");
    newParser.add_argument("-d", "--downloadFolder", dest="downloadFolder", default="download", help="foler name to store downloaded files");
    args = newParser.parse_args();
    argsDict = args.__dict__;
    for eachArg in argsDict.keys():
        exec(eachArg + " = args." + eachArg);
 
    logging.info("startTypeUrl=%s", startTypeUrl);
    logging.info("startPageNum=%d", startPageNum);
    logging.info("downloadFolder=%s", downloadFolder);
 
    gConst['downloadFolder'] = downloadFolder;
 
    foundMainPrefUrl = re.search("(?P<mainPreUrl>http://www\.qisuu\.com/[\w/]+/list\d+_)\d+.html", startTypeUrl);
    logging.debug("foundMainPrefUrl=%s", foundMainPrefUrl);
    if(foundMainPrefUrl):
        mainPreUrl = foundMainPrefUrl.group("mainPreUrl");
        logging.info("mainPreUrl=%s", mainPreUrl);
        gVal['mainPreUrl'] = mainPreUrl;
    else:
        logging.error("Can Not found main prefix url from %s", startTypeUrl);
        sys.exit(-1);
 
    #init
    if(os.path.isdir(gConst['downloadFolder']) == False):
        os.makedirs(gConst['downloadFolder']);# create dir recursively
 
    #extract total page number
    respHtml = crifanLib.getUrlRespHtml(startTypeUrl);
    #logging.debug("respHtml=%s", respHtml);
    respHtmlUni = respHtml.decode("GBK", 'ignore');
    #       <td class="tablebody1">&nbsp;<a href="list39_72.html" title="尾页"><img border="0" src="/images/Last.gif" /></a>&nbsp;</td>
    foundTotalPageNum = re.search(u'<a\s+href="list\d+_(?P<totalPageNum>\d+).html"\s+title="尾页">', respHtmlUni);
    logging.debug("foundTotalPageNum=%s", foundTotalPageNum);
    if(foundTotalPageNum):
        totalPageNum = foundTotalPageNum.group("totalPageNum");
        logging.info("totalPageNum=%s", totalPageNum);
        totalPageNum = int(totalPageNum);
    else:
        logging.error("Can Not found total page number from %s resp html:\n%s", startTypeUrl, respHtml);
        sys.exit(-2);
 
         
    #for num in range(1, totalPageNum+1):
    for pageNum in range(startPageNum, totalPageNum+1):
        logging.info("============== page=%d ==============", pageNum);
        #http://www.qisuu.com/soft/sort03/sort039/list39_1.html
        #eachPageUrl = "http://www.qisuu.com/soft/sort03/sort039/list39_"+str(pageNum)+".html";
        eachPageUrl = gVal['mainPreUrl'] + str(pageNum) + ".html";
        logging.info("eachPageUrl=%s", eachPageUrl);
        pageRespHtml = crifanLib.getUrlRespHtml(eachPageUrl);
        #logging.debug("pageRespHtml=%s", pageRespHtml);
        # <div class="mainListInfo">
            # <div class="mainListName"><span class="mainSoftName"><a href="/Shtml27341.html" title="《所遇非淑》全集">《所遇非淑》全集</a></span></div><div class="mainListSize">2.06 MB</div><div class="mainListDate"><span class="oldDate"><span class="oldDate">2012-12-16</span></span></div><div class="mainListHist">Jar+TXT版</div>
        # </div>
        soup = BeautifulSoup(pageRespHtml, fromEncoding="GBK");
        foundAllMainList = soup.findAll(name="span", attrs={"class":"mainSoftName"});
        logging.debug("foundAllMainList=%s", foundAllMainList);
        mainListLen = len(foundAllMainList);
        logging.info("mainListLen=%s", mainListLen);
        for urlIdx,eachMainList in enumerate(foundAllMainList):
            urlNum = urlIdx + 1;
            logging.info("-------------- page=%d, url=%d --------------", pageNum, urlNum);
            logging.debug("eachMainList=%s", eachMainList);
            href = eachMainList.a['href'];
            logging.debug("href=%s", href);
            #http://www.qisuu.com/Shtml27667.html
            eachFileUrl = "http://www.qisuu.com" + href;
             
            logging.info("eachFileUrl=%s", eachFileUrl);
            fileRespHtml = crifanLib.getUrlRespHtml(eachFileUrl);
            #logging.debug("fileRespHtml=%s", fileRespHtml);
            soup = BeautifulSoup(fileRespHtml, fromEncoding="GBK");
             
            h1 = soup.h1.string;
            logging.info("h1=%s", h1);
            ebooName = h1 + ".rar";
             
            # <img src="/skin/newasp/download.gif"> <A oncontextmenu=ThunderNetwork_SetHref(this) onclick='return OnDownloadClick_Simple(this,2)' href='#' thunderResTitle='http://dzs.qisuu.com/2013012903.rar' thunderType='' thunderPid='02503' thunderHref='thunder://QUFodHRwOi8vZHpzLnFpc3V1LmNvbS8yMDEzMDEyOTAzLnJhclpa'class=downLinks>迅雷专用高速下载点</A><br><img src=/skin/newasp/download.gif> <A  href='http://dzs.qisuu.com/2013012903.rar'><strong>本站下载地址</strong></A>
            # </div></div>
            #foundEbookAddress = re.search("thunderResTitle='(?P<ebookAddress>http://dzs\.qisuu\.com/\d+\.rar)'", fileRespHtml);
             
            #http://www.qisuu.com/Shtml22388.html
            #http://dzs.qisuu.com/tiansyiduity.rar
            #foundEbookAddress = re.search("thunderResTitle='(?P<ebookAddress>http://dzs\.qisuu\.com/\w+\.rar)'", fileRespHtml);
             
            #http://www.qisuu.com/Shtml23411.html
            #<img src="/skin/newasp/download.gif"> <A oncontextmenu=ThunderNetwork_SetHref(this) onclick='return OnDownloadClick_Simple(this,2)' href='#' thunderResTitle='/soft/download.asp?softid=23411&downid=0&id=67531' thunderType='' thunderPid='02503' thunderHref='thunder://QUEvc29mdC9kb3dubG9hZC5hc3A/c29mdGlkPTIzNDExJmRvd25pZD0wJmlkPTY3NTMxWlo='class=downLinks>迅雷专用高速下载点</A><br><img src=/skin/newasp/download.gif> <A  href='/soft/download.asp?softid=23411&downid=0&id=67531'><strong>本站下载地址</strong></A>
            foundEbookAddress = re.search("thunderResTitle='(?P<ebookAddress>[^']+)'", fileRespHtml);
             
            logging.debug("foundEbookAddress=%s", foundEbookAddress);
             
            if(foundEbookAddress):
                #http://dzs.qisuu.com/2013012903.rar
                #http://dzs.qisuu.com/tiansyiduity.rar
                ebookAddress = foundEbookAddress.group("ebookAddress");
                logging.info("ebookAddress=%s", ebookAddress);
                 
                if(re.match("/soft/download\.asp\?", ebookAddress)):
                    #find out real ebook address
                    #http://www.qisuu.com/Shtml23411.html
                    #->
                    #http://www.qisuu.com/soft/download.asp?softid=23411&downid=0&id=67531
                    #it allow download
                    #actually it will auto direct to:
                    #http://dl.wrshu.com:111/moqiqxdxz.rar
                    downloadAddress = "http://www.qisuu.com" + ebookAddress;
                    logging.info("downloadAddress=%s", downloadAddress);
                    fixedEbookAddress = downloadAddress;
                    logging.info("Found partial ebook address, so fix it to: %s", fixedEbookAddress);
                elif(re.match("http://dzs\.qisuu\.com/", ebookAddress)):
                    fixedEbookAddress = ebookAddress;
                else:
                    logging.error("Can Not recognize this kind of ebook download address %s", ebookAddress);
                    logging.debug("fileRespHtml=%s", fileRespHtml);
                    continue;
 
                #for
                #http://www.qisuu.com/Shtml26634.html
                #title is: 《神魔手下好当差/穿越之傀儡娃娃》全集
                ebookFullName = os.path.join(gConst['downloadFolder'], crifanLib.removeInvalidCharInFilename(ebooName, '_'));
                logging.info("dowloadinging ebookFullName=%s", ebookFullName);
                crifanLib.downloadFile(fixedEbookAddress, ebookFullName, True);
                #crifanLib.downloadFile(fixedEbookAddress, ebookFullName);
            else:
                logging.warning("Not found ebook address for url=%s", eachFileUrl);
                logging.debug("record its fileRespHtml=\n%s", fileRespHtml);
                 
                if(foundEbookAddress == None):
                    #http://www.qisuu.com/Shtml23542.html
                    logging.info(u"this url=%s may be: 此电子书已删除，暂不提供下载", eachFileUrl);
     
###############################################################################
if __name__=="__main__":
    scriptSelfName = crifanLib.extractFilename(sys.argv[0]);
 
    logging.basicConfig(
                    level    = logging.DEBUG,
                    format   = 'LINE %(lineno)-4d  %(levelname)-8s %(message)s',
                    datefmt  = '%m-%d %H:%M',
                    filename = scriptSelfName + ".log",
                    filemode = 'w');
    # define a Handler which writes INFO messages or higher to the sys.stderr
    console = logging.StreamHandler();
    console.setLevel(logging.INFO);
    # set a format which is simpler for console use
    formatter = logging.Formatter('LINE %(lineno)-4d : %(levelname)-8s %(message)s');
    # tell the handler to use this format
    console.setFormatter(formatter);
    logging.getLogger('').addHandler(console);
    try:
        main();
    except:
        logging.exception("Unknown Error !");
        raise;
与本文相关的文章

订阅在路上