DownloadQisuuFile – Download ebook files from Qisuu (qisuu.com) v1.0
Originally I was downloading ebooks for someone else. Doing it by hand meant clicking through them one by one, which was maddening, and the site is stuffed with ads.
So I simply wrote a script to download the ebooks for me.
Later I kept patching bugs, added support for more kinds of download addresses, and so on. The current version is as follows:
#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
-------------------------------------------------------------------------------
Function:
download txt ebook from:

Type1: 奇书网 → 女频言情 → 穿越架空 → 电子书列表
    such as:
    ->
    also rename to its title: 《所遇非淑》全集.rar

Type2: 奇书网 → 武侠仙侠 → 电子书列表
    such as:
    ->
    also rename to its title: 《洪荒之证道不朽》全集.rar

TODO:
totalPageNum -> should extract out

settings support:
typeStartUrl
startPageNum
downloadFolderName -> WuXiaXianXia, ChuanYueJiaKong

eg:
download_qisuu_ebook.py -s http://www.qisuu.com/soft/sort03/sort039/list39_1.html -n 12 -d ChuanYueJiaKong
download_qisuu_ebook.py -s http://www.qisuu.com/soft/sort02/list2_1.html -d WuXiaXianXia

Version:    2013-02-04
Author:     Crifan Li
Contact:    admin@crifan.com
-------------------------------------------------------------------------------
"""

#--------------------------------const values-----------------------------------
gConst = {
};

gCfg = {
    'downloadFolder'    : None,
};

gVal = {
    'mainPreUrl'        : None,
};

#---------------------------------import---------------------------------------
import os;
import re;
import sys;
sys.path.append("libs");
from BeautifulSoup import BeautifulSoup, Tag, CData;
import crifanLib;
import logging;
import argparse;
# import urllib;
# import json;
# import csv;
# import codecs;

def main():
    newParser = argparse.ArgumentParser(description="Download (ebook) file from qisuu");
    newParser.add_argument("-s", "--startTypeUrl", dest="startTypeUrl", help="start url of type. eg: http://www.qisuu.com/soft/sort03/sort039/list39_1.html, http://www.qisuu.com/soft/sort02/list2_1.html");
    newParser.add_argument("-n", "--startPageNum", dest="startPageNum", type=int, default=1, help="start page number");
    newParser.add_argument("-d", "--downloadFolder", dest="downloadFolder", default="download", help="foler name to store downloaded files");
    args = newParser.parse_args();
    argsDict = args.__dict__;
    for eachArg in argsDict.keys():
        exec(eachArg + " = args." + eachArg);
    logging.info("startTypeUrl=%s", startTypeUrl);
    logging.info("startPageNum=%d", startPageNum);
    logging.info("downloadFolder=%s", downloadFolder);

    gConst['downloadFolder'] = downloadFolder;

    foundMainPrefUrl = re.search("(?P<mainPreUrl>http://www\.qisuu\.com/[\w/]+/list\d+_)\d+.html", startTypeUrl);
    logging.debug("foundMainPrefUrl=%s", foundMainPrefUrl);
    if(foundMainPrefUrl):
        mainPreUrl = foundMainPrefUrl.group("mainPreUrl");
        logging.info("mainPreUrl=%s", mainPreUrl);
        gVal['mainPreUrl'] = mainPreUrl;
    else:
        logging.error("Can Not found main prefix url from %s", startTypeUrl);
        sys.exit(-1);

    #init
    if(os.path.isdir(gConst['downloadFolder']) == False):
        os.makedirs(gConst['downloadFolder']); # create dir recursively

    #extract total page number
    respHtml = crifanLib.getUrlRespHtml(startTypeUrl);
    #logging.debug("respHtml=%s", respHtml);
    respHtmlUni = respHtml.decode("GBK", 'ignore');
    # <td class="tablebody1"> <a href="list39_72.html" title="尾页"><img border="0" src="/images/Last.gif" /></a> </td>
    foundTotalPageNum = re.search(u'<a\s+href="list\d+_(?P<totalPageNum>\d+).html"\s+title="尾页">', respHtmlUni);
    logging.debug("foundTotalPageNum=%s", foundTotalPageNum);
    if(foundTotalPageNum):
        totalPageNum = foundTotalPageNum.group("totalPageNum");
        logging.info("totalPageNum=%s", totalPageNum);
        totalPageNum = int(totalPageNum);
    else:
        logging.error("Can Not found total page number from %s resp html:\n%s", startTypeUrl, respHtml);
        sys.exit(-2);

    #for num in range(1, totalPageNum+1):
    for pageNum in range(startPageNum, totalPageNum + 1):
        logging.info("============== page=%d ==============", pageNum);
        #eachPageUrl = "http://www.qisuu.com/soft/sort03/sort039/list39_"+str(pageNum)+".html";
        eachPageUrl = gVal['mainPreUrl'] + str(pageNum) + ".html";
        logging.info("eachPageUrl=%s", eachPageUrl);
        pageRespHtml = crifanLib.getUrlRespHtml(eachPageUrl);
        #logging.debug("pageRespHtml=%s", pageRespHtml);

        # <div class="mainListInfo">
        # <div class="mainListName"><span class="mainSoftName"><a href="/Shtml27341.html" title="《所遇非淑》全集">《所遇非淑》全集</a></span></div><div class="mainListSize">2.06 MB</div><div class="mainListDate"><span class="oldDate"><span class="oldDate">2012-12-16</span></span></div><div class="mainListHist">Jar+TXT版</div>
        # </div>
        soup = BeautifulSoup(pageRespHtml, fromEncoding="GBK");
        foundAllMainList = soup.findAll(name="span", attrs={"class":"mainSoftName"});
        logging.debug("foundAllMainList=%s", foundAllMainList);
        mainListLen = len(foundAllMainList);
        logging.info("mainListLen=%s", mainListLen);
        for urlIdx, eachMainList in enumerate(foundAllMainList):
            urlNum = urlIdx + 1;
            logging.info("-------------- page=%d, url=%d --------------", pageNum, urlNum);
            logging.debug("eachMainList=%s", eachMainList);
            href = eachMainList.a['href'];
            logging.debug("href=%s", href);
            # the original assignment line was lost when links were stripped; assumed reconstruction:
            # href is site-relative, eg /Shtml27341.html, so prepend the site domain
            eachFileUrl = "http://www.qisuu.com" + href;
            logging.info("eachFileUrl=%s", eachFileUrl);

            fileRespHtml = crifanLib.getUrlRespHtml(eachFileUrl);
            #logging.debug("fileRespHtml=%s", fileRespHtml);
            soup = BeautifulSoup(fileRespHtml, fromEncoding="GBK");
            h1 = soup.h1.string;
            logging.info("h1=%s", h1);
            ebooName = h1 + ".rar";

            # <img src="/skin/newasp/download.gif"> <A oncontextmenu=ThunderNetwork_SetHref(this) onclick='return OnDownloadClick_Simple(this,2)' href='#' thunderResTitle='http://dzs.qisuu.com/2013012903.rar' thunderType='' thunderPid='02503' thunderHref='thunder://QUFodHRwOi8vZHpzLnFpc3V1LmNvbS8yMDEzMDEyOTAzLnJhclpa'class=downLinks>迅雷专用高速下载点</A><br><img src=/skin/newasp/download.gif> <A href='http://dzs.qisuu.com/2013012903.rar'><strong>本站下载地址</strong></A>
            # </div></div>
            #foundEbookAddress = re.search("thunderResTitle='(?P<ebookAddress>http://dzs\.qisuu\.com/\d+\.rar)'", fileRespHtml);
            #foundEbookAddress = re.search("thunderResTitle='(?P<ebookAddress>http://dzs\.qisuu\.com/\w+\.rar)'", fileRespHtml);
            #<img src="/skin/newasp/download.gif"> <A oncontextmenu=ThunderNetwork_SetHref(this) onclick='return OnDownloadClick_Simple(this,2)' href='#' thunderResTitle='/soft/download.asp?softid=23411&downid=0&id=67531' thunderType='' thunderPid='02503' thunderHref='thunder://QUEvc29mdC9kb3dubG9hZC5hc3A/c29mdGlkPTIzNDExJmRvd25pZD0wJmlkPTY3NTMxWlo='class=downLinks>迅雷专用高速下载点</A><br><img src=/skin/newasp/download.gif> <A href='/soft/download.asp?softid=23411&downid=0&id=67531'><strong>本站下载地址</strong></A>
            foundEbookAddress = re.search("thunderResTitle='(?P<ebookAddress>[^']+)'", fileRespHtml);
            logging.debug("foundEbookAddress=%s", foundEbookAddress);
            if(foundEbookAddress):
                ebookAddress = foundEbookAddress.group("ebookAddress");
                logging.info("ebookAddress=%s", ebookAddress);

                if(re.match("/soft/download\.asp\?", ebookAddress)):
                    #find out real ebook address
                    #->
                    #it allow download
                    #actually it will auto direct to:
                    # the original assignment line was lost when links were stripped; assumed reconstruction:
                    # prepend the site domain to the relative download address
                    downloadAddress = "http://www.qisuu.com" + ebookAddress;
                    logging.info("downloadAddress=%s", downloadAddress);
                    fixedEbookAddress = downloadAddress;
                    logging.info("Found partial ebook address, so fix it to: %s", fixedEbookAddress);
                elif(re.match("http://", ebookAddress)): # assumed condition (original line lost): already a full download url
                    fixedEbookAddress = ebookAddress;
                else:
                    logging.error("Can Not recognize this kind of ebook download address %s", ebookAddress);
                    logging.debug("fileRespHtml=%s", fileRespHtml);
                    continue;

                #for
                #title is: 《神魔手下好当差/穿越之傀儡娃娃》全集
                ebookFullName = os.path.join(gConst['downloadFolder'], crifanLib.removeInvalidCharInFilename(ebooName, '_'));
                logging.info("dowloadinging ebookFullName=%s", ebookFullName);
                crifanLib.downloadFile(fixedEbookAddress, ebookFullName, True);
                #crifanLib.downloadFile(fixedEbookAddress, ebookFullName);
            else:
                logging.warning("Not found ebook address for url=%s", eachFileUrl);
                logging.debug("record its fileRespHtml=\n%s", fileRespHtml);
                if(foundEbookAddress == None):
                    logging.info(u"this url=%s may be: 此电子书已删除,暂不提供下载", eachFileUrl);

###############################################################################
if __name__ == "__main__":
    scriptSelfName = crifanLib.extractFilename(sys.argv[0]);

    logging.basicConfig(
        level    = logging.DEBUG,
        format   = 'LINE %(lineno)-4d %(levelname)-8s %(message)s',
        datefmt  = '%m-%d %H:%M',
        filename = scriptSelfName + ".log",
        filemode = 'w');
    # define a Handler which writes INFO messages or higher to the sys.stderr
    console = logging.StreamHandler();
    console.setLevel(logging.INFO);
    # set a format which is simpler for console use
    formatter = logging.Formatter('LINE %(lineno)-4d : %(levelname)-8s %(message)s');
    # tell the handler to use this format
    console.setFormatter(formatter);
    logging.getLogger('').addHandler(console);

    try:
        main();
    except:
        logging.exception("Unknown Error !");
        raise;
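The script leans on two regular expressions: one reads the total page count out of the 尾页 ("last page") link on a list page, the other pulls the download address out of the thunderResTitle attribute on a book's detail page, fixing it up when the address is site-relative. Below is a small standalone sketch that runs both patterns against the HTML fragments quoted in the comments above; the fragments and the regexes come straight from the script, while the domain-prepending fix-up and the demo scaffolding are assumptions added purely for illustration.

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Standalone sketch: exercise the two key regexes from the script above
# against the sample HTML fragments quoted in its comments.
import re;

# list page fragment: the "尾页" (last page) link carries the total page count
listPageFragment = u'<td class="tablebody1"> <a href="list39_72.html" title="尾页"><img border="0" src="/images/Last.gif" /></a> </td>';
foundTotalPageNum = re.search(u'<a\s+href="list\d+_(?P<totalPageNum>\d+).html"\s+title="尾页">', listPageFragment);
print(foundTotalPageNum.group("totalPageNum"));  # prints: 72

# detail page fragments: thunderResTitle holds either a full or a site-relative download address
detailFragments = [
    "thunderResTitle='http://dzs.qisuu.com/2013012903.rar'",
    "thunderResTitle='/soft/download.asp?softid=23411&downid=0&id=67531'",
];
for eachFragment in detailFragments:
    ebookAddress = re.search("thunderResTitle='(?P<ebookAddress>[^']+)'", eachFragment).group("ebookAddress");
    if(re.match("/soft/download\.asp\?", ebookAddress)):
        # relative address: prepend the site domain (assumed fix-up, mirroring the script's handling)
        ebookAddress = "http://www.qisuu.com" + ebookAddress;
    print(ebookAddress);

Because thunderResTitle='(?P<ebookAddress>[^']+)' captures whatever sits between the quotes, it covers both the direct dzs.qisuu.com .rar links and the indirect /soft/download.asp?... ones, which is exactly why the two earlier, stricter regexes are left commented out in the script.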
For the related libraries, refer to the two below; a rough stand-in sketch for the crifanLib helpers follows the list:
- crifanLib
- BeautifulSoup
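crifanLib is the author's own helper library; the script uses its getUrlRespHtml, downloadFile, removeInvalidCharInFilename and extractFilename functions, plus the old BeautifulSoup 3 API. For readers who just want to follow the logic without that library, here is a rough, simplified stand-in for those four helpers using only the Python 2 standard library. It is a sketch built on assumptions, not the real crifanLib code: the real library also handles cookies, retries, download progress reporting and more, and the name of downloadFile's third parameter below is guessed from the script's positional call downloadFile(url, name, True).

#!/usr/bin/python
# -*- coding: utf-8 -*-
# Rough stand-ins for the crifanLib helpers used by the script above.
# Simplified approximations only; NOT the real crifanLib implementations.
import os;
import re;
import urllib2;

def getUrlRespHtml(url):
    """Fetch url and return the raw response body (no cookie/retry handling here)."""
    request = urllib2.Request(url, headers={"User-Agent": "Mozilla/5.0"});
    return urllib2.urlopen(request).read();

def downloadFile(fileUrl, fileToSave, needReport=False):
    """Download fileUrl and write it to fileToSave; needReport is accepted only
    to match the script's 3-argument call and is ignored in this sketch."""
    data = getUrlRespHtml(fileUrl);
    with open(fileToSave, "wb") as f:
        f.write(data);

def removeInvalidCharInFilename(filename, replacedChar="_"):
    """Replace characters not allowed in file names, eg the '/' in
    《神魔手下好当差/穿越之傀儡娃娃》全集."""
    return re.sub(r'[\\/:*?"<>|]', replacedChar, filename);

def extractFilename(path):
    """Return just the file name part of a path, eg libs/foo.py -> foo.py."""
    return os.path.basename(path);

With something like this saved as crifanLib.py next to the script (or in its libs folder), plus the old BeautifulSoup 3 module, the script should run under Python 2 roughly as the eg: lines in its docstring show, e.g. download_qisuu_ebook.py -s http://www.qisuu.com/soft/sort02/list2_1.html -d WuXiaXianXia.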
Please credit the source when reposting: 在路上 » [Record] Wrote a Python script to download ebooks from the qisuu website