【已解决】写脚本下载crifan的所有电子书

应朋友要求，希望把我crifan的所有电子书打包发给他供参考。
本来打算手动下载的，突然发现有100+本，手动下载效率较低。
所以考虑用Python脚本去实现。
输入文件用之前的：
crifan/crifan_ebook_readme: Crifan的电子书的使用说明
去写代码
思路是：
把docbook和Gitbook的所有的book都提取出来
然后生成对应的pdf文件（可设置其他不同格式）的url连接
然后再用Python的requests去下载
【总结】
弄了一个晚上，最后优化是：
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
# Function: download all crifan ebook files
# Author: Crifan Li
# Update: 20201212
# Latest: https://github.com/crifan/crifan_ebook_readme/blob/master/downloadAllBooks.py
 
 
import os
import codecs
import re
import requests
import time
 
 
################################################################################
# Config
################################################################################
 
 
OutputFolder = os.path.join("downloaded", "pdf")
# 'downloaded/pdf'
 
 
RequestsProxies = {
    "http"  : "http://127.0.0.1:58591",
    "https" : "http://127.0.0.1:58591",
}
################################################################################
# Util Functions
################################################################################
 
 
def createFolder(folderFullPath):
    """
        create folder, even if already existed
        Note: for Python 3.2+
    """
    os.makedirs(folderFullPath, exist_ok=True)
 
 
def loadTextFromFile(fullFilename, fileEncoding="utf-8"):
    """load file text content from file"""
    with codecs.open(fullFilename, 'r', encoding=fileEncoding) as fp:
        allText = fp.read()
        # logging.debug("Complete load text from %s", fullFilename)
        return allText
 
 
def isFileExistAndValid(filePath, fullFileSize=None):
    """Check file exist and valid or not
 
 
    Args:
        filePath (str): file path
        fullFileSize (int): full file size
    Returns:
        existed and valid (bool)
    Raises:
    Examples:
    """
    isExistFile = os.path.isfile(filePath)
    isValidFile = False
    if isExistFile:
        curFileSize = os.path.getsize(filePath) # 260900226
        if fullFileSize:
            isValidFile = curFileSize == fullFileSize
        else:
            isValidFile = curFileSize > 0
    isExistAndValid = isExistFile and isValidFile
    return isExistAndValid
 
 
def floatSecondsToDatetimeDict(floatSeconds):
    """
        convert float seconds(time delta) to datetime dict{days, hours, minutes, seconds, millseconds, microseconds}
 
 
        example: 96400.3765293 -> {'days': 1, 'hours': 2, 'minutes': 46, 'seconds': 40, 'millseconds': 376, 'microseconds': 529}
    """
    secondsInt = int(floatSeconds)
    decimalsFloat = floatSeconds - secondsInt
    millisecondsFloat = decimalsFloat * 1000
    millisecondsInt = int(millisecondsFloat)
    microsecondsDecimal = millisecondsFloat - millisecondsInt
    microsecondsInt = int(microsecondsDecimal * 1000)
 
 
    minutes, seconds = divmod(secondsInt, 60)
    hours, minutes = divmod(minutes, 60)
    days, hours = divmod(hours, 24)
 
 
    convertedDict = {
        "days": days,
        "hours": hours,
        "minutes": minutes,
        "seconds": seconds,
        "millseconds": millisecondsInt,
        "microseconds": microsecondsInt,
    }
 
 
    return convertedDict
 
 
def datetimeDictToStr(datetimeDict,
        seperatorD=" ",
        seperatorHms=":",
        seperatorMilliS=".",
        isShowZeroDayStr=False,
        isShowMilliSecPart=True,
    ):
    """Convert date time dict into date time string
     
    Args:
        datetimeDict (dict): date time dict
        seperatorD (str): day seperator
        seperatorHms (str): hour/minute/second seperator
        seperatorMilliS (str): milli seconds seperator
        isShowZeroDayStr (bool): whether show days string when days=0
        isShowMilliSecPart (bool): whether show milli seconds part
    Returns:
        str
    Raises:
    Examples:
        input: 
            {'days': 0, 'hours': 0, 'microseconds': 986, 'millseconds': 804, 'minutes': 3, 'seconds': 38}
            {'hours': 0, minutes': 3, 'seconds': 38}
        output:
            '0 00:03:38.804'
            '00:03:38'
    """
    dayStr = ""
    hasDays = "days" in datetimeDict
    if hasDays:
        days = datetimeDict["days"]
        if (not isShowZeroDayStr) and (days == 0):
            dayStr = ""
        else:
            dayStr = "%d%s" % (days, seperatorD)
     
    hmsStr = "%02d%s%02d%s%02d" % (datetimeDict["hours"], seperatorHms, datetimeDict["minutes"], seperatorHms, datetimeDict["seconds"]) # '00:03:12'
 
 
    milliSecStr = ""
    hasMilliSec = "millseconds" in datetimeDict
    if hasMilliSec:
        if isShowMilliSecPart:
            milliSecStr = "%s%03d" % (seperatorMilliS, datetimeDict["millseconds"])
 
 
    formattedStr = "%s%s%s" % (dayStr, hmsStr, milliSecStr) # '00:03:12'
    return formattedStr
 
 
def formatSize(sizeInBytes, decimalNum=1, isUnitWithI=False, sizeUnitSeperator=""):
    """
    format size to human readable string
 
 
    example:
        3746 -> 3.7KB
        87533 -> 85.5KiB
        98654 -> 96.3 KB
        352 -> 352.0B
        76383285 -> 72.84MB
        763832854988542 -> 694.70TB
        763832854988542665 -> 678.4199PB
 
 
    refer:
        https://stackoverflow.com/questions/1094841/reusable-library-to-get-human-readable-version-of-file-size
    """
    # https://en.wikipedia.org/wiki/Binary_prefix#Specific_units_of_IEC_60027-2_A.2_and_ISO.2FIEC_80000
    # K=kilo, M=mega, G=giga, T=tera, P=peta, E=exa, Z=zetta, Y=yotta
    sizeUnitList = ['','K','M','G','T','P','E','Z']
    largestUnit = 'Y'
 
 
    if isUnitWithI:
        sizeUnitListWithI = []
 
 
        for curIdx, eachUnit in enumerate(sizeUnitList):
            unitWithI = eachUnit
            if curIdx >= 1:
                unitWithI += 'i'
 
 
            sizeUnitListWithI.append(unitWithI)
 
 
        # sizeUnitListWithI = ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']
        sizeUnitList = sizeUnitListWithI
 
 
        largestUnit += 'i'
 
 
    suffix = "B"
    decimalFormat = "." + str(decimalNum) + "f" # ".1f"
    finalFormat = "%" + decimalFormat + sizeUnitSeperator + "%s%s" # "%.1f%s%s"
    sizeNum = sizeInBytes
    for sizeUnit in sizeUnitList:
        if abs(sizeNum) < 1024.0:
            return finalFormat % (sizeNum, sizeUnit, suffix)
        sizeNum /= 1024.0
    return finalFormat % (sizeNum, largestUnit, suffix)
 
 
def getFileSizeFromUrl(fileUrl, proxies=None):
    """Get file size from file url
 
 
    Args:
        fileUrl (str): file url
        proxies (dict): requests proxies
    Returns:
        file size or 0 mean fail to get
    Raises:
    Examples:
        input: https://gameapktxdl.vivo.com.cn/appstore/developer/soft/20201020/202010201805243ed5v.apk
        output: 154551625
    """
    totalFileSize = None
 
 
    try:
        resp = requests.get(fileUrl, stream=True, proxies=proxies)
        respHeaders = resp.headers
        # {'Date': 'Thu, 10 Dec 2020 05:27:10 GMT', 'Content-Type': 'application/vnd.android.package-archive', 'Content-Length': '154551625', 'Connection': 'keep-alive', 'Server': 'NWS_TCloud_static_msoc1_xz', 'Cache-Control': 'max-age=600', 'Expires': 'Thu, 10 Dec 2020 05:37:09 GMT', 'Last-Modified': 'Thu, 09 Jan 2020 11:21:35 GMT', 'X-NWS-UUID-VERIFY': '94db2d14f135898d924fb249b13a0964', 'X-Verify-Code': '2871bd7acf67c7e298e9c8d8c865e27d', 'X-NWS-LOG-UUID': 'a83536f2-ab83-465d-ba09-0e19a15cc706', 'X-Cache-Lookup': 'Hit From Disktank3, Hit From Inner Cluster', 'Accept-Ranges': 'bytes', 'ETag': '"46C50A5CADB6BEE339236477BB6DDC14"', 'X-Daa-Tunnel': 'hop_count=2'}
        # {'Server': 'Tengine', 'Date': 'Fri, 11 Dec 2020 14:11:00 GMT', 'Content-Type': 'application/pdf', 'Content-Length': '24422168', 'Last-Modified': 'Fri, 18 Sep 2020 09:56:15 GMT', 'Connection': 'keep-alive', 'ETag': '"5f64843f-174a718"', 'Strict-Transport-Security': 'max-age=15768000', 'Accept-Ranges': 'bytes'}
        contentLengthStr = respHeaders['Content-Length'] # '154551625', '24422168'
        contentLengthInt = int(contentLengthStr) # 154551625, 24422168
        totalFileSize = contentLengthInt
    except:
        totalFileSize = None
 
 
    return totalFileSize # 154551625
 
 
def streamingDownloadFile(
        url,
        fileToSave=None,
        proxies=None,
        isShowSpeed=True,
        chunkSize=1024*512,
        resumeSize=0,
        totalSize=0,
    ):
    """Download file using stream mode, support showing process with current speed, percent, size
 
 
    Args:
        url (str): file online url
        fileToSave (str): filename or full file path
        proxies (dict): requests proxies
        isShowSpeed (bool): show downloading speed or not
        chunkSize (int): when showing download speed, need use stream downloading, need set chunck size
        resumeSize (bool): the size to start download, normally is the local already downloaded size
        totalSize (int): total file size, only used for calculate downloaded percent
    Returns:
        download ok or not (bool)
    Raises:
    Examples:
    """
    isDownloadOk = False
 
 
    if isShowSpeed:
        if totalSize == 0:
            gotTotalSize = getFileSizeFromUrl(url, proxies) # 154551625
            if gotTotalSize:
                totalSize = gotTotalSize
 
 
    headers = {
        'Range': 'bytes=%d-' % resumeSize,
        # "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36",
    }
    resp = requests.get(url, proxies=proxies, headers=headers, stream=True)
    curDownloadedSize = 0
    with open(fileToSave, "ab") as f:
        startTime = time.time()
        prevTime = startTime
        for curChunkBytes in resp.iter_content(chunk_size=chunkSize):
            if curChunkBytes:
                curTime = time.time() # 1606456020.0718982
                f.write(curChunkBytes)
                f.flush()
 
 
                curChunkSize = len(curChunkBytes) # 524288
                curDownloadedSize += curChunkSize # 524288
                totalDownloadedSize = curDownloadedSize + resumeSize # 12058624
                totalDownloadedSizeStr = formatSize(totalDownloadedSize) # '11.5MB'
 
 
                curDownloadTime = curTime - prevTime # 15.63818907737732
                curSpeed = curChunkSize / curDownloadTime # 670522.651191692
                curSpeedStr = formatSize(curSpeed) # '231.3KB'
 
 
                totalDownloadTime = curTime - startTime # 15.63818907737732
                averageSpeed = curDownloadedSize / totalDownloadTime # 670522.651191692
                averageSpeedStr = formatSize(averageSpeed) # '231.3KB'
 
 
                totalDownloadTimeDict = floatSecondsToDatetimeDict(totalDownloadTime)
                totalDownloadTimeStr = datetimeDictToStr(totalDownloadTimeDict, isShowMilliSecPart=False)
 
 
                if isShowSpeed:
                    showStr = "downloading speed: cur=%s/s, avg=%s/s, time: total=%s, size: %s" % (curSpeedStr, averageSpeedStr, totalDownloadTimeStr, totalDownloadedSizeStr)
 
 
                    if totalSize > 0:
                        downloadedPercent100 = round(100 * totalDownloadedSize / totalSize, 2) # 47.23
                        downloadedPercent100Str = str(downloadedPercent100) # '47.23'
                        percentStr = ", percent: %s%%" % downloadedPercent100Str # ', percent: 47.23%'
                    else:
                        percentStr = ""
 
 
                    showStr += percentStr
                    # 'downloading speed: cur=231.3KB/s, avg=231.3KB/s, time: total=00:00:02, size: 11.5MB, percent: 49.38%'
                    print(showStr)
 
 
                prevTime = curTime
 
 
    return isDownloadOk
 
 
def downloadFile(url,
        fileToSave=None,
        proxies=None,
        isStreamMode=True,
        isResume=True,
    ):
    """Download file from url then save to file
 
 
    Args:
        url (str): file online url
        fileToSave (str): filename or full file path
        proxies (dict): requests proxies
        isStreamMode (bool): use stream mode or not
    Returns:
        download ok or not (bool)
    Raises:
    Examples:
        input: 
            'https://book.crifan.com/books/5g_message_rcs_tech_summary/pdf/5g_message_rcs_tech_summary.pdf'
            'downloaded/pdf/5g_message_rcs_tech_summary.pdf'
        output:
            True
    """
    isDownloadOk = False
 
 
    if not fileToSave:
        urlPartList = url.split("/")
        fileToSave = urlPartList[-1]
 
 
    try:
        if isStreamMode:
            totalFileSize = getFileSizeFromUrl(url, proxies) # 154551625
            if not totalFileSize:
                print("Failed to get total file size from %s" % url)
                return isDownloadOk
 
 
            totalSizeStr = formatSize(totalFileSize)
            print("Get total file size %s from %s" % (totalSizeStr, url))
 
 
            isDownloadedAndValid = isFileExistAndValid(fileToSave, fullFileSize=totalFileSize)
            if isDownloadedAndValid:
                print("%s is already download" % fileToSave)
                isDownloadOk = True
                return isDownloadOk
 
 
            curDownloadedSize = 0
            isExistFile = os.path.isfile(fileToSave)
            if isExistFile:
              curDownloadedSize = os.path.getsize(fileToSave)
              curDownloadedSizeStr = formatSize(curDownloadedSize)
              print("Already downloaded %s for %s" % (curDownloadedSizeStr, fileToSave))
 
 
              if curDownloadedSize > totalFileSize:
                  # possible is local is new version, so consider as downloaded
                  print("Downloaded=%s > online=%s, consider as downloaded" % (curDownloadedSizeStr, totalSizeStr))
                  isDownloadOk = True
                  return isDownloadOk
 
 
            if not isResume:
                curDownloadedSize = 0
 
 
            isDownloadOk = streamingDownloadFile(
              url,
              fileToSave=fileToSave,
              proxies=proxies,
              isShowSpeed=True,
              resumeSize=curDownloadedSize,
              totalSize=totalFileSize,
            )
        else:
            resp = requests.get(url, proxies=proxies)
            with open(fileToSave, 'wb') as saveFp:
                saveFp.write(resp.content)
                isDownloadOk = True
    except BaseException as curException:
        print("Exception %s when download %s to %s" % (curException, url, fileToSave))
 
 
    return isDownloadOk
 
 
################################################################################
# Const & Config & Settings
################################################################################
 
 
InputMdFile = "README.md"
 
 
################################################################################
# Main
################################################################################
 
 
mdStr = loadTextFromFile(InputMdFile)
# print("mdStr=%s" % mdStr)
 
 
allPdfUrlList = []
 
 
# * [Notepad++](https://www.crifan.com/files/doc/docbook/rec_soft_npp/release/html/rec_soft_npp.html)
# * [硬件电路基础知识](https://www.crifan.com/files/doc/docbook/hardware_basic/release/html/hardware_basic.html)
# allDocbookIter = re.finditer("https?://www\.crifan\.com/files/doc/docbook/(?P<bookName>\w+)/release/", mdStr)
# <callable_iterator object at 0x1094acfd0>
# allDocbookList = list(allDocbookIter)
# [<re.Match object; sp...soft_dev_>, <re.Match object; sp...rec_soft_>, <re.Match object; sp...programmi>, <re.Match object; sp...language_>, <re.Match object; sp...json_tuto>, <re.Match object; sp...char_enco>, <re.Match object; sp...char_enco>, <re.Match object; sp...python_to>, <re.Match object; sp...regular_e>, <re.Match object; sp...python_to>, <re.Match object; sp...csharp_su>, <re.Match object; sp...build_web>, <re.Match object; sp...website_t>, <re.Match object; sp...web_scrap>, ...]
 
 
# allDocbookList = re.findall("https?://www\.crifan\.com/files/doc/docbook/\w+/release/", mdStr)
allDocbookNameList = re.findall("https?://www\.crifan\.com/files/doc/docbook/(\w+)/release/", mdStr)
# ['char_encoding_usage', 'crifanlib_python', ...]
docbookTotalNum = len(allDocbookNameList)
print("docbookTotalNum=%s" % docbookTotalNum) # 113
uniqueDocbookNameSet = set(allDocbookNameList)
uniqueDocbookNameList = list(uniqueDocbookNameSet)
uniqueDocbookTotalNum = len(uniqueDocbookNameList)
print("uniqueDocbookTotalNum=%s" % uniqueDocbookTotalNum) # 56
 
 
for curIdx, eachDocbookName in enumerate(uniqueDocbookNameList):
  curNum = curIdx + 1
  curDocbookPdfUrl = "https://www.crifan.com/files/doc/docbook/%s/release/pdf/%s.pdf" % (eachDocbookName, eachDocbookName)
  # print("[%d] curDocbookPdfUrl=%s" % (curNum, curDocbookPdfUrl))
  # https://www.crifan.com/files/doc/docbook/rec_soft_npp/release/pdf/rec_soft_npp.pdf
  # https://www.crifan.com/files/doc/docbook/soft_dev_basic/release/pdf/soft_dev_basic.pdf
  allPdfUrlList.append(curDocbookPdfUrl)
 
 
# * [风格](https://book.crifan.com/books/program_code_style/website)
# * [VSCode](http://book.crifan.com/books/best_editor_vscode/website)
allGitbookNameList = re.findall("https?://book\.crifan\.com/books/(\w+)/website", mdStr)
gitbooNameTotalNum = len(allGitbookNameList)
print("gitbooNameTotalNum=%s" % gitbooNameTotalNum) # 139
uniqueGitbookNameSet = set(allGitbookNameList)
uniqueGitbookNameList = list(uniqueGitbookNameSet)
uniqueGitbookNameTotalNum = len(uniqueGitbookNameList)
print("uniqueGitbookNameTotalNum=%s" % uniqueGitbookNameTotalNum) # 71
 
 
for curIdx, eachGitbookName in enumerate(uniqueGitbookNameList):
  curNum = curIdx + 1
  curGitbookPdfUrl = "https://book.crifan.com/books/%s/pdf/%s.pdf" % (eachGitbookName, eachGitbookName)
  # print("[%d] curGitbookPdfUrl=%s" % (curNum, curGitbookPdfUrl))
  # https://book.crifan.com/books/best_editor_vscode/pdf/best_editor_vscode.pdf
  # https://book.crifan.com/books/scientific_network_summary/pdf/scientific_network_summary.pdf
  allPdfUrlList.append(curGitbookPdfUrl)
 
 
allPdfUrlList.sort()
 
 
pdfUrlTotalNum = len(allPdfUrlList)
print("pdfUrlTotalNum=%s" % pdfUrlTotalNum) # 127
 
 
# download all pdf url
for curUrlIdx, eachPdfUrl in enumerate(allPdfUrlList):
  curUrlNum = curUrlIdx + 1
  # 'https://book.crifan.com/books/5g_message_rcs_tech_summary/pdf/5g_message_rcs_tech_summary.pdf'
  pdfFilename = eachPdfUrl.split("/")[-1]
  # '5g_message_rcs_tech_summary.pdf'
  print("%s [%2d/%2d] %s %s" % ("-"*30, curUrlNum, pdfUrlTotalNum, pdfFilename ,"-"*30))
  print("%s" % eachPdfUrl)
  downloadedFilePath = os.path.join(OutputFolder, pdfFilename)
  isDownloadOk = downloadFile(eachPdfUrl, downloadedFilePath, proxies=RequestsProxies)
  print("%s to downloaded %s from %s" % (isDownloadOk, pdfFilename, eachPdfUrl))
【已解决】写脚本下载crifan的所有电子书

与本文相关的文章

Hi，您需要填写昵称和邮箱！

与本文相关的文章

Hi，您需要填写昵称和邮箱！

订阅在路上