现在去中断2.2T的处理:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 | 20181129 09 : 56 : 09 881 INFO [ 8 15 : 11 : 27.562 ] Processing shows of user: 35428129 20181129 09 : 56 : 10 566 INFO [ 8 15 : 11 : 28.361 ] N [ 32581 / 182046 / 214627 ][ show_157976292_video.mp4] - > False , Can not find video file : / Volumes / 内容备份 2 / Backup / 20180601 / Offline / xxx / user / 35428129 / show / 157976292 / show_157976292_video.mp4 20181129 09 : 56 : 18 566 INFO [ 8 15 : 11 : 36.474 ] N [ 32581 / 182047 / 214628 ][ show_158073728_video.mp4] - > False , Can not find course subtitle file : / Volumes / 内容备份 2 / Backup / 20180601 / Offline / xxx / course / 55378 / course_55378_subtitle.srt 20181129 09 : 56 : 27 566 INFO [ 8 15 : 11 : 45.323 ] N [ 32581 / 182048 / 214629 ][ show_158215119_video.mp4] - > False , Can not find course subtitle file : / Volumes / 内容备份 2 / Backup / 20180601 / Offline / xxx / course / 61084 / course_61084_subtitle.srt ^CTraceback (most recent call last): File "/Users/mac/.vscode/extensions/ms-python.python-2018.10.1/pythonFiles/experimental/ptvsd_launcher.py" , line 38 , in <module> main(sys.argv) File "/Users/mac/.vscode/extensions/ms-python.python-2018.10.1/pythonFiles/experimental/ptvsd/ptvsd/__main__.py" , line 265 , in main wait = args.wait) File "/Users/mac/.vscode/extensions/ms-python.python-2018.10.1/pythonFiles/experimental/ptvsd/ptvsd/__main__.py" , line 256 , in handle_args run_main(addr, name, kind, * extra, * * kwargs) File "/Users/mac/.vscode/extensions/ms-python.python-2018.10.1/pythonFiles/experimental/ptvsd/ptvsd/_local.py" , line 52 , in run_main runner(addr, name, kind = = 'module' , * extra, * * kwargs) File "/Users/mac/.vscode/extensions/ms-python.python-2018.10.1/pythonFiles/experimental/ptvsd/ptvsd/runner.py" , line 32 , in run set_trace = False ) File "/Users/mac/.vscode/extensions/ms-python.python-2018.10.1/pythonFiles/experimental/ptvsd/ptvsd/_vendored/pydevd/pydevd.py" , 
line 1283 , in run return self ._exec(is_module, entry_point_fn, module_name, file , globals , locals ) File "/Users/mac/.vscode/extensions/ms-python.python-2018.10.1/pythonFiles/experimental/ptvsd/ptvsd/_vendored/pydevd/pydevd.py" , line 1290 , in _exec pydev_imports. execfile ( file , globals , locals ) # execute the script File "/Users/mac/.vscode/extensions/ms-python.python-2018.10.1/pythonFiles/experimental/ptvsd/ptvsd/_vendored/pydevd/_pydev_imps/_pydev_execfile.py" , line 25 , in execfile exec ( compile (contents + "\n" , file , 'exec' ), glob, loc) File "/Users/mac/working/dev_root/xxx/projects/crawler_projects/crawler_xxx_app/processVideo/ProcessxxxVideo.py" , line 892 , in <module> processVideo() File "/Users/mac/working/dev_root/xxx/projects/crawler_projects/crawler_xxx_app/processVideo/ProcessxxxVideo.py" , line 882 , in processVideo processUserShows(userId) File "/Users/mac/working/dev_root/xxx/projects/crawler_projects/crawler_xxx_app/processVideo/ProcessxxxVideo.py" , line 870 , in processUserShows processSingleShow(userId, showId) File "/Users/mac/working/dev_root/xxx/projects/crawler_projects/crawler_xxx_app/processVideo/ProcessxxxVideo.py" , line 719 , in processSingleShow showFailLogAndSaveFailResult(showVideoFilename, errMsg, userId, showId, courseId) File "/Users/mac/working/dev_root/xxx/projects/crawler_projects/crawler_xxx_app/processVideo/ProcessxxxVideo.py" , line 569 , in showFailLogAndSaveFailResult saveErrorResult(errInfoDict) File "/Users/mac/working/dev_root/xxx/projects/crawler_projects/crawler_xxx_app/processVideo/ProcessxxxVideo.py" , line 544 , in saveErrorResult saveJsonToFile(ResultFileFail, resultDict) File "/Users/mac/working/dev_root/xxx/projects/crawler_projects/crawler_xxx_app/processVideo/ProcessxxxVideo.py" , line 161 , in saveJsonToFile json.dump(jsonValue, jsonFp, indent = 2 , ensure_ascii = False ) File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/__init__.py" , line 180 , in dump 
fp.write(chunk) File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/codecs.py" , line 721 , in write return self .writer.write(data) KeyboardInterrupt ^C macdeMacBook - Pro:crawler_projects mac$ |
然后可以看到:
- 已处理:214629≈21万
- 成功:32581≈3.2万
现在去拷贝已处理的数据到单独的硬盘中
继续用之前的代码,修改后:
"""Copy the successfully processed shows (and their courses) to another disk.

Reads the process-result JSON file, and for every show whose processing
succeeded at least once, copies the show folder and its course folder from
the source root folder to the destination root folder.
"""

import shutil
import os
import codecs
import json

SrcRootFolder = "/Volumes/内容备份2/Backup/20180601/Offline/xxx/"
DestRootFolder = "/Volumes/外借5/xxx_2.2T_processed/"
ProcessResultJsonFile = "/Users/mac/working/dev_root/xxx/projects/crawler_projects/crawler_xxx_app/debug/processResult_2T/processResult_processed.json"

# Maximum number of shows to copy; set to a positive value (e.g. 200) to copy
# only a sample.
MaxSampleNum = 0  # not limit -> copy all


def generateUserFoler(rootFolder, userId):
    """Return the path <rootFolder>/user/<userId>."""
    return os.path.join(rootFolder, "user", str(userId))


def generateUserShowFoler(rootFolder, userId, showId):
    """Return the path <rootFolder>/user/<userId>/show/<showId>."""
    return os.path.join(rootFolder, "user", str(userId), "show", str(showId))


def generateCourseFoler(rootFolder, courseId):
    """Return the path <rootFolder>/course/<courseId>."""
    return os.path.join(rootFolder, "course", str(courseId))


def loadJsonFromFile(fullFilename):
    """Load and parse the UTF-8 encoded JSON file, returning the parsed value."""
    with codecs.open(fullFilename, 'r', encoding="utf-8") as jsonFp:
        jsonDict = json.load(jsonFp)
    print("Complete load json from %s" % fullFilename)
    return jsonDict


def copyFolder(sourceFolerPath, destFolderPath):
    """Copy a whole folder tree, replacing the destination if it exists.

    Returns the value returned by shutil.copytree (the destination path).
    Raises whatever shutil.rmtree / shutil.copytree raise, e.g. when the
    source folder does not exist.
    """
    print("copyFolder: sourceFolerPath=%s, destFolderPath=%s" % (sourceFolerPath, destFolderPath))
    # copytree refuses to write into an existing folder, so remove it first
    if os.path.exists(destFolderPath):
        shutil.rmtree(destFolderPath)
        print("Deleted existing destination folder: %s" % destFolderPath)
    copyResult = shutil.copytree(sourceFolerPath, destFolderPath)
    print("copyResult=%s" % copyResult)
    return copyResult


def copyProcessedShows(processedShowsDict, srcRootFolder, destRootFolder, maxSampleNum=0):
    """Copy every successfully processed show, plus its course, to destRootFolder.

    processedShowsDict maps a show id to a dict providing "course_id", "uid"
    and "result" -> "success" (number of successful processings).
    Shows whose success count is <= 0 are skipped and not counted.
    maxSampleNum > 0 limits the number of copied shows to exactly that many;
    maxSampleNum <= 0 means no limit.

    Returns the number of shows copied.
    """
    copiedNum = 0
    # Several shows can share one course: copy each course folder only once
    # per run instead of deleting and re-copying it for every show.
    copiedCourseIds = set()

    for eachShowId, processedShowInfo in processedShowsDict.items():
        # '>=' so that exactly maxSampleNum samples are copied, not one more
        if maxSampleNum > 0 and copiedNum >= maxSampleNum:
            break

        print("---------- [%d] eachShowId=%s" % (copiedNum, eachShowId))
        courseId = processedShowInfo["course_id"]
        userId = processedShowInfo["uid"]
        processSuccessedNum = processedShowInfo["result"]["success"]
        if processSuccessedNum <= 0:
            continue

        srcShowFolder = generateUserShowFoler(srcRootFolder, userId, eachShowId)
        destShowFolder = generateUserShowFoler(destRootFolder, userId, eachShowId)
        copyFolder(srcShowFolder, destShowFolder)

        if courseId not in copiedCourseIds:
            srcCourseFolder = generateCourseFoler(srcRootFolder, courseId)
            destCourseFolder = generateCourseFoler(destRootFolder, courseId)
            copyFolder(srcCourseFolder, destCourseFolder)
            copiedCourseIds.add(courseId)

        copiedNum += 1

    return copiedNum


if __name__ == "__main__":
    processResultJson = loadJsonFromFile(ProcessResultJsonFile)
    processedShowsDict = processResultJson["shows"]
    copiedNum = copyProcessedShows(processedShowsDict, SrcRootFolder, DestRootFolder, MaxSampleNum)
    print("+++++++++ Completed copy %d sample data" % copiedNum)
去运行:

拷贝速度还是很快的,一会就拷贝了几千个了。
转载请注明:在路上 » 【记录】中断2.2T趣配音数据处理并拷贝出已处理数据