现在去中断2.2T的处理:
20181129 09:56:09 881 INFO [8 15:11:27.562] Processing shows of user: 35428129 20181129 09:56:10 566 INFO [8 15:11:28.361] N [ 32581/ 182046/ 214627][ show_157976292_video.mp4] -> False, Can not find video file: /Volumes/内容备份2/Backup/20180601/Offline/xxx/user/35428129/show/157976292/show_157976292_video.mp4 20181129 09:56:18 566 INFO [8 15:11:36.474] N [ 32581/ 182047/ 214628][ show_158073728_video.mp4] -> False, Can not find course subtitle file: /Volumes/内容备份2/Backup/20180601/Offline/xxx/course/55378/course_55378_subtitle.srt 20181129 09:56:27 566 INFO [8 15:11:45.323] N [ 32581/ 182048/ 214629][ show_158215119_video.mp4] -> False, Can not find course subtitle file: /Volumes/内容备份2/Backup/20180601/Offline/xxx/course/61084/course_61084_subtitle.srt ^CTraceback (most recent call last): File "/Users/mac/.vscode/extensions/ms-python.python-2018.10.1/pythonFiles/experimental/ptvsd_launcher.py", line 38, in <module> main(sys.argv) File "/Users/mac/.vscode/extensions/ms-python.python-2018.10.1/pythonFiles/experimental/ptvsd/ptvsd/__main__.py", line 265, in main wait=args.wait) File "/Users/mac/.vscode/extensions/ms-python.python-2018.10.1/pythonFiles/experimental/ptvsd/ptvsd/__main__.py", line 256, in handle_args run_main(addr, name, kind, *extra, **kwargs) File "/Users/mac/.vscode/extensions/ms-python.python-2018.10.1/pythonFiles/experimental/ptvsd/ptvsd/_local.py", line 52, in run_main runner(addr, name, kind == 'module', *extra, **kwargs) File "/Users/mac/.vscode/extensions/ms-python.python-2018.10.1/pythonFiles/experimental/ptvsd/ptvsd/runner.py", line 32, in run set_trace=False) File "/Users/mac/.vscode/extensions/ms-python.python-2018.10.1/pythonFiles/experimental/ptvsd/ptvsd/_vendored/pydevd/pydevd.py", line 1283, in run return self._exec(is_module, entry_point_fn, module_name, file, globals, locals) File "/Users/mac/.vscode/extensions/ms-python.python-2018.10.1/pythonFiles/experimental/ptvsd/ptvsd/_vendored/pydevd/pydevd.py", line 1290, in _exec 
pydev_imports.execfile(file, globals, locals) # execute the script File "/Users/mac/.vscode/extensions/ms-python.python-2018.10.1/pythonFiles/experimental/ptvsd/ptvsd/_vendored/pydevd/_pydev_imps/_pydev_execfile.py", line 25, in execfile exec(compile(contents+"\n", file, 'exec'), glob, loc) File "/Users/mac/working/dev_root/xxx/projects/crawler_projects/crawler_xxx_app/processVideo/ProcessxxxVideo.py", line 892, in <module> processVideo() File "/Users/mac/working/dev_root/xxx/projects/crawler_projects/crawler_xxx_app/processVideo/ProcessxxxVideo.py", line 882, in processVideo processUserShows(userId) File "/Users/mac/working/dev_root/xxx/projects/crawler_projects/crawler_xxx_app/processVideo/ProcessxxxVideo.py", line 870, in processUserShows processSingleShow(userId, showId) File "/Users/mac/working/dev_root/xxx/projects/crawler_projects/crawler_xxx_app/processVideo/ProcessxxxVideo.py", line 719, in processSingleShow showFailLogAndSaveFailResult(showVideoFilename, errMsg, userId, showId, courseId) File "/Users/mac/working/dev_root/xxx/projects/crawler_projects/crawler_xxx_app/processVideo/ProcessxxxVideo.py", line 569, in showFailLogAndSaveFailResult saveErrorResult(errInfoDict) File "/Users/mac/working/dev_root/xxx/projects/crawler_projects/crawler_xxx_app/processVideo/ProcessxxxVideo.py", line 544, in saveErrorResult saveJsonToFile(ResultFileFail, resultDict) File "/Users/mac/working/dev_root/xxx/projects/crawler_projects/crawler_xxx_app/processVideo/ProcessxxxVideo.py", line 161, in saveJsonToFile json.dump(jsonValue, jsonFp, indent=2, ensure_ascii=False) File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/json/__init__.py", line 180, in dump fp.write(chunk) File "/Library/Frameworks/Python.framework/Versions/3.7/lib/python3.7/codecs.py", line 721, in write return self.writer.write(data) KeyboardInterrupt ^C macdeMacBook-Pro:crawler_projects mac$
然后可以看到:
- 已处理:214629 ≈ 21万
- 成功:32581 ≈ 3.2万
现在去拷贝已处理的数据到单独的硬盘中
继续用之前的代码,修改后:
import shutil
import os
import codecs
import json

# Root of the original (source) data and root of the destination disk.
SrcRootFolder = "/Volumes/内容备份2/Backup/20180601/Offline/xxx/"
DestRootFolder = "/Volumes/外借5/xxx_2.2T_processed/"
# JSON file holding the per-show processing results; read by main().
ProcessResultJsonFile = "/Users/mac/working/dev_root/xxx/projects/crawler_projects/crawler_xxx_app/debug/processResult_2T/processResult_processed.json"

# Maximum number of shows to copy; a positive value (e.g. 200) copies only a
# sample, 0 means no limit -> copy all.
MaxSampleNum = 0


def generateUserFoler(rootFolder, userId):
    """Return the path <rootFolder>/user/<userId>."""
    return os.path.join(rootFolder, "user", userId)


def generateUserShowFoler(rootFolder, userId, showId):
    """Return the path <rootFolder>/user/<userId>/show/<showId>."""
    return os.path.join(rootFolder, "user", userId, "show", showId)


def generateCourseFoler(rootFolder, courseId):
    """Return the path <rootFolder>/course/<courseId>."""
    return os.path.join(rootFolder, "course", courseId)


def loadJsonFromFile(fullFilename):
    """Load and parse JSON from a UTF-8 encoded file and return the result."""
    with codecs.open(fullFilename, 'r', encoding="utf-8") as jsonFp:
        jsonDict = json.load(jsonFp)
    print("Complete load json from %s" % fullFilename)
    return jsonDict


def copyFolder(sourceFolerPath, destFolderPath):
    """Copy a whole folder tree, replacing the destination if it exists.

    Args:
        sourceFolerPath: folder to copy from.
        destFolderPath: folder to copy to; removed first when already present.

    Returns:
        The value returned by shutil.copytree.
    """
    print("copyFolder: sourceFolerPath=%s, destFolderPath=%s" % (sourceFolerPath, destFolderPath))
    # Remove any previous copy first so copytree always starts from a clean
    # destination.
    if os.path.exists(destFolderPath):
        shutil.rmtree(destFolderPath)
        print("Deleted existing destination folder: %s" % destFolderPath)
    copyResult = shutil.copytree(sourceFolerPath, destFolderPath)
    print("copyResult=%s" % copyResult)
    return copyResult


def copyProcessedShows(processedShowsDict, srcRootFolder, destRootFolder, maxSampleNum=0):
    """Copy every successfully processed show, plus its course, to the destination.

    Args:
        processedShowsDict: mapping of show id -> info dict; each info dict is
            read for "course_id", "uid" and ["result"]["success"].
        srcRootFolder: root folder the show/course folders are copied from.
        destRootFolder: root folder the show/course folders are copied to.
        maxSampleNum: stop after this many shows were copied; 0 (or any
            non-positive value) means no limit.

    Returns:
        The number of shows copied.
    """
    copiedNum = 0
    # Many shows can share one course; remember the courses copied during this
    # run so a shared course folder is not deleted and re-copied per show.
    copiedCourseIds = set()
    for eachShowId, processedShowInfo in processedShowsDict.items():
        # ">=" so that exactly maxSampleNum shows are copied (">" copied one extra).
        if maxSampleNum > 0 and copiedNum >= maxSampleNum:
            break
        print("---------- [%d] eachShowId=%s" % (copiedNum, eachShowId))
        courseId = processedShowInfo["course_id"]
        userId = processedShowInfo["uid"]
        processSuccessedNum = processedShowInfo["result"]["success"]
        if processSuccessedNum <= 0:
            # Only shows with at least one success are copied.
            continue

        srcShowFolder = generateUserShowFoler(srcRootFolder, userId, eachShowId)
        destShowFolder = generateUserShowFoler(destRootFolder, userId, eachShowId)
        copyFolder(srcShowFolder, destShowFolder)

        if courseId not in copiedCourseIds:
            srcCourseFolder = generateCourseFoler(srcRootFolder, courseId)
            destCourseFolder = generateCourseFoler(destRootFolder, courseId)
            copyFolder(srcCourseFolder, destCourseFolder)
            copiedCourseIds.add(courseId)

        copiedNum += 1
    return copiedNum


def main():
    """Load the process-result JSON and copy all successfully processed shows."""
    processResultJson = loadJsonFromFile(ProcessResultJsonFile)
    # print("processResultJson=%s" % processResultJson)
    processedShowsDict = processResultJson["shows"]
    copiedNum = copyProcessedShows(processedShowsDict, SrcRootFolder, DestRootFolder, MaxSampleNum)
    print("+++++++++ Completed copy %d sample data" % copiedNum)


if __name__ == "__main__":
    main()
去运行:
拷贝速度还是很快的,一会就拷贝了几千个了。
转载请注明:在路上 » 【记录】中断2.2T趣配音数据处理并拷贝出已处理数据