Working through it:
Along the way I basically figured out the approach and the choices:
Use Flask to wrap Baidu's speech synthesis (TTS) API.
Internally it uses a Python SDK,
installed directly via pip,
and token expiry has to be handled properly:
if the response is a dict and its err_no is 502, the token is invalid or expired,
so use refresh_token to obtain a new, valid token
and retry the request once.
If all goes well, the call returns the synthesized mp3 as binary data.
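The retry logic boils down to this (a minimal sketch; baiduText2Audio, refreshBaiduToken and BAIDU_ERR_TOKEN_INVALID are defined in the full app.py further below):

# minimal sketch of the token-expiry handling (the full version is doAudioSynthesis in app.py below)
isOk, mp3BinData, errNo, errMsg = baiduText2Audio(unicodeText)
if (not isOk) and (errNo == BAIDU_ERR_TOKEN_INVALID):  # err_no 502: token invalid or expired
    refreshBaiduToken()  # refresh via refresh_token
    isOk, mp3BinData, errNo, errMsg = baiduText2Audio(unicodeText)  # retry once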
Once that part is working, the next thing to figure out is:
where to save the mp3 binary data, and how to return an accessible URL for external callers to use.
The wrapped API could later also support two modes (roughly sketched after this list):
- return the URL of the mp3 directly
- return the mp3 binary data as a file
with the return type selected via an input parameter.
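A rough idea of what such a dual-mode endpoint might look like, if added inside app.py (not part of the final code below; the route /tts, the parameter name returnType and the class name TtsAPI are made up for illustration, while doAudioSynthesis, sendFile, generateUUID and the config keys are the ones defined later):

from flask import request  # in addition to the imports already in app.py

# hypothetical dual-mode TTS endpoint: ?returnType=url (default) or ?returnType=file
class TtsAPI(Resource):
    def get(self):
        returnType = request.args.get("returnType", "url")
        text = request.args.get("text", "")
        isOk, audioBinData, errMsg = doAudioSynthesis(text)
        if not isOk:
            return jsonify({"code": 500, "message": errMsg, "data": {}})
        if returnType == "file":
            # mode 2: return the mp3 binary data directly as a downloadable file
            return sendFile(audioBinData, "audio/mpeg", "tts.mp3")
        # mode 1: save into the temp folder and return an accessible URL
        tempFilename = generateUUID() + ".mp3"
        with open(os.path.join(gTempAudioFolder, tempFilename), "wb") as f:
            f.write(audioBinData)
        tmpAudioUrl = "http://%s:%d/tmp/audio/%s" % (
            app.config["FILE_URL_HOST"], app.config["FLASK_PORT"], tempFilename)
        return jsonify({"code": 200, "message": "ok", "data": {"audioUrl": tmpAudioUrl}})

# api.add_resource(TtsAPI, '/tts', endpoint='tts')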
First, look into it:
The first step is to wrap Baidu's speech synthesis API,
which in turn first requires obtaining a token.
That has to be done from the Flask server side with HTTP requests, so I searched:
python http
and decided to use requests.
➜  server pipenv install requests
Installing requests…
Looking in indexes: https://pypi.python.org/simple
Collecting requests
  Using cached https://files.pythonhosted.org/packages/49/df/50aa1999ab9bde74656c2919d9c0c085fd2b3775fd3eca826012bef76d8c/requests-2.18.4-py2.py3-none-any.whl
Collecting idna<2.7,>=2.5 (from requests)
  Using cached https://files.pythonhosted.org/packages/27/cc/6dd9a3869f15c2edfab863b992838277279ce92663d334df9ecf5106f5c6/idna-2.6-py2.py3-none-any.whl
Collecting urllib3<1.23,>=1.21.1 (from requests)
  Using cached https://files.pythonhosted.org/packages/63/cb/6965947c13a94236f6d4b8223e21beb4d576dc72e8130bd7880f600839b8/urllib3-1.22-py2.py3-none-any.whl
Collecting chardet<3.1.0,>=3.0.2 (from requests)
  Using cached https://files.pythonhosted.org/packages/bc/a9/01ffebfb562e4274b6487b4bb1ddec7ca55ec7510b22e4c51f14098443b8/chardet-3.0.4-py2.py3-none-any.whl
Collecting certifi>=2017.4.17 (from requests)
  Using cached https://files.pythonhosted.org/packages/7c/e6/92ad559b7192d846975fc916b65f667c7b8c3a32bea7372340bfe9a15fa5/certifi-2018.4.16-py2.py3-none-any.whl
Installing collected packages: idna, urllib3, chardet, certifi, requests
Successfully installed certifi-2018.4.16 chardet-3.0.4 idna-2.6 requests-2.18.4 urllib3-1.22
Adding requests to Pipfile's [packages]…
Pipfile.lock (aa619c) out of date, updating to (b165df)…
Locking [dev-packages] dependencies…
Locking [packages] dependencies…
Updated Pipfile.lock (b165df)!
Installing dependencies from Pipfile.lock (b165df)…
  🐍   ▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉ 23/23 — 0
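With requests installed, the token request itself is just a single GET against Baidu's OAuth endpoint (a minimal sketch of what getBaiduToken() in the full code does; API_KEY and SECRET_KEY stand in for the values from config.py):

import requests

# minimal token fetch (full version: getBaiduToken() in app.py below)
tokenUrl = ("https://openapi.baidu.com/oauth/2.0/token"
            "?grant_type=client_credentials&client_id=%s&client_secret=%s") % (API_KEY, SECRET_KEY)
resp = requests.get(tokenUrl)
tokenDict = resp.json()  # contains access_token, refresh_token, expires_in, ...
accessToken = tokenDict["access_token"]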
Along the way I also had to work out how to URL-encode strings in Python 3:
【Solved】How to URL-encode a string in Python
【Summary】
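In short, urllib.parse.quote() does the job; a minimal example (the sample sentence is the same one used in the test code below):

from urllib.parse import quote

unicodeText = "as a book-collector, i have the story you just want to listen!"
encodedText = quote(unicodeText)
print(encodedText)
# as%20a%20book-collector%2C%20i%20have%20the%20story%20you%20just%20want%20to%20listen%21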
In the end I basically had the corresponding pieces wrapped up:
get the token
refresh the token
return the synthesized audio data
Hooking this into the Flask API to expose it externally is just the usual approach; on top of that, an API for serving the tmp files is added here.
The final code is as follows:
config.py
# Audio Synthesis / TTS
BAIDU_API_KEY = "SNxxxxxxcaz"
BAIDU_SECRET_KEY = "47d7cxxxxx7ba"
AUDIO_TEMP_FOLDER = "tmp/audio"
app.py
import os
import io
import re
import json
import uuid
import logging
from logging.handlers import RotatingFileHandler
from urllib.parse import quote

import requests
from flask import Flask, send_file, jsonify
from flask_cors import CORS
from flask_restful import Api, Resource
# NOTE: Context comes from the project's own dialog/QA module; its import is omitted in the original post

################################################################################
# Global Definitions
################################################################################

"""
http://ai.baidu.com/docs#/TTS-API/top
500 input not supported
501 invalid input parameter
502 token verification failed
503 synthesis backend error
"""
BAIDU_ERR_NOT_SUPPORT_PARAM = 500
BAIDU_ERR_PARAM_INVALID = 501
BAIDU_ERR_TOKEN_INVALID = 502
BAIDU_ERR_BACKEND_SYNTHESIS_FAILED = 503

################################################################################
# Global Variables
################################################################################

log = None
app = None

"""
{
    "access_token": "24.569b3b5b470938a522ce60d2e2ea2506.2592000.1528015602.282335-11192483",
    "session_key": "9mzdDoR4p/oer6IHdcpJwlbK6tpH5rWqhjMi708ubA8vTgu1OToODZAXf7/963ZpEG7+yEdcdCxXq0Yp9VoSgFCFOSGEIA==",
    "scope": "public audio_voice_assistant_get audio_tts_post wise_adapt lebo_resource_base lightservice_public hetu_basic lightcms_map_poi kaidian_kaidian ApsMisTest_Test权限 vis-classify_flower lpq_开放 cop_helloScope ApsMis_fangdi_permission smartapp_snsapi_base",
    "refresh_token": "25.5acf5c4d9fdfdbe577e75f3f2fd137b8.315360000.1840783602.282335-11192483",
    "session_secret": "121fe91236ef88ab24b2ecab479427ea",
    "expires_in": 2592000
}
"""
gCurBaiduRespDict = {} # baidu get-token response dict
gTempAudioFolder = ""

################################################################################
# Global Function
################################################################################

def generateUUID(prefix = ""):
    generatedUuid4 = uuid.uuid4()
    generatedUuid4Str = str(generatedUuid4)
    newUuid = prefix + generatedUuid4Str
    return newUuid

#----------------------------------------
# Audio Synthesis / TTS
#----------------------------------------

def createAudioTempFolder():
    """create folder to save later temp audio files"""
    global log, gTempAudioFolder

    # init audio temp folder for later store temp audio file
    audioTmpFolder = app.config["AUDIO_TEMP_FOLDER"]
    log.info("audioTmpFolder=%s", audioTmpFolder)
    curFolderAbsPath = os.getcwd() #'/Users/crifan/dev/dev_root/company/xxx/projects/robotDemo/server'
    log.info("curFolderAbsPath=%s", curFolderAbsPath)
    audioTmpFolderFullPath = os.path.join(curFolderAbsPath, audioTmpFolder)
    log.info("audioTmpFolderFullPath=%s", audioTmpFolderFullPath)
    if not os.path.exists(audioTmpFolderFullPath):
        os.makedirs(audioTmpFolderFullPath)
    gTempAudioFolder = audioTmpFolderFullPath
    log.info("gTempAudioFolder=%s", gTempAudioFolder)

def initAudioSynthesis():
    """
    init audio synthesis related:
        init token
    :return:
    """
    getBaiduToken()
    createAudioTempFolder()

def getBaiduToken():
    """get baidu token"""
    global app, log, gCurBaiduRespDict

    getBaiduTokenUrlTemplate = "https://openapi.baidu.com/oauth/2.0/token?grant_type=client_credentials&client_id=%s&client_secret=%s"
    getBaiduTokenUrl = getBaiduTokenUrlTemplate % (app.config["BAIDU_API_KEY"], app.config["BAIDU_SECRET_KEY"])
    log.info("getBaiduTokenUrl=%s", getBaiduTokenUrl)
    #https://openapi.baidu.com/oauth/2.0/token?grant_type=client_credentials&client_id=SNjsxxxxcaz&client_secret=47d7c0xxx7ba
    resp = requests.get(getBaiduTokenUrl)
    log.info("resp=%s", resp)
    respJson = resp.json()
    log.info("respJson=%s", respJson)
    #{'access_token': '24.8f1f35xxxx5.2592000.1xxx0.282335-11192483', 'session_key': '9mzxxxp5ZUHafqq8m+6KwgZmw==', 'scope': 'public audio_voice_assistant_get audio_tts_post wise_adapt lebo_resource_base lightservice_public hetu_basic lightcms_map_poi kaidian_kaidian ApsMisTest_Test权限 vis-classify_flower lpq_开放 cop_helloScope ApsMis_fangdi_permission smartapp_snsapi_base', 'refresh_token': '25.eb2xxxe.315360000.1841377320.282335-11192483', 'session_secret': 'c0a83630b7b1c46039e360de417d346e', 'expires_in': 2592000}

    if resp.status_code == 200:
        gCurBaiduRespDict = respJson
        log.info("get baidu token resp: %s", gCurBaiduRespDict)
    else:
        log.error("error while get baidu token: %s", respJson)
        #{'error': 'invalid_client', 'error_description': 'Client authentication failed'}
        #{'error': 'invalid_client', 'error_description': 'unknown client id'}
        #{'error': 'unsupported_grant_type', 'error_description': 'The authorization grant type is not supported'}

def refreshBaiduToken():
    """refresh baidu token when current token invalid"""
    global app, log, gCurBaiduRespDict

    if gCurBaiduRespDict:
        refreshBaiduTokenUrlTemplate = "https://openapi.baidu.com/oauth/2.0/token?grant_type=refresh_token&refresh_token=%s&client_id=%s&client_secret=%s"
        refreshBaiduTokenUrl = refreshBaiduTokenUrlTemplate % (gCurBaiduRespDict["refresh_token"], app.config["BAIDU_API_KEY"], app.config["BAIDU_SECRET_KEY"])
        log.info("refreshBaiduTokenUrl=%s", refreshBaiduTokenUrl)
        #https://openapi.baidu.com/oauth/2.0/token?grant_type=refresh_token&refresh_token=25.1b7xxxxedbb99ea361.31536xx.1841379583.282335-11192483&client_id=SNjsggdYDNWtnlbKhxsPLcaz&client_secret=47dxxxxba
        resp = requests.get(refreshBaiduTokenUrl)
        log.info("resp=%s", resp)
        respJson = resp.json()
        log.info("respJson=%s", respJson)

        if resp.status_code == 200:
            gCurBaiduRespDict = respJson
            log.info("Ok to refresh baidu token response: %s", gCurBaiduRespDict)
        else:
            log.error("error while refresh baidu token: %s", respJson)
    else:
        log.error("Can't refresh baidu token for previous not get token")

def baiduText2Audio(unicodeText):
    """call baidu text2audio to generate mp3 audio from text"""
    global app, log, gCurBaiduRespDict

    log.info("baiduText2Audio: unicodeText=%s", unicodeText)

    isOk = False
    mp3BinData = None
    errNo = 0
    errMsg = "Unknown error"

    if not gCurBaiduRespDict:
        errMsg = "Need get baidu token before call text2audio"
        return isOk, mp3BinData, errNo, errMsg

    utf8Text = unicodeText.encode("utf-8")
    log.info("utf8Text=%s", utf8Text)
    encodedUtf8Text = quote(unicodeText)
    log.info("encodedUtf8Text=%s", encodedUtf8Text)

    # http://ai.baidu.com/docs#/TTS-API/top
    tex = encodedUtf8Text # text to synthesize, UTF-8 encoded; fewer than 512 Chinese characters or English letters/digits (after conversion to GBK on Baidu's server, length must be under 1024 bytes)
    tok = gCurBaiduRespDict["access_token"] # developer access_token obtained from the open platform (see the "authentication" section of the docs)
    cuid = app.config["FLASK_APP_NAME"] # unique user id, used to distinguish users and compute UV; a device MAC address or IMEI is recommended, up to 60 characters
    ctp = 1 # client type, fixed value 1 for web
    lan = "zh" # language, fixed value zh; currently only Chinese-English mixed mode is supported
    spd = 5 # speed, 0-9, default 5 (medium)
    pit = 5 # pitch, 0-9, default 5 (medium)
    # vol = 5 # volume, 0-9, default 5 (medium)
    vol = 9
    per = 0 # voice: 0 standard female, 1 standard male, 3 emotional (Du Xiaoyao), 4 emotional (Du Yaya); default is standard female

    getBaiduSynthesizedAudioTemplate = "http://tsn.baidu.com/text2audio?lan=%s&ctp=%s&cuid=%s&tok=%s&vol=%s&per=%s&spd=%s&pit=%s&tex=%s"
    getBaiduSynthesizedAudioUrl = getBaiduSynthesizedAudioTemplate % (lan, ctp, cuid, tok, vol, per, spd, pit, tex)
    log.info("getBaiduSynthesizedAudioUrl=%s", getBaiduSynthesizedAudioUrl)
    #http://tsn.baidu.com/text2audio?lan=zh&ctp=1&cuid=RobotQA&tok=24.5xxxb5.2592000.1528609737.282335-11192483&vol=5&per=0&spd=5&pit=5&tex=as%20a%20book-collector%2C%20i%20have%20the%20story%20you%20just%20want%20to%20listen%21
    resp = requests.get(getBaiduSynthesizedAudioUrl)
    log.info("resp=%s", resp)
    respContentType = resp.headers["Content-Type"]
    respContentTypeLowercase = respContentType.lower() #'audio/mp3'
    log.info("respContentTypeLowercase=%s", respContentTypeLowercase)
    if respContentTypeLowercase == "audio/mp3":
        mp3BinData = resp.content
        log.info("resp content is binary data of mp3, length=%d", len(mp3BinData))
        isOk = True
        errMsg = ""
    elif respContentTypeLowercase == "application/json":
        """
        {
            'err_detail': 'Invalid params per or lan!',
            'err_msg': 'parameter error.',
            'err_no': 501,
            'err_subcode': 50000,
            'tts_logid': 642798357
        }
        {
            'err_detail': 'Invalid params per&pdt!',
            'err_msg': 'parameter error.',
            'err_no': 501,
            'err_subcode': 50000,
            'tts_logid': 1675521246
        }
        {
            'err_detail': 'Access token invalid or no longer valid',
            'err_msg': 'authentication failed.',
            'err_no': 502,
            'err_subcode': 50004,
            'tts_logid': 4221215043
        }
        """
        log.info("resp content is json -> occur error")
        isOk = False
        respDict = resp.json()
        log.info("respDict=%s", respDict)
        errNo = respDict["err_no"]
        errMsg = respDict["err_msg"] + " " + respDict["err_detail"]
    else:
        isOk = False
        errMsg = "Unexpected response content-type: %s" % respContentTypeLowercase

    return isOk, mp3BinData, errNo, errMsg

def doAudioSynthesis(unicodeText):
    """
    do audio synthesis from unicode text
    if failed for token invalid/expired, will refresh token to do one more retry
    """
    global app, log, gCurBaiduRespDict

    isOk = False
    audioBinData = None
    errMsg = ""

    # # for debug
    # gCurBaiduRespDict["access_token"] = "99.5xxx06.2592000.1528015602.282335-11192483"

    log.info("doAudioSynthesis: unicodeText=%s", unicodeText)
    isOk, audioBinData, errNo, errMsg = baiduText2Audio(unicodeText)
    log.info("isOk=%s, errNo=%d, errMsg=%s", isOk, errNo, errMsg)
    if isOk:
        errMsg = ""
        log.info("got synthesized audio binary data length=%d", len(audioBinData))
    else:
        if errNo == BAIDU_ERR_TOKEN_INVALID:
            log.warning("Token invalid -> refresh token")
            refreshBaiduToken()
            isOk, audioBinData, errNo, errMsg = baiduText2Audio(unicodeText)
            log.info("after refresh token: isOk=%s, errNo=%s, errMsg=%s", isOk, errNo, errMsg)
        else:
            log.warning("try synthesized audio occur error: errNo=%d, errMsg=%s", errNo, errMsg)
            audioBinData = None

    log.info("return isOk=%s, errMsg=%s", isOk, errMsg)
    if audioBinData:
        log.info("audio binary bytes=%d", len(audioBinData))

    return isOk, audioBinData, errMsg

def testAudioSynthesis():
    global app, log, gCurBaiduRespDict, gTempAudioFolder

    testInputUnicodeText = u"as a book-collector, i have the story you just want to listen!"
    isOk, audioBinData, errMsg = doAudioSynthesis(testInputUnicodeText)
    if isOk:
        audioBinDataLen = len(audioBinData)
        log.info("Now will save audio binary data %d bytes to file", audioBinDataLen)

        # 1. save mp3 binary data into tmp file
        newUuid = generateUUID()
        log.info("newUuid=%s", newUuid)
        tempFilename = newUuid + ".mp3"
        log.info("tempFilename=%s", tempFilename)
        if not gTempAudioFolder:
            createAudioTempFolder()
        tempAudioFullname = os.path.join(gTempAudioFolder, tempFilename) #'/Users/crifan/dev/dev_root/company/xxx/projects/robotDemo/server/tmp/audio/2aba73d1-f8d0-4302-9dd3-d1dbfad44458.mp3'
        log.info("tempAudioFullname=%s", tempAudioFullname)
        with open(tempAudioFullname, 'wb') as tmpAudioFp:
            log.info("tmpAudioFp=%s", tmpAudioFp)
            tmpAudioFp.write(audioBinData)
            tmpAudioFp.close()
            log.info("Done to write audio data into file of %d bytes", audioBinDataLen)

        # TODO:
        # 2. use celery to delay delete tmp file
    else:
        log.warning("Fail to get synthesis audio for errMsg=%s", errMsg)

#----------------------------------------
# Flask API
#----------------------------------------

def sendFile(fileBytes, contentType, outputFilename):
    """Flask API use this to send out file (to browser, browser can directly download file)"""
    return send_file(
        io.BytesIO(fileBytes),
        # io.BytesIO(fileObj.read()),
        mimetype=contentType,
        as_attachment=True,
        attachment_filename=outputFilename
    )

################################################################################
# Global Init App
################################################################################

app = Flask(__name__)
CORS(app)
app.config.from_object('config.DevelopmentConfig')
# app.config.from_object('config.ProductionConfig')

logFormatterStr = app.config["LOG_FORMAT"]
logFormatter = logging.Formatter(logFormatterStr)
fileHandler = RotatingFileHandler(
    app.config['LOG_FILE_FILENAME'],
    maxBytes=app.config["LOF_FILE_MAX_BYTES"],
    backupCount=app.config["LOF_FILE_BACKUP_COUNT"],
    encoding="UTF-8")
fileHandler.setLevel(logging.DEBUG)
fileHandler.setFormatter(logFormatter)
app.logger.addHandler(fileHandler)
app.logger.setLevel(logging.DEBUG) # set root log level
log = app.logger
log.debug("app=%s", app)
log.info("app.config=%s", app.config)

api = Api(app)
log.info("api=%s", api)

aiContext = Context()
log.info("aiContext=%s", aiContext)

initAudioSynthesis()
# testAudioSynthesis()

class RobotQaAPI(Resource):

    def processResponse(self, respDict):
        """
        process response dict before return
            generate audio for response text part
        """
        global log, gTempAudioFolder

        tmpAudioUrl = ""
        unicodeText = respDict["data"]["response"]["text"]
        log.info("unicodeText=%s", unicodeText)
        if not unicodeText:
            log.info("No response text to do audio synthesis")
            return jsonify(respDict)

        isOk, audioBinData, errMsg = doAudioSynthesis(unicodeText)
        if isOk:
            audioBinDataLen = len(audioBinData)
            log.info("audioBinDataLen=%s", audioBinDataLen)

            # 1. save mp3 binary data into tmp file
            newUuid = generateUUID()
            log.info("newUuid=%s", newUuid)
            tempFilename = newUuid + ".mp3"
            log.info("tempFilename=%s", tempFilename)
            if not gTempAudioFolder:
                createAudioTempFolder()
            tempAudioFullname = os.path.join(gTempAudioFolder, tempFilename)
            log.info("tempAudioFullname=%s", tempAudioFullname) # 'xxx/tmp/audio/2aba73d1-f8d0-4302-9dd3-d1dbfad44458.mp3'
            with open(tempAudioFullname, 'wb') as tmpAudioFp:
                log.info("tmpAudioFp=%s", tmpAudioFp)
                tmpAudioFp.write(audioBinData)
                tmpAudioFp.close()
                log.info("Saved %d bytes data into temp audio file %s", audioBinDataLen, tempAudioFullname)

            # TODO:
            # 2. use celery to delay delete tmp file

            # generate temp audio file url
            # /tmp/audio
            tmpAudioUrl = "http://%s:%d/tmp/audio/%s" % (
                app.config["FILE_URL_HOST"],
                app.config["FLASK_PORT"],
                tempFilename)
            log.info("tmpAudioUrl=%s", tmpAudioUrl)
            respDict["data"]["response"]["audioUrl"] = tmpAudioUrl
        else:
            log.warning("Fail to get synthesis audio for errMsg=%s", errMsg)

        log.info("respDict=%s", respDict)
        return jsonify(respDict)

    def get(self):
        respDict = {
            "code": 200,
            "message": "generate response ok",
            "data": {
                "input": "",
                "response": {
                    "text": "",
                    "audioUrl": ""
                },
                "control": "",
                "audio": {}
            }
        }

        # 。。。

        respDict["data"]["audio"] = {
            "contentType": audioFileObj.contentType,
            "name": audioFileObj.filename,
            "size": audioFileObj.length,
            "url": "http://%s:%d/files/%s/%s" % (app.config["FILE_URL_HOST"], app.config["FLASK_PORT"], audioFileObj._id, encodedFilename)
        }
        log.info("respDict=%s", respDict)

        return self.processResponse(respDict)

class TmpAudioAPI(Resource):

    def get(self, filename=None):
        global gTempAudioFolder

        log.info("TmpAudioAPI: filename=%s", filename)
        tmpAudioFullPath = os.path.join(gTempAudioFolder, filename)
        log.info("tmpAudioFullPath=%s", tmpAudioFullPath)
        if not os.path.isfile(tmpAudioFullPath):
            log.warning("Not exists file %s", tmpAudioFullPath)
            respDict = {
                "code": 404,
                "message": "Can not find temp audio file %s" % filename,
                "data": {}
            }
            return jsonify(respDict)

        fileSize = os.path.getsize(tmpAudioFullPath)
        log.info("fileSize=%s", fileSize)
        with open(tmpAudioFullPath, "rb") as tmpAudioFp:
            fileBytes = tmpAudioFp.read()
            log.info("read out fileBytes length=%s", len(fileBytes))

        outputFilename = filename
        # contentType = "audio/mp3" # chrome use this
        contentType = "audio/mpeg" # most common and compatible
        return sendFile(fileBytes, contentType, outputFilename)

api.add_resource(RobotQaAPI, '/qa', endpoint='qa')
api.add_resource(TmpAudioAPI, '/tmp/audio/<filename>', endpoint='TmpAudio')
With this in place:
a request to /qa returns a response in which the binary mp3 data from Baidu's speech synthesis API has been saved into the temp folder, and the URL of the generated local Flask tmp file is included.
Then access that URL:
http://127.0.0.1:32851/tmp/audio/edaf4826-8f8a-4c31-8de5-1ce2debff2e8.mp3
and the mp3 file can be downloaded.
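For example, the temp file can be fetched from the client side like this (a minimal sketch; the URL is the one from the example above and the output filename downloaded.mp3 is arbitrary):

import requests

# download the temp mp3 served by the TmpAudioAPI endpoint
audioUrl = "http://127.0.0.1:32851/tmp/audio/edaf4826-8f8a-4c31-8de5-1ce2debff2e8.mp3"
resp = requests.get(audioUrl)
print(resp.headers["Content-Type"])  # expected: audio/mpeg
with open("downloaded.mp3", "wb") as f:
    f.write(resp.content)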
What is left is:
【Unresolved】How to save temporary files in Flask with a configurable expiry