Working on:
The speech synthesis has already been integrated into Flask.
Next, add support in the frontend page for passing parameters to the backend, which forwards them on to Microsoft Azure TTS speech synthesis.
That makes it easy to test, from the frontend, how the synthesized speech sounds with different voices.
Add the configuration to the frontend page.
The page already uses Bootstrap, so look for its list/select control:
[Solved] Bootstrap: set a default value for a select list and get the currently selected value
Then add the other options, such as 🔊 volume and speech rate:
<code>
<div class="row">
    <div class="col-lg-4 col-md-4 col-sm-6 col-xs-12">
        <div class="input-group">
            <span class="input-group-addon">Voice Speed</span>
            <input type="text" class="form-control" placeholder="eg: -40.00%, +20.00%" id="voiceRate" value="-30.00%">
        </div>
    </div>
</div>
<div class="row">
    <div class="col-lg-4 col-md-4 col-sm-6 col-xs-12">
        <div class="input-group">
            <span class="input-group-addon">Voice Volume</span>
            <input type="text" class="form-control" placeholder="eg: +25.00%, -30.00%" id="voiceVolume" value="+40.00%">
        </div>
    </div>
</div>
</code>
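Since the rate and volume arrive as signed percent strings (eg "-40.00%"), it may be worth validating the format before interpolating them into the SSML. A minimal sketch; the isValidProsodyPercent helper is hypothetical, not part of the original code:
<code>
import re

# Hypothetical helper (not in the original code): accept only values like
# "-40.00%" or "+20.00%", ie an explicit sign, digits, optional decimals, "%"
def isValidProsodyPercent(valueStr):
    return bool(re.match(r"^[+-]\d+(\.\d+)?%$", valueStr))

print(isValidProsodyPercent("-30.00%"))  # True
print(isValidProsodyPercent("fast"))     # False
</code>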
The resulting UI:
Then update the backend code, adding the parameter settings for Microsoft Azure TTS:
<code>
# def doAudioSynthesis(unicodeText):
def doAudioSynthesis(unicodeText, voiceName=MS_TTS_VOICE_NAME, voiceRate=MS_TTS_VOICE_RATE, voiceVolume=MS_TTS_VOICE_VOLUME):
    """
        do audio synthesis from unicode text
        if failed for token invalid/expired, will refresh token to do one more retry
    """
    # global app, log, gCurBaiduRespDict
    global app, log

    isOk = False
    audioBinData = None
    errMsg = ""

    # # for debug
    # gCurBaiduRespDict["access_token"] = "99.569b3b5b470938a522ce60d2e2ea2506.2592000.1528015602.282335-11192483"

    log.info("doAudioSynthesis: unicodeText=%s", unicodeText)
    # isOk, audioBinData, errNo, errMsg = baiduText2Audio(unicodeText)
    isOk, audioBinData, errNo, errMsg = msTTS(unicodeText, voiceName, voiceRate, voiceVolume)
    log.info("isOk=%s, errNo=%d, errMsg=%s", isOk, errNo, errMsg)
    # return the three values that callers unpack
    return isOk, audioBinData, errMsg


def msTTS(unicodeText, voiceName=MS_TTS_VOICE_NAME, voiceRate=MS_TTS_VOICE_RATE, voiceVolume=MS_TTS_VOICE_VOLUME):
    """call ms azure tts to generate audio(mp3/wav/...) from text"""
    global app, log, gMsToken

    log.info("msTTS: unicodeText=%s", unicodeText)

    isOk = False
    audioBinData = None
    errNo = 0
    errMsg = "Unknown error"

    msTtsUrl = app.config["MS_TTS_URL"]
    log.info("msTtsUrl=%s", msTtsUrl)

    reqHeaders = {
        "Content-Type": "application/ssml+xml",
        "X-Microsoft-OutputFormat": MS_TTS_OUTPUT_FORMAT,
        "Ocp-Apim-Subscription-Key": app.config["MS_TTS_SECRET_KEY"],
        # note: the auth scheme must be "Bearer", not "Bear"
        "Authorization": "Bearer " + gMsToken
    }
    log.info("reqHeaders=%s", reqHeaders)

    # # for debug
    # MS_TTS_VOICE_NAME = "zhang san"

    ssmlDataStr = """
        <speak version='1.0' xmlns="http://www.w3.org/2001/10/synthesis" xml:lang='en-US'>
            <voice name='%s'>
                <prosody rate='%s' volume='%s'>
                    %s
                </prosody>
            </voice>
        </speak>
    """ % (voiceName, voiceRate, voiceVolume, unicodeText)
    log.info("ssmlDataStr=%s", ssmlDataStr)

    resp = requests.post(msTtsUrl, headers=reqHeaders, data=ssmlDataStr)
    log.info("resp=%s", resp)
    statusCode = resp.status_code
    log.info("statusCode=%s", statusCode)

    if statusCode == 200:
        # respContentType = resp.headers["Content-Type"]  # 'audio/x-wav', 'audio/mpeg'
        # log.info("respContentType=%s", respContentType)
        # if re.match("audio/.*", respContentType):
        audioBinData = resp.content
        log.info("resp content is audio binary data, length=%d", len(audioBinData))
        isOk = True
        errMsg = ""
    else:
        isOk = False
        errNo = resp.status_code
        errMsg = resp.reason
        log.error("resp errNo=%d, errMsg=%s", errNo, errMsg)
        # errNo=400, errMsg=Voice zhang san not supported
        # errNo=401, errMsg=Unauthorized
        # errNo=413, errMsg=Content length exceeded the allowed limit of 1024 characters.

    return isOk, audioBinData, errNo, errMsg


class RobotQaAPI(Resource):

    def processResponse(self, respDict, voiceName=MS_TTS_VOICE_NAME, voiceRate=MS_TTS_VOICE_RATE, voiceVolume=MS_TTS_VOICE_VOLUME):
        """
            process response dict before return
            generate audio for response text part
        """
        global log, gTempAudioFolder

        unicodeText = respDict["data"]["response"]["text"]
        log.info("unicodeText=%s", unicodeText)
        if not unicodeText:
            log.info("No response text to do audio synthesis")
            return jsonify(respDict)

        isOk, audioBinData, errMsg = doAudioSynthesis(unicodeText, voiceName, voiceRate, voiceVolume)
        if isOk:
            # 1. save audio binary data into tmp file
            tempFilename = saveAudioDataToTmpFile(audioBinData)
            # 2. use celery to delay delete tmp file
            delayTimeToDelete = app.config["CELERY_DELETE_TMP_AUDIO_FILE_DELAY"]
            deleteTmpAudioFile.apply_async([tempFilename], countdown=delayTimeToDelete)
            log.info("Delay %s seconds to delete %s", delayTimeToDelete, tempFilename)
            # 3. generate temp audio file url
            tmpAudioUrl = "http://%s:%d/tmp/audio/%s" % (
                app.config["FILE_URL_HOST"], app.config["FLASK_PORT"], tempFilename)
            log.info("tmpAudioUrl=%s", tmpAudioUrl)
            respDict["data"]["response"]["audioUrl"] = tmpAudioUrl
        else:
            log.warning("Fail to get synthesis audio for errMsg=%s", errMsg)

        log.info("respDict=%s", respDict)
        return jsonify(respDict)

    def get(self):
        respDict = {
            "code": 200,
            "message": "generate response ok",
            "data": {
                "input": "",
                "response": {
                    "text": "",
                    "audioUrl": ""
                },
                "control": "",
                "audio": {}
            }
        }

        parser = reqparse.RequestParser()
        # i want to hear the story of Baby Sister Says No
        parser.add_argument('input', type=str, help="input words")
        parser.add_argument('voiceName', type=str, default=MS_TTS_VOICE_NAME, help="voice name/speaker")
        parser.add_argument('voiceRate', type=str, default=MS_TTS_VOICE_RATE, help="voice rate/speed")
        parser.add_argument('voiceVolume', type=str, default=MS_TTS_VOICE_VOLUME, help="voice volume")
        log.info("parser=%s", parser)
        parsedArgs = parser.parse_args()
        log.info("parsedArgs=%s", parsedArgs)
        if not parsedArgs:
            respDict["data"]["response"]["text"] = "Can not recognize input"
            return self.processResponse(respDict)

        inputStr = parsedArgs["input"]
        voiceName = parsedArgs["voiceName"]
        voiceRate = parsedArgs["voiceRate"]
        voiceVolume = parsedArgs["voiceVolume"]
        log.info("inputStr=%s, voiceName=%s, voiceRate=%s, voiceVolume=%s",
                 inputStr, voiceName, voiceRate, voiceVolume)

        if not inputStr:
            respDict["data"]["response"]["text"] = "Can not recognize parameter input"
            return self.processResponse(respDict, voiceName, voiceRate, voiceVolume)

        respDict["data"]["input"] = inputStr

        aiResult = QueryAnalyse(inputStr, aiContext)
        log.info("aiResult=%s", aiResult)
        if aiResult["response"]:
            respDict["data"]["response"]["text"] = aiResult["response"]
        if aiResult["control"]:
            respDict["data"]["control"] = aiResult["control"]
        log.info('respDict["data"]=%s', respDict["data"])

        audioFileIdStr = aiResult["mediaId"]
        log.info("audioFileIdStr=%s", audioFileIdStr)
        if audioFileIdStr:
            audioFileObjectId = ObjectId(audioFileIdStr)
            log.info("audioFileObjectId=%s", audioFileObjectId)
            if fsCollection.exists(audioFileObjectId):
                audioFileObj = fsCollection.get(audioFileObjectId)
                log.info("audioFileObj=%s", audioFileObj)
                encodedFilename = quote(audioFileObj.filename)
                log.info("encodedFilename=%s", encodedFilename)
                respDict["data"]["audio"] = {
                    "contentType": audioFileObj.contentType,
                    "name": audioFileObj.filename,
                    "size": audioFileObj.length,
                    "url": "http://%s:%d/files/%s/%s" % (
                        app.config["FILE_URL_HOST"], app.config["FLASK_PORT"],
                        audioFileObj._id, encodedFilename)
                }
                log.info("respDict=%s", respDict)
                return self.processResponse(respDict, voiceName, voiceRate, voiceVolume)
            else:
                log.info("Can not find file from id %s", audioFileIdStr)
                respDict["data"]["audio"] = {}
                return self.processResponse(respDict, voiceName, voiceRate, voiceVolume)
        else:
            log.info("Not response file id")
            respDict["data"]["audio"] = {}
            return self.processResponse(respDict, voiceName, voiceRate, voiceVolume)
</code>
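Note that the doAudioSynthesis docstring promises a token refresh plus one retry when the token is invalid or expired, but that part is not shown above. A minimal sketch of what the refresh could look like, assuming a MS_TTS_TOKEN_URL config entry pointing at the Cognitive Services issueToken endpoint; both that config key and the refreshMsToken name are my own assumptions:
<code>
import requests

def refreshMsToken():
    """Sketch only: refresh the cached Azure TTS access token.
    Assumes app.config["MS_TTS_TOKEN_URL"] holds the issueToken endpoint,
    eg https://api.cognitive.microsoft.com/sts/v1.0/issueToken
    """
    global app, log, gMsToken
    resp = requests.post(
        app.config["MS_TTS_TOKEN_URL"],
        headers={"Ocp-Apim-Subscription-Key": app.config["MS_TTS_SECRET_KEY"]})
    if resp.status_code == 200:
        # the token is returned as plain text in the response body
        gMsToken = resp.text
        log.info("refreshed gMsToken=%s", gMsToken)
    else:
        log.error("token refresh failed: %d %s", resp.status_code, resp.reason)
</code>
With something like that in place, doAudioSynthesis can call refreshMsToken() and retry msTTS once when errNo comes back as 401.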
The result:
It works: setting different parameters produces the correspondingly synthesized speech:
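For a quick check without the frontend, the API can also be called directly. A sketch assuming the Resource is registered at /robot_qa on a locally running server; the route, host, port, and the example voiceName are all assumptions, not taken from the code above:
<code>
import requests

# Assumed route, host and port; adjust to the actual api.add_resource() registration
resp = requests.get("http://127.0.0.1:5000/robot_qa", params={
    "input": "i want to hear the story of Baby Sister Says No",
    "voiceName": "Microsoft Server Speech Text to Speech Voice (en-US, ZiraRUS)",
    "voiceRate": "-10.00%",
    "voiceVolume": "+20.00%",
})
print(resp.json()["data"]["response"]["audioUrl"])
</code>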
Then just deploy the code to the server.