# -*- coding: utf-8 -*-
"""Flask server that turns a text ``scene`` prompt into a WAV file.

A POST/PUT to ``/`` with a ``scene`` form field runs the AudioGen model on
that prompt, writes the result to ``flask_cache/tmp.wav`` and sends the file
back.  A GET to ``/`` renders ``README.md`` as HTML.

The StyleTTS2 speech path is currently disabled; its code is kept below as
comments.
"""
import numpy as np
import soundfile
import audresample
import text_utils
import re
import srt
import subprocess
import markdown
import json
from pathlib import Path
from types import SimpleNamespace
from flask import Flask, request, send_from_directory
from flask_cors import CORS
from audiocraft.audiogen import AudioGen, audio_write

# Sample rate passed to audio_write and used when writing the WAV file.
# Same as the StyleTTS sample rate.
SAMPLE_RATE = 24000

# The model is loaded once at import time and shared by every request.
sound_generator = AudioGen.get_pretrained('facebook/audiogen-medium')
# NOTE(review): duration is presumably in seconds - confirm against audiocraft.
sound_generator.set_generation_params(duration=4)

# ====STYLE VECTOR====

# AFFECTIVE = True
# VOICE = 'en_UK/apope_low'  # en_US/m-ailabs_low#mary_ann

# _dir = '/' if AFFECTIVE else '_v2/'
# precomputed_style_vector = msinference.compute_style(
#     'assets/wavs/style_vector' + _dir + VOICE.replace(
#         '/', '_').replace(
#         '#', '_').replace(
#         'cmu-arctic', 'cmu_arctic').replace(
#         '_low', '') + '.wav')
# print('\n  STYLE VECTOR \n', precomputed_style_vector.shape)

# ==== STYLE VECTOR

CACHE_DIR = 'flask_cache/'
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)


def tts_multi_sentence(scene=None):
    """Generate a background sound for a textual scene description.

    Args:
        scene: Text prompt handed to the AudioGen model, or ``None``.

    Returns:
        ``None`` when ``scene`` is ``None``; otherwise a NumPy array taken
        from row 0 of the ``audio_write`` output.
    """
    if scene is None:
        return None

    generated = sound_generator.generate([scene])[0]
    # NOTE(review): audio_write is called with None as its first argument and
    # its result is used as a tensor, so this is presumably a project-patched
    # variant that returns the normalized audio - confirm.
    normalized = audio_write(None,
                             generated.cpu(),
                             SAMPLE_RATE,
                             strategy="loudness",
                             loudness_compressor=True)
    sound_background = normalized.detach().cpu().numpy()[0, :]

    # # StyleTTS2
    # if ('en_US/' in voice) or ('en_UK/' in voice) or (voice is None):
    #     assert precomputed_style_vector is not None, 'For affective TTS, style vector is needed.'
    #     x = []
    #     for _sentence in text:
    #         x.append(msinference.inference(_sentence,
    #                                        precomputed_style_vector,
    #                                        alpha=0.3,
    #                                        beta=0.7,
    #                                        diffusion_steps=7,
    #                                        embedding_scale=1))
    #     x = np.concatenate(x)

    # return overlay(x, sound_background)

    return sound_background


app = Flask(__name__)
cors = CORS(app)


@app.route("/")
def index():
    """Render README.md as HTML."""
    # Explicit encoding so rendering does not depend on the host locale.
    with open('README.md', 'r', encoding='utf-8') as f:
        return markdown.markdown(f.read())


@app.route("/", methods=['GET', 'POST', 'PUT'])
def serve_wav():
    """Generate audio for the submitted ``scene`` and return it as a WAV file.

    Reads the ``scene`` form field (first value), generates the sound, writes
    it to ``CACHE_DIR`` and sends that file back with a ``suffix-file-type``
    response header set to the file name.

    Returns:
        The file response, or a ``(message, 400)`` tuple when the request has
        no ``scene`` form field.
    """
    # NOTE(review): index() is registered on the same URL for GET, so GET
    # requests are presumably answered there and only POST/PUT arrive here -
    # confirm.

    # https://stackoverflow.com/questions/13522137/in-flask-convert-form-post-
    # object-into-a-representation-suitable-for-mongodb
    r = request.form.to_dict(flat=False)

    # Without this check a missing field made `r.get('scene')[0]` raise
    # TypeError, which surfaced as an opaque HTTP 500.
    scene_values = r.get('scene')
    if not scene_values:
        return "Missing required form field 'scene'.", 400

    args = SimpleNamespace(
        text=r.get('text'),  # string not file?
        scene=scene_values[0],
    )
    # print('\n==RECOMPOSED as \n',request.data,request.form,'\n==')

    x = tts_multi_sentence(args.scene)
    # print('\n\n\n\n Obtained TTS output shape', x.shape)

    # NOTE(review): the output file name is fixed, so concurrent requests
    # would write to the same path.
    OUT_FILE = 'tmp.wav'
    soundfile.write(CACHE_DIR + OUT_FILE, x, SAMPLE_RATE)

    # send server's output as default file -> srv_result.xx
    print(f'\n=SERVER saved as {OUT_FILE=}\n')
    response = send_from_directory(CACHE_DIR, path=OUT_FILE)
    response.headers['suffix-file-type'] = OUT_FILE
    return response


if __name__ == "__main__":
    # Binds to all network interfaces.
    app.run(host="0.0.0.0")