artificial-styletts2 / live_api.py
Dionyssos's picture
debug long sounds
3ac9f34
raw
history blame
3.64 kB
# -*- coding: utf-8 -*-
import numpy as np
import soundfile
import audresample
import text_utils
import re
import subprocess
import markdown
import json
from pathlib import Path
from types import SimpleNamespace
from flask import Flask, request, send_from_directory
from flask_cors import CORS
from audiocraft.builders import AudioGen #, audio_write
# Number of copies of the prompt sent to the generator in a single call; the
# resulting generations are flattened into one waveform by tts_multi_sentence.
# Kept at 1 because separate generations differ a lot and sound unnatural when
# concatenated (the original author suggests preferring lm.n_draw instead).
NUM_SOUND_GENERATIONS = 1 # they differ a lot and are unnatural to concatenate, prefer lm.n_draw
# Instantiated once at import time, moved to the first CUDA device and put in
# eval mode, so importing this module requires a GPU.
# NOTE(review): `duration` is presumably in seconds - confirm against AudioGen.
sound_generator = AudioGen(duration=.74, device='cuda:0').to('cuda:0').eval()
# The StyleTTS2 style-vector setup below is disabled (kept for reference only).
# ====STYLE VECTOR====
# AFFECTIVE = True
# VOICE = 'en_UK/apope_low' # en_US/m-ailabs_low#mary_ann
# _dir = '/' if AFFECTIVE else '_v2/'
# precomputed_style_vector = msinference.compute_style(
# 'assets/wavs/style_vector' + _dir + VOICE.replace(
# '/', '_').replace(
# '#', '_').replace(
# 'cmu-arctic', 'cmu_arctic').replace(
# '_low', '') + '.wav')
# print('\n STYLE VECTOR \n', precomputed_style_vector.shape)
# ==== STYLE VECTOR
# Directory (relative to the working directory) where serve_wav stores the
# generated wav before sending it; created on import if it does not exist.
CACHE_DIR = 'flask_cache/'
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)
def tts_multi_sentence(scene=None):
    """Generate a sound waveform for a textual scene description.

    Args:
        scene: Text prompt passed to the module-level ``sound_generator``.
            ``None`` or a prompt shorter than 4 characters is dropped.

    Returns:
        A 1-D numpy array. For a usable prompt this is the peak-normalised
        generation, resampled from 16 kHz (the generator's rate) to 24 kHz.
        For a dropped prompt it is 400 zero samples.
    """
    # Guard clause: nothing usable to synthesise from.
    if scene is None or len(scene) < 4:
        print(scene, '\nDrop\n')
        return np.zeros(400)

    print(f'Processing: {scene} ..')
    prompts = [scene] * NUM_SOUND_GENERATIONS
    generated = sound_generator.generate(prompts)
    # Flatten every generation into a single row so they play back to back.
    wav = generated.reshape(1, -1).detach().cpu().numpy()
    # Peak-normalise; the epsilon avoids dividing by zero on silent output.
    peak = np.abs(wav).max()
    wav /= peak + 1e-7
    # The generator output is 16 kHz; bring it to 24 kHz and keep the only row.
    resampled = audresample.resample(wav,
                                     original_rate=16000,
                                     target_rate=24000)
    wav = resampled[0, :]
    print(f'Craft Finished for: {scene}\n\n\n\n____{wav.shape}')

    # Disabled StyleTTS2 speech path, kept for reference:
    # if ('en_US/' in voice) or ('en_UK/' in voice) or (voice is None):
    #     assert precomputed_style_vector is not None, 'For affective TTS, style vector is needed.'
    #     x = []
    #     for _sentence in text:
    #         x.append(msinference.inference(_sentence,
    #                                        precomputed_style_vector,
    #                                        alpha=0.3,
    #                                        beta=0.7,
    #                                        diffusion_steps=7,
    #                                        embedding_scale=1))
    #     x = np.concatenate(x)
    # return overlay(x, sound_background)
    return wav
# The Flask application object; the route decorators below register on it.
app = Flask(__name__)
# Wrap the app with flask_cors using its default options.
# NOTE(review): the defaults presumably allow cross-origin requests from any
# origin on every route - confirm this is intended for the deployment.
cors = CORS(app)
@app.route("/")
def index():
    """Serve the project README rendered as HTML.

    Returns:
        The HTML string produced by ``markdown.markdown`` from the contents
        of ``README.md``, resolved relative to the working directory.

    Raises:
        FileNotFoundError: If ``README.md`` is not in the working directory.
    """
    # Decode explicitly as UTF-8: open()'s default encoding depends on the
    # platform locale, so a README with non-ASCII characters could fail to
    # decode or be garbled on a non-UTF-8 host.
    with open('README.md', 'r', encoding='utf-8') as f:
        return markdown.markdown(f.read())
@app.route("/", methods=['GET', 'POST', 'PUT'])
def serve_wav():
    """Generate a sound for the posted ``scene`` and return it as a wav file.

    Reads the form fields of the current request, synthesises audio for the
    first ``scene`` value with ``tts_multi_sentence``, stores it as
    ``tmp.wav`` inside ``CACHE_DIR`` and sends that file back.

    Returns:
        The Flask response from ``send_from_directory`` with an extra
        ``suffix-file-type`` header holding the output file name.
    """
    # NOTE(review): "/" is also registered by index() for GET; presumably GET
    # requests are answered there and only POST/PUT reach this view - confirm.
    # NOTE(review): every request writes the same tmp.wav, so concurrent
    # requests can overwrite each other's output.

    # https://stackoverflow.com/questions/13522137/in-flask-convert-form-post-object-into-a-representation-suitable-for-mongodb
    # flat=False keeps every field as a list of values.
    r = request.form.to_dict(flat=False)
    # A request without a 'scene' field used to raise TypeError (HTTP 500);
    # fall back to None, which tts_multi_sentence handles as a dropped prompt.
    scene_values = r.get('scene') or [None]
    args = SimpleNamespace(
        text=r.get('text'),  # string not file?
        scene=scene_values[0],
    )
    # print('\n==RECOMPOSED as \n',request.data,request.form,'\n==')
    x = tts_multi_sentence(args.scene)
    OUT_FILE = 'tmp.wav'
    # tts_multi_sentence resamples its output to 24 kHz, so the file must be
    # written with that rate; a 16000 header made playback slow and low.
    soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)
    # send server's output as default file -> srv_result.xx
    print(f'\n=SERVER saved as {OUT_FILE=}\n')
    response = send_from_directory(CACHE_DIR, path=OUT_FILE)
    response.headers['suffix-file-type'] = OUT_FILE
    return response
# Start Flask's built-in server only when this file is executed as a script.
if __name__ == "__main__":
    # host="0.0.0.0" listens on all network interfaces; the port is left at
    # Flask's default.
    app.run(host="0.0.0.0")