File size: 3,640 Bytes
6e78f43
 
 
 
 
 
c4effd2
6e78f43
 
 
 
 
 
 
 
6ab316b
3ac9f34
6ab316b
c4effd2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6e78f43
 
 
 
 
c4effd2
 
 
2d0e2b6
 
3ac9f34
 
 
 
6e78f43
2d0e2b6
2e6c69d
 
1766442
2e6c69d
 
 
 
2d0e2b6
6e78f43
2d0e2b6
 
6e78f43
c4effd2
 
 
 
 
 
 
 
 
 
 
 
6e78f43
c4effd2
6e78f43
2d0e2b6
6e78f43
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c4effd2
2d0e2b6
6e78f43
2d0e2b6
6e78f43
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136

# -*- coding: utf-8 -*-
import numpy as np
import soundfile
import audresample
import text_utils

import re
import subprocess
import markdown
import json
from pathlib import Path
from types import SimpleNamespace
from flask import Flask, request, send_from_directory
from flask_cors import CORS
from audiocraft.builders import AudioGen #, audio_write
# Number of copies of the prompt sent to the generator per request; the outputs
# are flattened into one signal in tts_multi_sentence. Kept at 1 because
# separate generations differ a lot and sound unnatural when concatenated
# (prefer lm.n_draw instead).
NUM_SOUND_GENERATIONS = 1
# Single generator instance created at import time on GPU 'cuda:0' and put in
# eval mode; it is reused by every request.
# NOTE(review): `duration=.74` is presumably the clip length in seconds - confirm
# against audiocraft.builders.AudioGen.
sound_generator = AudioGen(duration=.74, device='cuda:0').to('cuda:0').eval()

    
# ====STYLE VECTOR====



# AFFECTIVE = True
# VOICE = 'en_UK/apope_low'  #  	en_US/m-ailabs_low#mary_ann

# _dir = '/' if AFFECTIVE else '_v2/'
# precomputed_style_vector = msinference.compute_style(
#     'assets/wavs/style_vector' + _dir + VOICE.replace(
#         '/', '_').replace(
#         '#', '_').replace(
#         'cmu-arctic', 'cmu_arctic').replace(
#         '_low', '') + '.wav')
# print('\n  STYLE VECTOR \n', precomputed_style_vector.shape)


# ==== STYLE VECTOR 

# Directory (relative to the working directory) where generated audio is
# written before being sent back to the client.
CACHE_DIR = 'flask_cache/'
# Create the directory at import time; no error if it already exists.
Path(CACHE_DIR).mkdir(parents=True, exist_ok=True)




def tts_multi_sentence(scene=None):
    """Generate a sound clip for ``scene`` with the module-level AudioGen model.

    Args:
        scene: Prompt handed to ``sound_generator.generate`` (presumably a
            text description of the desired sound). ``None`` or anything
            shorter than 4 characters is dropped.

    Returns:
        A 1-D numpy array. For an accepted prompt this is the generated
        signal, peak-normalised and resampled from 16 kHz to 24 kHz; for a
        dropped prompt it is 400 zeros.
    """
    # Guard clause: nothing usable to synthesise, return a short silent buffer.
    if scene is None or len(scene) < 4:
        print(scene, '\nDrop\n')
        return np.zeros(400)

    print(f'Processing: {scene} ..')

    # Ask for NUM_SOUND_GENERATIONS variants of the same prompt and flatten
    # them into a single row so they play back to back.
    prompts = [scene] * NUM_SOUND_GENERATIONS
    generated = sound_generator.generate(prompts)
    x = generated.reshape(1, -1).detach().cpu().numpy()

    # Peak-normalise; the epsilon avoids dividing by zero on silent output.
    x /= np.abs(x).max() + 1e-7

    # AudioGen produces 16 kHz audio; bring it to 24 kHz and drop the
    # leading singleton axis.
    resampled = audresample.resample(x, original_rate=16000, target_rate=24000)
    x = resampled[0, :]

    print(f'Craft Finished for: {scene}\n\n\n\n____{x.shape}')

    # NOTE: the StyleTTS2 speech path that used to live here is disabled;
    # only the sound-scene generation above is active.
    return x
    
    




# The Flask application that the route decorators below register on.
app = Flask(__name__)
# Wrap the app with flask_cors using its default settings.
# NOTE(review): defaults presumably allow cross-origin requests from any
# origin - confirm this is intended for the deployment.
cors = CORS(app)


@app.route("/")
def index():
    """Return the project's README.md rendered from Markdown to HTML."""
    # Path is resolved relative to the process working directory.
    readme_source = Path('README.md').read_text()
    return markdown.markdown(readme_source)


@app.route("/", methods=['GET', 'POST', 'PUT'])
def serve_wav():
    """Generate audio for the submitted ``scene`` and return it as a WAV file.

    Reads the form-encoded request, takes the first value of its ``scene``
    field, passes it to ``tts_multi_sentence`` and writes the result to
    ``CACHE_DIR + 'tmp.wav'`` before sending that file back.

    Returns:
        The Flask response for the saved file, with an additional
        ``suffix-file-type`` header carrying the file name.
    """
    # https://stackoverflow.com/questions/13522137/in-flask-convert-form-post-
    #                      object-into-a-representation-suitable-for-mongodb
    # flat=False -> every form field maps to a list of its submitted values.
    r = request.form.to_dict(flat=False)

    # A request without a `scene` field must not crash on `None[0]`; fall back
    # to None, which tts_multi_sentence turns into a short silent buffer.
    scene = (r.get('scene') or [None])[0]

    x = tts_multi_sentence(scene)

    OUT_FILE = 'tmp.wav'
    # tts_multi_sentence resamples its output from 16 kHz to 24 kHz, so the
    # WAV header has to declare 24 kHz; writing 16000 here made playback
    # slower and lower-pitched than the generated signal.
    soundfile.write(CACHE_DIR + OUT_FILE, x, 24000)

    # send server's output as default file -> srv_result.xx
    print(f'\n=SERVER saved as {OUT_FILE=}\n')
    response = send_from_directory(CACHE_DIR, path=OUT_FILE)
    response.headers['suffix-file-type'] = OUT_FILE
    return response


# Start Flask's built-in server only when this file is executed directly.
if __name__ == "__main__":
    # host="0.0.0.0" listens on all network interfaces; no port is passed, so
    # Flask's default port is used.
    app.run(host="0.0.0.0")