Add token authentication from space secrets and initial Piper synthesis code.
Browse files- src/synthesize.py +26 -2
src/synthesize.py
CHANGED
@@ -29,7 +29,7 @@ def synth_mms(text:str, model:str):
|
|
29 |
# raw_response=True)._content
|
30 |
|
31 |
if model is not None:
|
32 |
-
pipe = pipeline("text-to-speech", model=model, device=-1) # Change device if it should use GPU
|
33 |
mms_tts = pipe(text)
|
34 |
return mms_tts['audio'], mms_tts['sampling_rate']
|
35 |
else:
|
@@ -100,7 +100,7 @@ def synth_toucan(text:str, model:str):
|
|
100 |
Returns:
|
101 |
Streaming Wav and sampling rate.
|
102 |
|
103 |
-
|
104 |
'''
|
105 |
client = Client("Flux9665/MassivelyMultilingualTTS")
|
106 |
result = client.predict(
|
@@ -115,4 +115,28 @@ def synth_toucan(text:str, model:str):
|
|
115 |
api_name="/predict"
|
116 |
)
|
117 |
sampling_rate, wav = wavfile.read(result[0])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
118 |
return wav, sampling_rate
|
|
|
29 |
# raw_response=True)._content
|
30 |
|
31 |
if model is not None:
|
32 |
+
pipe = pipeline("text-to-speech", model=model, device=-1, token=os.environ['TOKEN']) # Change device if it should use GPU
|
33 |
mms_tts = pipe(text)
|
34 |
return mms_tts['audio'], mms_tts['sampling_rate']
|
35 |
else:
|
|
|
100 |
Returns:
|
101 |
Streaming Wav and sampling rate.
|
102 |
|
103 |
+
NOTES: (1)This wrapper does not let you explore the full range of options possible with the API. (2) The API should allow you to generate female voices, however, it does not seem to be working at the moment. (3) This uses a Huggingface Gradio Space to compute via the API.
|
104 |
'''
|
105 |
client = Client("Flux9665/MassivelyMultilingualTTS")
|
106 |
result = client.predict(
|
|
|
115 |
api_name="/predict"
|
116 |
)
|
117 |
sampling_rate, wav = wavfile.read(result[0])
|
118 |
+
return wav, sampling_rate
|
119 |
+
|
120 |
+
def synth_piper(text:str, model:str):
    '''
    Use Piper (via the k2-fsa text-to-speech Gradio Space) to synthesize text.

    Inputs:
        text: Text to synthesize.
        model: Model code. NOTE(review): despite the `str` annotation, the body
               indexes model[0] (language) and model[1] (repo_id), so callers
               must pass a 2-element sequence such as ("English", "csukuangfj/...")
               — confirm against call sites.
    Returns:
        Streaming Wav (numpy array) and sampling rate.

    NOTES: (1) This uses a Huggingface Gradio Space to compute via the API,
           so it requires network access and is subject to the Space's
           availability and rate limits.
    '''
    # Remote inference on the public k2-fsa Space; sid selects the speaker
    # (single-speaker models use "0") and speed=1 is normal speaking rate.
    client = Client("k2-fsa/text-to-speech")
    result = client.predict(
        language=model[0],
        repo_id=model[1],
        text=text,
        sid="0",
        speed=1,
        api_name="/process"
    )
    # The Space returns a path to a wav file as result[0]; decode it so the
    # caller gets in-memory audio consistent with the other synth_* helpers.
    sampling_rate, wav = wavfile.read(result[0])
    return wav, sampling_rate