Spaces:
Running
on
L4
Running
on
L4
translate
Browse files
app.py
CHANGED
@@ -59,10 +59,10 @@ from cosyvoice.cli.cosyvoice import CosyVoice2
|
|
59 |
from cosyvoice.utils.file_utils import load_wav, logging
|
60 |
from cosyvoice.utils.common import set_all_random_seed
|
61 |
|
62 |
-
inference_mode_list = ['3s
|
63 |
-
instruct_dict = {'3s
|
64 |
-
'
|
65 |
-
stream_mode_list = [('
|
66 |
max_val = 0.8
|
67 |
|
68 |
|
@@ -107,66 +107,65 @@ def generate_audio(tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload
|
|
107 |
else:
|
108 |
prompt_wav = None
|
109 |
# if instruct mode, please make sure that model is iic/CosyVoice-300M-Instruct and not cross_lingual mode
|
110 |
-
if mode_checkbox_group in ['
|
111 |
if instruct_text == '':
|
112 |
-
gr.Warning('
|
113 |
yield (target_sr, default_data)
|
114 |
if prompt_wav is None:
|
115 |
-
gr.Info('
|
116 |
# if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
|
117 |
-
if mode_checkbox_group in ['
|
118 |
if cosyvoice.frontend.instruct is True:
|
119 |
-
gr.Warning('
|
120 |
yield (target_sr, default_data)
|
121 |
if instruct_text != '':
|
122 |
-
gr.Info('
|
123 |
if prompt_wav is None:
|
124 |
-
gr.Warning('
|
125 |
yield (target_sr, default_data)
|
126 |
-
gr.Info('
|
127 |
# if in zero_shot cross_lingual, please make sure that prompt_text and prompt_wav meets requirements
|
128 |
-
if mode_checkbox_group in ['3s
|
129 |
if prompt_wav is None:
|
130 |
-
gr.Warning('prompt
|
131 |
yield (target_sr, default_data)
|
132 |
if torchaudio.info(prompt_wav).sample_rate < prompt_sr:
|
133 |
-
gr.Warning('prompt
|
134 |
yield (target_sr, default_data)
|
135 |
# sft mode only use sft_dropdown
|
136 |
-
if mode_checkbox_group in ['
|
137 |
if instruct_text != '' or prompt_wav is not None or prompt_text != '':
|
138 |
-
gr.Info('
|
139 |
# zero_shot mode only use prompt_wav prompt text
|
140 |
-
if mode_checkbox_group in ['3s
|
141 |
if prompt_text == '':
|
142 |
-
gr.Warning('prompt
|
143 |
yield (target_sr, default_data)
|
144 |
if instruct_text != '':
|
145 |
-
gr.Info('
|
146 |
info = torchaudio.info(prompt_wav)
|
147 |
if info.num_frames / info.sample_rate > 10:
|
148 |
-
gr.Warning('
|
149 |
yield (target_sr, default_data)
|
150 |
|
151 |
-
if mode_checkbox_group == '
|
152 |
logging.info('get sft inference request')
|
153 |
set_all_random_seed(seed)
|
154 |
for i in cosyvoice.inference_sft(tts_text, sft_dropdown, stream=stream, speed=speed):
|
155 |
yield (target_sr, i['tts_speech'].numpy().flatten())
|
156 |
-
elif mode_checkbox_group == '3s
|
157 |
logging.info('get zero_shot inference request')
|
158 |
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
159 |
set_all_random_seed(seed)
|
160 |
for i in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
|
161 |
yield (target_sr, i['tts_speech'].numpy().flatten())
|
162 |
-
elif mode_checkbox_group == '
|
163 |
logging.info('get cross_lingual inference request')
|
164 |
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
165 |
set_all_random_seed(seed)
|
166 |
for i in cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k, stream=stream, speed=speed):
|
167 |
yield (target_sr, i['tts_speech'].numpy().flatten())
|
168 |
else:
|
169 |
-
logging.info('get instruct inference request')
|
170 |
logging.info('get instruct inference request')
|
171 |
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
172 |
set_all_random_seed(seed)
|
@@ -176,31 +175,31 @@ def generate_audio(tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload
|
|
176 |
|
177 |
def main():
|
178 |
with gr.Blocks() as demo:
|
179 |
-
gr.Markdown("###
|
180 |
-
|
181 |
[CosyVoice-300M](https://www.modelscope.cn/models/iic/CosyVoice-300M) \
|
182 |
[CosyVoice-300M-Instruct](https://www.modelscope.cn/models/iic/CosyVoice-300M-Instruct) \
|
183 |
[CosyVoice-300M-SFT](https://www.modelscope.cn/models/iic/CosyVoice-300M-SFT)")
|
184 |
-
gr.Markdown("####
|
185 |
|
186 |
-
tts_text = gr.Textbox(label="
|
187 |
with gr.Row():
|
188 |
-
mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='
|
189 |
-
instruction_text = gr.Text(label="
|
190 |
-
stream = gr.Radio(choices=stream_mode_list, label='
|
191 |
with gr.Column(scale=0.25):
|
192 |
seed_button = gr.Button(value="\U0001F3B2")
|
193 |
-
seed = gr.Number(value=0, label="
|
194 |
|
195 |
with gr.Row():
|
196 |
-
prompt_wav_upload = gr.Audio(sources='upload', type='filepath', label='
|
197 |
-
prompt_wav_record = gr.Audio(sources='microphone', type='filepath', label='
|
198 |
-
prompt_text = gr.Textbox(label="
|
199 |
-
instruct_text = gr.Textbox(label="
|
200 |
|
201 |
-
generate_button = gr.Button("
|
202 |
|
203 |
-
audio_output = gr.Audio(label="
|
204 |
|
205 |
seed_button.click(generate_seed, inputs=[], outputs=seed)
|
206 |
generate_button.click(generate_audio,
|
|
|
59 |
from cosyvoice.utils.file_utils import load_wav, logging
|
60 |
from cosyvoice.utils.common import set_all_random_seed
|
61 |
|
62 |
+
inference_mode_list = ['3s Speedy Convertion', 'Natural Language Control']
|
63 |
+
instruct_dict = {'3s Speedy Convertion': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input prompt transcription\n3. click \'Speech Synthesis\' button',
|
64 |
+
'Natural Language Control': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input instruct\n3. click \'Speech Synthesis\' button'}
|
65 |
+
stream_mode_list = [('No', False), ('Yes', True)]
|
66 |
max_val = 0.8
|
67 |
|
68 |
|
|
|
107 |
else:
|
108 |
prompt_wav = None
|
109 |
# if instruct mode, please make sure that model is iic/CosyVoice-300M-Instruct and not cross_lingual mode
|
110 |
+
if mode_checkbox_group in ['Natural Language Control']:
|
111 |
if instruct_text == '':
|
112 |
+
gr.Warning('You are using Natural Language Control mode, please input the instruct.')
|
113 |
yield (target_sr, default_data)
|
114 |
if prompt_wav is None:
|
115 |
+
gr.Info('You are using Natural Language Control mode, please upload the prompt audio.')
|
116 |
# if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
|
117 |
+
if mode_checkbox_group in ['Cross-lingual Convertion']:
|
118 |
if cosyvoice.frontend.instruct is True:
|
119 |
+
gr.Warning('You are using the cross-lingual Convertion mode. The {} model does not support this mode. Please use the iic/CosyVoice-300M model.'.format(args.model_dir))
|
120 |
yield (target_sr, default_data)
|
121 |
if instruct_text != '':
|
122 |
+
gr.Info('You are using the cross-lingual Convertion mode. The instruct text will be ignored.')
|
123 |
if prompt_wav is None:
|
124 |
+
gr.Warning('You are using the cross-lingual Convertion mode. Please provide the prompt audio.')
|
125 |
yield (target_sr, default_data)
|
126 |
+
gr.Info('You are using the cross-lingual Convertion mode. Please ensure that the synthesis text and prompt text are in different languages.')
|
127 |
# if in zero_shot cross_lingual, please make sure that prompt_text and prompt_wav meets requirements
|
128 |
+
if mode_checkbox_group in ['3s Speedy Convertion', 'Cross-lingual Convertion']:
|
129 |
if prompt_wav is None:
|
130 |
+
gr.Warning('Empty prompt audio found, please upload or record the prompt audio.')
|
131 |
yield (target_sr, default_data)
|
132 |
if torchaudio.info(prompt_wav).sample_rate < prompt_sr:
|
133 |
+
gr.Warning('prompt wav sample rate {}, lower than {}.'.format(torchaudio.info(prompt_wav).sample_rate, prompt_sr))
|
134 |
yield (target_sr, default_data)
|
135 |
# sft mode only use sft_dropdown
|
136 |
+
if mode_checkbox_group in ['Pretrained Voice']:
|
137 |
if instruct_text != '' or prompt_wav is not None or prompt_text != '':
|
138 |
+
gr.Info('You are using Pretrained Voice mode. Prompt text/prompt audio/instruct text will be ignored.')
|
139 |
# zero_shot mode only use prompt_wav prompt text
|
140 |
+
if mode_checkbox_group in ['3s Speedy Convertion']:
|
141 |
if prompt_text == '':
|
142 |
+
gr.Warning('Empty prompt found, please check the prompt text.')
|
143 |
yield (target_sr, default_data)
|
144 |
if instruct_text != '':
|
145 |
+
gr.Info('You are using 3s Speedy Convertion mode. Pretrained Voice/Instruct will be ignored.')
|
146 |
info = torchaudio.info(prompt_wav)
|
147 |
if info.num_frames / info.sample_rate > 10:
|
148 |
+
gr.Warning('Please use prompt audio shorter than 10s.')
|
149 |
yield (target_sr, default_data)
|
150 |
|
151 |
+
if mode_checkbox_group == 'Pretrained Voice':
|
152 |
logging.info('get sft inference request')
|
153 |
set_all_random_seed(seed)
|
154 |
for i in cosyvoice.inference_sft(tts_text, sft_dropdown, stream=stream, speed=speed):
|
155 |
yield (target_sr, i['tts_speech'].numpy().flatten())
|
156 |
+
elif mode_checkbox_group == '3s Speedy Convertion':
|
157 |
logging.info('get zero_shot inference request')
|
158 |
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
159 |
set_all_random_seed(seed)
|
160 |
for i in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
|
161 |
yield (target_sr, i['tts_speech'].numpy().flatten())
|
162 |
+
elif mode_checkbox_group == 'Cross-lingual Convertion':
|
163 |
logging.info('get cross_lingual inference request')
|
164 |
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
165 |
set_all_random_seed(seed)
|
166 |
for i in cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k, stream=stream, speed=speed):
|
167 |
yield (target_sr, i['tts_speech'].numpy().flatten())
|
168 |
else:
|
|
|
169 |
logging.info('get instruct inference request')
|
170 |
prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
|
171 |
set_all_random_seed(seed)
|
|
|
175 |
|
176 |
def main():
|
177 |
with gr.Blocks() as demo:
|
178 |
+
gr.Markdown("### Repo [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) \
|
179 |
+
Pretrained Model [CosyVoice2-0.5B](https://www.modelscope.cn/models/iic/CosyVoice2-0.5B) \
|
180 |
[CosyVoice-300M](https://www.modelscope.cn/models/iic/CosyVoice-300M) \
|
181 |
[CosyVoice-300M-Instruct](https://www.modelscope.cn/models/iic/CosyVoice-300M-Instruct) \
|
182 |
[CosyVoice-300M-SFT](https://www.modelscope.cn/models/iic/CosyVoice-300M-SFT)")
|
183 |
+
gr.Markdown("#### Please input the text to synthesize, choose inference mode and follow the controlling steps below.")
|
184 |
|
185 |
+
tts_text = gr.Textbox(label="Text to synthesize", lines=1, value="CosyVoice is undergoing a comprehensive upgrade, providing more accurate, stable, faster, and better voice generation capabilities. CosyVoice迎来全面升级,提供更准、更稳、更快、 更好的语音生成能力。")
|
186 |
with gr.Row():
|
187 |
+
mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='Inference Mode', value=inference_mode_list[0])
|
188 |
+
instruction_text = gr.Text(label="Instructions", value=instruct_dict[inference_mode_list[0]], scale=0.5)
|
189 |
+
stream = gr.Radio(choices=stream_mode_list, label='Streaming or not', value=stream_mode_list[0][1])
|
190 |
with gr.Column(scale=0.25):
|
191 |
seed_button = gr.Button(value="\U0001F3B2")
|
192 |
+
seed = gr.Number(value=0, label="Random Seed")
|
193 |
|
194 |
with gr.Row():
|
195 |
+
prompt_wav_upload = gr.Audio(sources='upload', type='filepath', label='Prompt wav file (sample rate >= 16kHz)')
|
196 |
+
prompt_wav_record = gr.Audio(sources='microphone', type='filepath', label='Record prompt from your microphone')
|
197 |
+
prompt_text = gr.Textbox(label="Prompt Transcription", lines=1, placeholder="Prompt transcription (auto ASR, you can correct the recognition results)", value='')
|
198 |
+
instruct_text = gr.Textbox(label="Instruct", lines=1, placeholder="Instruct transcription. e.g. An old sea captain, navigates life's storms with timeless wisdom and a heart of gold.", value='')
|
199 |
|
200 |
+
generate_button = gr.Button("Speech Synthesis")
|
201 |
|
202 |
+
audio_output = gr.Audio(label="Audio Output", autoplay=True, streaming=True)
|
203 |
|
204 |
seed_button.click(generate_seed, inputs=[], outputs=seed)
|
205 |
generate_button.click(generate_audio,
|