Spaces:

FunAudioLLM
/

CosyVoice2-0.5B

Running on L4

App Files Files Community

R1ckShi committed on Dec 17, 2024

Commit

da0a385

verified ·

1 Parent(s): d4d1fbd

translate

Browse files

Files changed (1) hide show

app.py +38 -39

app.py CHANGED Viewed

@@ -59,10 +59,10 @@ from cosyvoice.cli.cosyvoice import CosyVoice2
 from cosyvoice.utils.file_utils import load_wav, logging
 from cosyvoice.utils.common import set_all_random_seed
-inference_mode_list = ['3s极速复刻', '自然语言控制']
-instruct_dict = {'3s极速复刻': '1. 选择prompt音频文件，或录入prompt音频，注意不超过30s，若同时提供，优先选择prompt音频文件\n2. 输入prompt文本\n3. 点击生成音频按钮',
-                 '自然语言控制': '1. 选择prompt音频文件，或录入prompt音频，注意不超过30s，若同时提供，优先选择prompt音频文件\n2. 输入instruct文本\n3. 点击生成音频按钮'}
-stream_mode_list = [('否', False), ('是', True)]
 max_val = 0.8
@@ -107,66 +107,65 @@ def generate_audio(tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload
     else:
         prompt_wav = None
     # if instruct mode, please make sure that model is iic/CosyVoice-300M-Instruct and not cross_lingual mode
-    if mode_checkbox_group in ['自然语言控制']:
         if instruct_text == '':
-            gr.Warning('您正在使用自然语言控制模式, 请输入instruct文本')
             yield (target_sr, default_data)
         if prompt_wav is None:
-            gr.Info('您正在使用自然语言控制模式, 请输入prompt音频')
     # if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
-    if mode_checkbox_group in ['跨语种复刻']:
         if cosyvoice.frontend.instruct is True:
-            gr.Warning('您正在使用跨语种复刻模式, {}模型不支持此模式, 请使用iic/CosyVoice-300M模型'.format(args.model_dir))
             yield (target_sr, default_data)
         if instruct_text != '':
-            gr.Info('您正在使用跨语种复刻模式, instruct文本会被忽略')
         if prompt_wav is None:
-            gr.Warning('您正在使用跨语种复刻模式, 请提供prompt音频')
             yield (target_sr, default_data)
-        gr.Info('您正在使用跨语种复刻模式, 请确保合成文本和prompt文本为不同语言')
     # if in zero_shot cross_lingual, please make sure that prompt_text and prompt_wav meets requirements
-    if mode_checkbox_group in ['3s极速复刻', '跨语种复刻']:
         if prompt_wav is None:
-            gr.Warning('prompt音频为空，您是否忘记输入prompt音频？')
             yield (target_sr, default_data)
         if torchaudio.info(prompt_wav).sample_rate < prompt_sr:
-            gr.Warning('prompt音频采样率{}低于{}'.format(torchaudio.info(prompt_wav).sample_rate, prompt_sr))
             yield (target_sr, default_data)
     # sft mode only use sft_dropdown
-    if mode_checkbox_group in ['预训练音色']:
         if instruct_text != '' or prompt_wav is not None or prompt_text != '':
-            gr.Info('您正在使用预训练音色模式，prompt文本/prompt音频/instruct文本会被忽略！')
     # zero_shot mode only use prompt_wav prompt text
-    if mode_checkbox_group in ['3s极速复刻']:
         if prompt_text == '':
-            gr.Warning('prompt文本为空，您是否忘记输入prompt文本？')
             yield (target_sr, default_data)
         if instruct_text != '':
-            gr.Info('您正在使用3s极速复刻模式，预训练音色/instruct文本会被忽略！')
         info = torchaudio.info(prompt_wav)
         if info.num_frames / info.sample_rate > 10:
-            gr.Warning('请限制输入音频在10s内，避免推理效果过低')
             yield (target_sr, default_data)
-    if mode_checkbox_group == '预训练音色':
         logging.info('get sft inference request')
         set_all_random_seed(seed)
         for i in cosyvoice.inference_sft(tts_text, sft_dropdown, stream=stream, speed=speed):
             yield (target_sr, i['tts_speech'].numpy().flatten())
-    elif mode_checkbox_group == '3s极速复刻':
         logging.info('get zero_shot inference request')
         prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
         set_all_random_seed(seed)
         for i in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
             yield (target_sr, i['tts_speech'].numpy().flatten())
-    elif mode_checkbox_group == '跨语种复刻':
         logging.info('get cross_lingual inference request')
         prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
         set_all_random_seed(seed)
         for i in cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k, stream=stream, speed=speed):
             yield (target_sr, i['tts_speech'].numpy().flatten())
     else:
-        logging.info('get instruct inference request')
         logging.info('get instruct inference request')
         prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
         set_all_random_seed(seed)
@@ -176,31 +175,31 @@ def generate_audio(tts_text, mode_checkbox_group, prompt_text, prompt_wav_upload
 def main():
     with gr.Blocks() as demo:
-        gr.Markdown("### 代码库 [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) \
-                    预训练模型 [CosyVoice2-0.5B](https://www.modelscope.cn/models/iic/CosyVoice2-0.5B) \
                     [CosyVoice-300M](https://www.modelscope.cn/models/iic/CosyVoice-300M) \
                     [CosyVoice-300M-Instruct](https://www.modelscope.cn/models/iic/CosyVoice-300M-Instruct) \
                     [CosyVoice-300M-SFT](https://www.modelscope.cn/models/iic/CosyVoice-300M-SFT)")
-        gr.Markdown("#### 请输入需要合成的文本，选择推理模式，并按照提示步骤进行操作")
-        tts_text = gr.Textbox(label="输入合成文本", lines=1, value="CosyVoice迎来全面升级，提供更准、更稳、更快、 更好的语音生成能力。CosyVoice is undergoing a comprehensive upgrade, providing more accurate, stable, faster, and better voice generation capabilities.")
         with gr.Row():
-            mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='选择推理模式', value=inference_mode_list[0])
-            instruction_text = gr.Text(label="操作步骤", value=instruct_dict[inference_mode_list[0]], scale=0.5)
-            stream = gr.Radio(choices=stream_mode_list, label='是否流式推理', value=stream_mode_list[0][1])
             with gr.Column(scale=0.25):
                 seed_button = gr.Button(value="\U0001F3B2")
-                seed = gr.Number(value=0, label="随机推理种子")
         with gr.Row():
-            prompt_wav_upload = gr.Audio(sources='upload', type='filepath', label='选择prompt音频文件，注意采样率不低于16khz')
-            prompt_wav_record = gr.Audio(sources='microphone', type='filepath', label='录制prompt音频文件')
-        prompt_text = gr.Textbox(label="prompt文本", lines=1, placeholder="请输入prompt文本，支持自动识别，您可以自行修正识别结果...", value='')
-        instruct_text = gr.Textbox(label="输入instruct文本", lines=1, placeholder="请输入instruct文本.例如:用四川话说这句话。", value='')
-        generate_button = gr.Button("生成音频")
-        audio_output = gr.Audio(label="合成音频", autoplay=True, streaming=True)
         seed_button.click(generate_seed, inputs=[], outputs=seed)
         generate_button.click(generate_audio,

 from cosyvoice.utils.file_utils import load_wav, logging
 from cosyvoice.utils.common import set_all_random_seed
+inference_mode_list = ['3s Speedy Convertion', 'Natural Language Control']
+instruct_dict = {'3s Speedy Convertion': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input prompt transcription\n3. click \'Speech Synthesis\' button',
+                 'Natural Language Control': '1. Upload prompt wav file (or record from mic), no longer than 30s, wav file will be used if provided at the same time\n2. Input instruct\n3. click \'Speech Synthesis\' button'}
+stream_mode_list = [('No', False), ('Yes', True)]
 max_val = 0.8
     else:
         prompt_wav = None
     # if instruct mode, please make sure that model is iic/CosyVoice-300M-Instruct and not cross_lingual mode
+    if mode_checkbox_group in ['Natural Language Control']:
         if instruct_text == '':
+            gr.Warning('You are using Natural Language Control mode, please input the instruct.')
             yield (target_sr, default_data)
         if prompt_wav is None:
+            gr.Info('You are using Natural Language Control mode, please upload the prompt audio.')
     # if cross_lingual mode, please make sure that model is iic/CosyVoice-300M and tts_text prompt_text are different language
+    if mode_checkbox_group in ['Cross-lingual Convertion']:
         if cosyvoice.frontend.instruct is True:
+            gr.Warning('You are using the cross-lingual Convertion mode. The {} model does not support this mode. Please use the iic/CosyVoice-300M model.'.format(args.model_dir))
             yield (target_sr, default_data)
         if instruct_text != '':
+            gr.Info('You are using the cross-lingual Convertion mode. The instruct text will be ignored.')
         if prompt_wav is None:
+            gr.Warning('You are using the cross-lingual Convertion mode. Please provide the prompt audio.')
             yield (target_sr, default_data)
+        gr.Info('You are using the cross-lingual Convertion mode. Please ensure that the synthesis text and prompt text are in different languages.')
     # if in zero_shot cross_lingual, please make sure that prompt_text and prompt_wav meets requirements
+    if mode_checkbox_group in ['3s Speedy Convertion', 'Cross-lingual Convertion']:
         if prompt_wav is None:
+            gr.Warning('Empty prompt found, please check the prompt text.')
             yield (target_sr, default_data)
         if torchaudio.info(prompt_wav).sample_rate < prompt_sr:
+            gr.Warning('prompt wav sample rate {}, lower than {}.'.format(torchaudio.info(prompt_wav).sample_rate, prompt_sr))
             yield (target_sr, default_data)
     # sft mode only use sft_dropdown
+    if mode_checkbox_group in ['Pretrained Voice']:
         if instruct_text != '' or prompt_wav is not None or prompt_text != '':
+            gr.Info('You are using Pretrained Voice mode. Pretrained Voice/Instruct will be ingnored.')
     # zero_shot mode only use prompt_wav prompt text
+    if mode_checkbox_group in ['3s Speedy Convertion']:
         if prompt_text == '':
+            gr.Warning('Empty prompt found, please check the prompt text.')
             yield (target_sr, default_data)
         if instruct_text != '':
+            gr.Info('You are using 3s Speedy Convertion mode. Pretrained Voice/Instruct will be ingnored.')
         info = torchaudio.info(prompt_wav)
         if info.num_frames / info.sample_rate > 10:
+            gr.Warning('Please use prompt audio shorter than 10s.')
             yield (target_sr, default_data)
+    if mode_checkbox_group == 'Pretrained Voice':
         logging.info('get sft inference request')
         set_all_random_seed(seed)
         for i in cosyvoice.inference_sft(tts_text, sft_dropdown, stream=stream, speed=speed):
             yield (target_sr, i['tts_speech'].numpy().flatten())
+    elif mode_checkbox_group == '3s Speedy Convertion':
         logging.info('get zero_shot inference request')
         prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
         set_all_random_seed(seed)
         for i in cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k, stream=stream, speed=speed):
             yield (target_sr, i['tts_speech'].numpy().flatten())
+    elif mode_checkbox_group == 'Cross-lingual Convertion':
         logging.info('get cross_lingual inference request')
         prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
         set_all_random_seed(seed)
         for i in cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k, stream=stream, speed=speed):
             yield (target_sr, i['tts_speech'].numpy().flatten())
     else:
         logging.info('get instruct inference request')
         prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
         set_all_random_seed(seed)
 def main():
     with gr.Blocks() as demo:
+        gr.Markdown("### Repo [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) \
+                    Pretrained Model [CosyVoice2-0.5B](https://www.modelscope.cn/models/iic/CosyVoice2-0.5B) \
                     [CosyVoice-300M](https://www.modelscope.cn/models/iic/CosyVoice-300M) \
                     [CosyVoice-300M-Instruct](https://www.modelscope.cn/models/iic/CosyVoice-300M-Instruct) \
                     [CosyVoice-300M-SFT](https://www.modelscope.cn/models/iic/CosyVoice-300M-SFT)")
+        gr.Markdown("#### Please input the text to synthesize, choose inference mode and follow the controlling steps below.")
+        tts_text = gr.Textbox(label="Text to synthesize", lines=1, value="CosyVoice is undergoing a comprehensive upgrade, providing more accurate, stable, faster, and better voice generation capabilities. CosyVoice迎来全面升级，提供更准、更稳、更快、 更好的语音生成能力。")
         with gr.Row():
+            mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='Inference Mode', value=inference_mode_list[0])
+            instruction_text = gr.Text(label="Instructions", value=instruct_dict[inference_mode_list[0]], scale=0.5)
+            stream = gr.Radio(choices=stream_mode_list, label='Streaming or not', value=stream_mode_list[0][1])
             with gr.Column(scale=0.25):
                 seed_button = gr.Button(value="\U0001F3B2")
+                seed = gr.Number(value=0, label="Random Seed")
         with gr.Row():
+            prompt_wav_upload = gr.Audio(sources='upload', type='filepath', label='Prompt wav file (sample rate >= 16kHz)')
+            prompt_wav_record = gr.Audio(sources='microphone', type='filepath', label='Record prompt from your microphone')
+        prompt_text = gr.Textbox(label="Prompt Transcription", lines=1, placeholder="Prompt transcription (auto ASR, you can correct the recognition results)", value='')
+        instruct_text = gr.Textbox(label="Instruct", lines=1, placeholder="Instruct transcription. e.g. A old sea captain, navigates life's storms with timeless wisdom and a heart of gold.", value='')
+        generate_button = gr.Button("Speech Synthesis")
+        audio_output = gr.Audio(label="Audio Output", autoplay=True, streaming=True)
         seed_button.click(generate_seed, inputs=[], outputs=seed)
         generate_button.click(generate_audio,