# Copyright (c) 2023 Amphion.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

"""Gradio demo for Amphion singing voice conversion (DiffWaveNetSVC).

The page lets a user upload or record a source audio, pick a target singer
and a few conversion options, and then hands the request over to
``inference.main`` through a command-line style argument list.
"""

import os

import gradio as gr

import inference

# Maps the singer label shown in the UI to the singer identifier that is
# passed to the inference entry point via ``--target_singer``.
SUPPORTED_TARGET_SINGERS = {
    "Adele": "vocalist_l1_Adele",
    "Beyonce": "vocalist_l1_Beyonce",
    "Bruno Mars": "vocalist_l1_BrunoMars",
    "John Mayer": "vocalist_l1_JohnMayer",
    "Michael Jackson": "vocalist_l1_MichaelJackson",
    "Taylor Swift": "vocalist_l1_TaylorSwift",
    "Jacky Cheung 张学友": "vocalist_l1_张学友",
    "Jian Li 李健": "vocalist_l1_李健",
    "Feng Wang 汪峰": "vocalist_l1_汪峰",
    "Faye Wong 王菲": "vocalist_l1_王菲",
    "Yijie Shi 石倚洁": "vocalist_l1_石倚洁",
    "Tsai Chin 蔡琴": "vocalist_l1_蔡琴",
    "Ying Na 那英": "vocalist_l1_那英",
    "Eason Chan 陈奕迅": "vocalist_l1_陈奕迅",
    "David Tao 陶喆": "vocalist_l1_陶喆",
}


def svc_inference(
    source_audio_path: str,
    target_singer: str,
    key_shift_mode: str = "Auto Shift",
    key_shift_num: int = 0,
    diffusion_steps: int = 1000,
) -> str:
    """Convert the source audio to the target singer's voice.

    Args:
        source_audio_path: Path of the source audio file on disk.
        target_singer: UI label of the target singer; must be a key of
            ``SUPPORTED_TARGET_SINGERS``.
        key_shift_mode: ``"Auto Shift"`` passes ``autoshift`` as the
            ``--trans_key`` value; any other value passes ``key_shift_num``.
        key_shift_num: Key shift value used when ``key_shift_mode`` is not
            ``"Auto Shift"``.
        diffusion_steps: Value passed as ``--diffusion_inference_steps``.

    Returns:
        The path where the converted audio is expected:
        ``result/<audio_name>/<audio_name>_<singer_id>.wav``.

    Raises:
        gr.Error: If no source audio was provided or the target singer is
            not one of the supported ones.
    """
    # Fail with a readable message instead of an AttributeError/KeyError.
    # The Clear button resets the singer radio to an empty value, so both
    # situations are reachable from the UI.
    if not source_audio_path:
        raise gr.Error("Please upload or record a source audio first.")
    if target_singer not in SUPPORTED_TARGET_SINGERS:
        raise gr.Error("Please choose a supported target singer.")

    #### Prepare source audio file ####
    print("source_audio_path: {}".format(source_audio_path))
    audio_file = os.path.basename(source_audio_path)
    # Everything before the first dot. NOTE(review): this presumably mirrors
    # how the inference module names its output folder/file -- confirm before
    # switching to os.path.splitext.
    audio_name = audio_file.split(".")[0]
    # Take the real parent directory rather than stripping the file name with
    # str.replace(), which would also mangle a directory whose name contains
    # the file name. Joining with "" keeps the trailing separator that was
    # passed to the inference entry point before.
    source_audio_dir = os.path.join(os.path.dirname(source_audio_path), "")

    ### Target Singer ###
    singer_id = SUPPORTED_TARGET_SINGERS[target_singer]

    ### Inference ###
    key_shift = "autoshift" if key_shift_mode == "Auto Shift" else key_shift_num
    output_dir = "result"

    args_list = [
        "--config",
        "ckpts/svc/vocalist_l1_contentvec+whisper/args.json",
        "--acoustics_dir",
        "ckpts/svc/vocalist_l1_contentvec+whisper",
        "--vocoder_dir",
        "pretrained/bigvgan",
        "--target_singer",
        singer_id,
        "--trans_key",
        str(key_shift),
        "--diffusion_inference_steps",
        str(diffusion_steps),
        "--source",
        source_audio_dir,
        "--output_dir",
        output_dir,
        "--log_level",
        "debug",
    ]

    os.environ["WORK_DIR"] = "./"
    inference.main(args_list)

    ### Display ###
    # Build the path from components; the result is the same
    # "result/<name>/<name>_<singer>.wav" string on POSIX systems.
    result_file = os.path.join(
        output_dir, audio_name, "{}_{}.wav".format(audio_name, singer_id)
    )
    return result_file


with gr.Blocks() as demo:
    gr.Markdown(
        """
        # Amphion Singing Voice Conversion: *DiffWaveNetSVC*

        [![arXiv](https://img.shields.io/badge/arXiv-Paper-.svg)](https://arxiv.org/abs/2310.11160)

        This demo provides an Amphion [DiffWaveNetSVC](https://github.com/open-mmlab/Amphion/tree/main/egs/svc/MultipleContentsSVC) pretrained model for you to play. The training data has been detailed [here](https://huggingface.co/amphion/singing_voice_conversion).
        """
    )

    gr.Markdown(
        """
        ## Source Audio
        **Hint**: We recommend using dry vocals (e.g., studio recordings or source-separated voices from music) as the input for this demo. At the bottom of this page, we provide some examples for your reference.
        """
    )
    source_audio_input = gr.Audio(
        sources=["upload", "microphone"],
        label="Source Audio",
        type="filepath",
    )

    with gr.Row():
        with gr.Column():
            config_target_singer = gr.Radio(
                choices=list(SUPPORTED_TARGET_SINGERS.keys()),
                label="Target Singer",
                value="Jian Li 李健",
            )
            config_keyshift_choice = gr.Radio(
                choices=["Auto Shift", "Key Shift"],
                value="Auto Shift",
                label="Pitch Shift Control",
                info='If you want to control the specific pitch shift value, you need to choose "Key Shift"',
            )

        # gr.Markdown("## Conversion Configurations")
        with gr.Column():
            config_keyshift_value = gr.Slider(
                -6,
                6,
                value=0,
                step=1,
                label="Key Shift Values",
                info='How many semitones you want to transpose. This parameter will work only if you choose "Key Shift"',
            )
            config_diff_infer_steps = gr.Slider(
                1,
                1000,
                value=1000,
                step=1,
                label="Diffusion Inference Steps",
                info="As the step number increases, the synthesis quality will be better while the inference speed will be lower",
            )

    # Keep the clear button under its own name; `btn` stays bound to the
    # Submit button, as it was after the original rebinding.
    clear_btn = gr.ClearButton(
        components=[
            config_target_singer,
            config_keyshift_choice,
            config_keyshift_value,
            config_diff_infer_steps,
        ]
    )
    btn = gr.Button(value="Submit", variant="primary")

    gr.Markdown("## Conversion Result")
    demo_outputs = gr.Audio(label="Conversion Result")

    btn.click(
        fn=svc_inference,
        inputs=[
            source_audio_input,
            config_target_singer,
            config_keyshift_choice,
            config_keyshift_value,
            config_diff_infer_steps,
        ],
        outputs=demo_outputs,
    )

    gr.Markdown("## Examples")
    # Each example row lists values in the same order as `inputs` below; the
    # last entry is a pre-computed conversion shown in the result player.
    gr.Examples(
        examples=[
            [
                "examples/chinese_female_recordings.wav",
                "John Mayer",
                "Auto Shift",
                1000,
                "examples/output/chinese_female_recordings_vocalist_l1_JohnMayer.wav",
            ],
            [
                "examples/chinese_male_seperated.wav",
                "Taylor Swift",
                "Auto Shift",
                1000,
                "examples/output/chinese_male_seperated_vocalist_l1_TaylorSwift.wav",
            ],
            [
                "examples/english_female_seperated.wav",
                "Feng Wang 汪峰",
                "Auto Shift",
                1000,
                "examples/output/english_female_seperated_vocalist_l1_汪峰.wav",
            ],
            [
                "examples/english_male_recordings.wav",
                "Yijie Shi 石倚洁",
                "Auto Shift",
                1000,
                "examples/output/english_male_recordings_vocalist_l1_石倚洁.wav",
            ],
        ],
        inputs=[
            source_audio_input,
            config_target_singer,
            config_keyshift_choice,
            config_diff_infer_steps,
            demo_outputs,
        ],
    )


if __name__ == "__main__":
    demo.launch()