import os
import sys
import torch
import gradio as gr
from pydub import AudioSegment
import mimetypes

sys.path.append('./Amphion')
import Amphion.models.vc.vevo.vevo_utils as vevo_utils
from huggingface_hub import snapshot_download
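
# Gradio demo for Vevo voice conversion (Amphion). The script downloads the
# Vevo checkpoints from the "amphion/Vevo" Hugging Face repo, builds a
# VevoInferencePipeline, and exposes three inference modes through a web UI:
# "voice" (style + timbre conversion), "timbre" (timbre-only conversion),
# and "tts" (text-to-speech with voice cloning).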

def load_model():
    print("Loading model...")
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    print(f"Using device: {device}")

    cache_dir = "./ckpts/Vevo"
    os.makedirs(cache_dir, exist_ok=True)

    # Content Tokenizer
    local_dir = snapshot_download(
        repo_id="amphion/Vevo",
        repo_type="model",
        cache_dir=cache_dir,
        allow_patterns=["tokenizer/vq32/*"],
    )
    content_tokenizer_ckpt_path = os.path.join(
        local_dir, "tokenizer/vq32/hubert_large_l18_c32.pkl"
    )

    # Content-Style Tokenizer
    local_dir = snapshot_download(
        repo_id="amphion/Vevo",
        repo_type="model",
        cache_dir=cache_dir,
        allow_patterns=["tokenizer/vq8192/*"],
    )
    content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")

    # Autoregressive Transformer
    local_dir = snapshot_download(
        repo_id="amphion/Vevo",
        repo_type="model",
        cache_dir=cache_dir,
        allow_patterns=["contentstyle_modeling/Vq32ToVq8192/*"],
    )
    ar_cfg_path = "./config/Vq32ToVq8192.json"
    ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/Vq32ToVq8192")

    # Flow Matching Transformer
    local_dir = snapshot_download(
        repo_id="amphion/Vevo",
        repo_type="model",
        cache_dir=cache_dir,
        allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
    )
    fmt_cfg_path = "./config/Vq8192ToMels.json"
    fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")

    # Vocoder
    local_dir = snapshot_download(
        repo_id="amphion/Vevo",
        repo_type="model",
        cache_dir=cache_dir,
        allow_patterns=["acoustic_modeling/Vocoder/*"],
    )
    vocoder_cfg_path = "./Amphion/models/vc/vevo/config/Vocoder.json"
    vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")

    print("Initializing pipeline...")
    pipeline = vevo_utils.VevoInferencePipeline(
        content_tokenizer_ckpt_path=content_tokenizer_ckpt_path,
        content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
        ar_cfg_path=ar_cfg_path,
        ar_ckpt_path=ar_ckpt_path,
        fmt_cfg_path=fmt_cfg_path,
        fmt_ckpt_path=fmt_ckpt_path,
        vocoder_cfg_path=vocoder_cfg_path,
        vocoder_ckpt_path=vocoder_ckpt_path,
        device=device,
    )
    print("Model loaded successfully!")
    return pipeline

def convert_to_wav(audio_path):
    if audio_path is None:
        return None
    mime, _ = mimetypes.guess_type(audio_path)
    if mime == 'audio/wav' or mime == 'audio/x-wav':
        return audio_path
    elif mime == 'audio/mpeg':
        seg = AudioSegment.from_mp3(audio_path)
        wav_path = audio_path.rsplit('.', 1)[0] + '.wav'
        seg.export(wav_path, format="wav")
        return wav_path
    else:
        raise ValueError(f"Unsupported audio format: {mime}")
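
# Note: pydub decodes MP3 through an external ffmpeg (or libav) binary, so that
# binary must be available on the system for AudioSegment.from_mp3 to succeed.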

def process_audio(mode, content_audio, ref_style_audio, ref_timbre_audio,
                  src_text, ref_text, src_language, ref_language, steps,
                  progress=gr.Progress()):
    try:
        output_dir = "outputs"
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, "output.wav")

        # Convert uploaded audio files to WAV if needed
        if content_audio:
            content_path = convert_to_wav(content_audio)
        else:
            content_path = None
        if ref_style_audio:
            ref_style_path = convert_to_wav(ref_style_audio)
        else:
            ref_style_path = None
        if ref_timbre_audio:
            ref_timbre_path = convert_to_wav(ref_timbre_audio)
        else:
            ref_timbre_path = None

        progress(0.2, "Processing audio...")

        # Run inference based on mode
        if mode == 'voice':
            if not all([content_path, ref_style_path, ref_timbre_path]):
                raise gr.Error("Voice mode requires all audio inputs")
            gen_audio = inference_pipeline.inference_ar_and_fm(
                src_wav_path=content_path,
                src_text=None,
                style_ref_wav_path=ref_style_path,
                timbre_ref_wav_path=ref_timbre_path,
                flow_matching_steps=steps,
            )
        elif mode == 'timbre':
            if not all([content_path, ref_timbre_path]):
                raise gr.Error("Timbre mode requires source and timbre reference audio")
            gen_audio = inference_pipeline.inference_fm(
                src_wav_path=content_path,
                timbre_ref_wav_path=ref_timbre_path,
                flow_matching_steps=steps,
            )
        elif mode == 'tts':
            if not all([ref_style_path, ref_timbre_path]) or not src_text:
                raise gr.Error("TTS mode requires style audio, timbre audio, and source text")
            gen_audio = inference_pipeline.inference_ar_and_fm(
                src_wav_path=None,
                src_text=src_text,
                style_ref_wav_path=ref_style_path,
                timbre_ref_wav_path=ref_timbre_path,
                style_ref_wav_text=ref_text if ref_text else None,
                src_text_language=src_language,
                style_ref_wav_text_language=ref_language,
            )
        else:
            raise gr.Error(f"Unknown inference mode: {mode}")

        progress(0.8, "Saving generated audio...")

        # Save and return the generated audio
        vevo_utils.save_audio(gen_audio, target_sample_rate=48000, output_path=output_path)
        return output_path
    except Exception as e:
        raise gr.Error(str(e))

# Initialize the model
inference_pipeline = load_model()
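
# A minimal sketch of calling the pipeline directly, without the Gradio UI.
# The file paths are placeholders (assumed to exist locally); the calls mirror
# the "timbre" branch of process_audio above.
#
#   gen = inference_pipeline.inference_fm(
#       src_wav_path="./examples/source.wav",          # hypothetical input
#       timbre_ref_wav_path="./examples/timbre.wav",   # hypothetical reference
#       flow_matching_steps=32,
#   )
#   vevo_utils.save_audio(gen, target_sample_rate=48000, output_path="./examples/out.wav")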

# Create the Gradio interface
with gr.Blocks(title="Vevo Voice Conversion") as demo:
    gr.Markdown("# Vevo Voice Conversion")

    with gr.Row():
        mode = gr.Radio(
            choices=["voice", "timbre", "tts"],
            value="timbre",
            label="Inference Mode",
            interactive=True,
        )

    with gr.Tabs():
        with gr.TabItem("Audio Inputs"):
            content_audio = gr.Audio(
                label="Source Audio",
                type="filepath",
                interactive=True,
            )
            ref_style_audio = gr.Audio(
                label="Reference Style Audio",
                type="filepath",
                interactive=True,
            )
            ref_timbre_audio = gr.Audio(
                label="Reference Timbre Audio",
                type="filepath",
                interactive=True,
            )
        with gr.TabItem("Text Inputs (TTS Mode)"):
            src_text = gr.Textbox(
                label="Source Text",
                placeholder="Enter text for TTS mode",
                interactive=True,
            )
            ref_text = gr.Textbox(
                label="Reference Style Text (Optional)",
                placeholder="Enter reference text",
                interactive=True,
            )
            with gr.Row():
                src_language = gr.Dropdown(
                    choices=["en", "zh"],
                    value="en",
                    label="Source Language",
                    interactive=True,
                )
                ref_language = gr.Dropdown(
                    choices=["en", "zh"],
                    value="en",
                    label="Reference Language",
                    interactive=True,
                )

    with gr.Row():
        steps = gr.Slider(
            minimum=1,
            maximum=64,
            value=32,
            step=1,
            label="Flow Matching Steps",
        )

    with gr.Row():
        with gr.Column():
            submit_btn = gr.Button("Generate")
            error_box = gr.Textbox(label="Status", interactive=False)
            output_audio = gr.Audio(label="Generated Audio")

    def process_with_error_handling(*args):
        try:
            result = process_audio(*args)
            return [result, "Success!"]
        except Exception as e:
            error_msg = str(e)
            return [None, error_msg]

    submit_btn.click(
        fn=process_with_error_handling,
        inputs=[
            mode,
            content_audio,
            ref_style_audio,
            ref_timbre_audio,
            src_text,
            ref_text,
            src_language,
            ref_language,
            steps,
        ],
        outputs=[output_audio, error_box],
    )

    # Example usage text
    gr.Markdown("""
## Quick Start Guide

1. Select your mode:
   - **Voice**: Full voice conversion (style + timbre)
   - **Timbre**: Timbre-only conversion
   - **TTS**: Text-to-speech with voice cloning
2. For Voice/Timbre modes:
   - Upload the source audio (what you want to convert)
   - Upload the reference audio(s)
3. For TTS mode:
   - Enter your text
   - Select the language
   - Upload the reference audio(s)
4. Adjust the steps slider:
   - Higher values = better quality but slower
   - Lower values = faster but lower quality
5. Click Generate and wait for processing
""")
if __name__ == "__main__":
    demo.queue().launch()