import spaces  # Import spaces first to avoid CUDA initialization issues
import gradio as gr
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
import soundfile as sf
import tempfile

# Load the model and tokenizers once at startup (kept on CPU until a request arrives)
print("Loading model and tokenizers...")
model = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts").to("cpu")
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")
description_tokenizer = AutoTokenizer.from_pretrained(model.config.text_encoder._name_or_path)
print("Model and tokenizers loaded.")

# Supported languages (with example description prompts) and default settings
languages = {
    "Urdu": "A female speaker delivers a clear and expressive speech in Urdu.",
    "Punjabi": "A female speaker delivers a clear and expressive speech in Punjabi.",
    "Sindhi": "A female speaker delivers a clear and expressive speech in Sindhi.",
}
emotions = [
    "Neutral", "Happy", "Sad", "Anger", "Command", "Narration",
    "Conversation", "Disgust", "Fear", "News", "Proper Noun", "Surprise",
]
default_language = "Urdu"
default_gender = "Female"
default_emotion = "Neutral"


def generate_description(language, gender, emotion, noise, reverb, expressivity, pitch, rate, quality):
    """Build a natural-language style description from the selected attributes."""
    description = (
        f"A {gender.lower()} speaker delivers a {emotion.lower()} and {expressivity.lower()} speech "
        f"with a {pitch.lower()} pitch and a {rate.lower()} speaking rate. "
        f"The audio has {noise.lower()} background noise, {reverb.lower()} reverberation, "
        f"and {quality.lower()} voice quality. The text is in {language}."
    )
    return description


@spaces.GPU  # Allocate a GPU for the duration of this function
def generate_audio(text, description):
    global model  # Use the model preloaded at startup

    # Move the model to the GPU for this request
    model.to("cuda")

    # Tokenize the style description and the prompt text (with attention masks)
    desc_inputs = description_tokenizer(description, return_tensors="pt").to("cuda")
    prompt_inputs = tokenizer(text, return_tensors="pt").to("cuda")

    # Generate the waveform
    generation = model.generate(
        input_ids=desc_inputs.input_ids,
        attention_mask=desc_inputs.attention_mask,
        prompt_input_ids=prompt_inputs.input_ids,
        prompt_attention_mask=prompt_inputs.attention_mask,
    )
    audio_arr = generation.cpu().numpy().squeeze()

    # Save the audio to a temporary WAV file for Gradio to serve
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        sf.write(f.name, audio_arr, model.config.sampling_rate)
        audio_path = f.name

    # Move the model back to the CPU to free GPU memory
    model.to("cpu")
    return audio_path


# Gradio interface
def app():
    with gr.Blocks() as demo:
        gr.Markdown("# Indic Parler-TTS for Urdu, Punjabi, and Sindhi")
        gr.Markdown("Select a language, speaker gender, and emotion, then customize the speech characteristics.")

        with gr.Row():
            lang_dropdown = gr.Dropdown(
                choices=list(languages.keys()), value=default_language, label="Select Language"
            )
            gender_dropdown = gr.Dropdown(
                choices=["Male", "Female"], value=default_gender, label="Speaker Gender"
            )
            emotion_dropdown = gr.Dropdown(
                choices=emotions, value=default_emotion, label="Select Emotion"
            )

        with gr.Row():
            noise_dropdown = gr.Dropdown(
                choices=["Clear", "Slightly Noisy"], value="Clear", label="Background Noise"
            )
            reverb_dropdown = gr.Dropdown(
                choices=["Close-Sounding", "Distant-Sounding"], value="Close-Sounding", label="Reverberation"
            )
            expressivity_dropdown = gr.Dropdown(
                choices=["Expressive", "Slightly Expressive", "Monotone"], value="Expressive", label="Expressivity"
            )
            pitch_dropdown = gr.Dropdown(
                choices=["High", "Low", "Balanced"], value="Balanced", label="Pitch"
            )
            rate_dropdown = gr.Dropdown(
                choices=["Slow", "Moderate", "Fast"], value="Moderate", label="Speaking Rate"
            )
            quality_dropdown = gr.Dropdown(
                choices=["Basic", "Refined"], value="Refined", label="Voice Quality"
            )

        with gr.Row():
            text_input = gr.Textbox(
                label="Enter Text", placeholder="Type your text here...", lines=5
            )

        with gr.Row():
            generate_caption_button = gr.Button("Generate Caption/Description")
            caption_output = gr.Textbox(
                label="Generated Caption/Description",
                placeholder="The generated caption will appear here...",
                lines=5,
            )

        with gr.Row():
            generate_audio_button = gr.Button("Generate Speech")
            audio_output = gr.Audio(label="Generated Audio")

        # Wire the buttons to their handlers
        generate_caption_button.click(
            fn=generate_description,
            inputs=[
                lang_dropdown, gender_dropdown, emotion_dropdown,
                noise_dropdown, reverb_dropdown, expressivity_dropdown,
                pitch_dropdown, rate_dropdown, quality_dropdown,
            ],
            outputs=caption_output,
        )
        generate_audio_button.click(
            fn=generate_audio,
            inputs=[text_input, caption_output],
            outputs=audio_output,
        )
    return demo


# Run the app
app().launch()