import spaces  # Import spaces first to avoid CUDA initialization issues
import gradio as gr
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
import soundfile as sf
import tempfile

# Load the model and tokenizers once at startup (kept on CPU until a request arrives)
print("Loading model and tokenizers...")
model = ParlerTTSForConditionalGeneration.from_pretrained("ai4bharat/indic-parler-tts").to("cpu")
tokenizer = AutoTokenizer.from_pretrained("ai4bharat/indic-parler-tts")
description_tokenizer = AutoTokenizer.from_pretrained(model.config.text_encoder._name_or_path)
print("Model and tokenizers loaded.")

# Supported languages (with example description prompts) and default settings
languages = {
    "Urdu": "A female speaker delivers a clear and expressive speech in Urdu.",
    "Punjabi": "A female speaker delivers a clear and expressive speech in Punjabi.",
    "Sindhi": "A female speaker delivers a clear and expressive speech in Sindhi.",
}
emotions = [
    "Neutral", "Happy", "Sad", "Anger", "Command", "Narration",
    "Conversation", "Disgust", "Fear", "News", "Proper Noun", "Surprise",
]
default_language = "Urdu"
default_gender = "Female"
default_emotion = "Neutral"


def generate_description(language, gender, emotion, noise, reverb, expressivity, pitch, rate, quality):
    """Build a natural-language style description from the selected attributes."""
    description = (
        f"A {gender.lower()} speaker delivers a {emotion.lower()} and {expressivity.lower()} speech "
        f"with a {pitch.lower()} pitch and a {rate.lower()} speaking rate. "
        f"The audio has {noise.lower()} background noise, {reverb.lower()} reverberation, "
        f"and {quality.lower()} voice quality. The text is in {language}."
    )
    return description


@spaces.GPU  # Allocate a GPU for the duration of this function
def generate_audio(text, description):
    global model  # Use the model preloaded at startup

    # Move the model to the GPU for this request
    model.to("cuda")

    # Tokenize the style description and the prompt text (with attention masks)
    desc_inputs = description_tokenizer(description, return_tensors="pt").to("cuda")
    prompt_inputs = tokenizer(text, return_tensors="pt").to("cuda")

    # Generate the waveform
    generation = model.generate(
        input_ids=desc_inputs.input_ids,
        attention_mask=desc_inputs.attention_mask,
        prompt_input_ids=prompt_inputs.input_ids,
        prompt_attention_mask=prompt_inputs.attention_mask,
    )
    audio_arr = generation.cpu().numpy().squeeze()

    # Save the audio to a temporary WAV file for Gradio to serve
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        sf.write(f.name, audio_arr, model.config.sampling_rate)
        audio_path = f.name

    # Move the model back to the CPU to free GPU memory
    model.to("cpu")
    return audio_path


# Gradio interface
def app():
    with gr.Blocks() as demo:
        gr.Markdown("# Indic Parler-TTS for Urdu, Punjabi, and Sindhi")
        gr.Markdown("Select a language, speaker gender, and emotion, then customize the speech characteristics.")

        with gr.Row():
            lang_dropdown = gr.Dropdown(
                choices=list(languages.keys()), value=default_language, label="Select Language"
            )
            gender_dropdown = gr.Dropdown(
                choices=["Male", "Female"], value=default_gender, label="Speaker Gender"
            )
            emotion_dropdown = gr.Dropdown(
                choices=emotions, value=default_emotion, label="Select Emotion"
            )

        with gr.Row():
            noise_dropdown = gr.Dropdown(
                choices=["Clear", "Slightly Noisy"], value="Clear", label="Background Noise"
            )
            reverb_dropdown = gr.Dropdown(
                choices=["Close-Sounding", "Distant-Sounding"], value="Close-Sounding", label="Reverberation"
            )
            expressivity_dropdown = gr.Dropdown(
                choices=["Expressive", "Slightly Expressive", "Monotone"], value="Expressive", label="Expressivity"
            )
            pitch_dropdown = gr.Dropdown(
                choices=["High", "Low", "Balanced"], value="Balanced", label="Pitch"
            )
            rate_dropdown = gr.Dropdown(
                choices=["Slow", "Moderate", "Fast"], value="Moderate", label="Speaking Rate"
            )
            quality_dropdown = gr.Dropdown(
                choices=["Basic", "Refined"], value="Refined", label="Voice Quality"
            )

        with gr.Row():
            text_input = gr.Textbox(
                label="Enter Text", placeholder="Type your text here...", lines=5
            )

        with gr.Row():
            generate_caption_button = gr.Button("Generate Caption/Description")
            caption_output = gr.Textbox(
                label="Generated Caption/Description",
                placeholder="The generated caption will appear here...",
                lines=5,
            )

        with gr.Row():
            generate_audio_button = gr.Button("Generate Speech")
            audio_output = gr.Audio(label="Generated Audio")

        # Wire the buttons to their handlers
        generate_caption_button.click(
            fn=generate_description,
            inputs=[
                lang_dropdown, gender_dropdown, emotion_dropdown,
                noise_dropdown, reverb_dropdown, expressivity_dropdown,
                pitch_dropdown, rate_dropdown, quality_dropdown,
            ],
            outputs=caption_output,
        )
        generate_audio_button.click(
            fn=generate_audio,
            inputs=[text_input, caption_output],
            outputs=audio_output,
        )
    return demo


# Run the app
app().launch()