File size: 5,007 Bytes
bf642c7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import os
import gradio as gr
import outetts
from outetts.version.v1.interface import _DEFAULT_SPEAKERS

# Configuration for the OuteTTS 0.2 (500M) model; English is the language
# selected at startup.
model_config = outetts.HFModelConfig_v1(
    language="en",
    model_path="OuteAI/OuteTTS-0.2-500M",
)

# Single module-level interface instance shared by the callbacks and the UI
# defined in this file.
interface = outetts.InterfaceHF(cfg=model_config, model_version="0.2")

def get_available_speakers(language):
    """Return the speaker choices offered for ``language``.

    The result starts with the literal entry ``"None"`` followed by the keys
    of ``_DEFAULT_SPEAKERS[language]``. An empty list is returned when
    ``language`` is not among ``interface.languages``.
    """
    if language in interface.languages:
        return ["None", *_DEFAULT_SPEAKERS[language].keys()]
    return []

def change_interface_language(language):
    """Switch the TTS interface to ``language`` and refresh the speaker list.

    Returns:
        A pair of ``gr.update`` objects. The first carries the new speaker
        choices and the selected value; the second carries the visibility.
        When the switch raises ``ValueError`` the choices collapse to
        ``["None"]`` and the visibility update is ``False``.
    """
    try:
        interface.change_language(language)
        speakers = get_available_speakers(language)
    except ValueError:
        return gr.update(choices=["None"], value="None"), gr.update(visible=False)

    # Never hand the dropdown an empty choice list.
    if not speakers:
        speakers = ["None"]

    # Keep "male_1" as the preferred default, but only when it is actually
    # one of the choices; otherwise fall back to the first real speaker
    # (index 0 is the "None" placeholder) or to "None" itself.
    if "male_1" in speakers:
        selected = "male_1"
    elif len(speakers) > 1:
        selected = speakers[1]
    else:
        selected = speakers[0]

    return gr.update(choices=speakers, value=selected), gr.update(visible=True)

def generate_tts(
        text, temperature, repetition_penalty, language,
        speaker_selection, reference_audio, reference_text
    ):
    """Synthesize speech for ``text`` and return ``(audio_path, error)``.

    Speaker resolution, in priority order:
      1. ``reference_audio`` given: build a custom speaker from the audio
         file and its transcription (both are required in that case).
      2. ``speaker_selection`` set to something other than ``"None"``: load
         that default speaker.
      3. Otherwise no speaker is passed to the model.

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        temperature: Forwarded to ``interface.generate``.
        repetition_penalty: Forwarded to ``interface.generate``.
        language: Accepted to match the UI wiring; not read in this function.
        speaker_selection: Name of a default speaker, or ``"None"``.
        reference_audio: Path to a reference audio file, or a falsy value.
        reference_text: Transcription of ``reference_audio``.

    Returns:
        ``(path, None)`` on success, where ``path`` is the saved WAV file, or
        ``(None, message)`` when anything fails. Exceptions are never
        propagated to the caller.
    """
    try:
        # Reject empty input up front instead of handing it to the model.
        if not text or not text.strip():
            raise ValueError("Text to synthesize is required")

        # Custom speaker: reference audio takes precedence over the dropdown.
        # The transcription is validated here so that uploading audio without
        # text reports an error instead of silently using another speaker.
        if reference_audio:
            if not os.path.exists(reference_audio):
                raise ValueError("Reference audio file not found")
            if not reference_text or not reference_text.strip():
                raise ValueError("Reference transcription text is required")
            speaker = interface.create_speaker(reference_audio, reference_text)

        # Use selected default speaker
        elif speaker_selection and speaker_selection != "None":
            speaker = interface.load_default_speaker(speaker_selection)

        # No speaker - random characteristics
        else:
            speaker = None

        # Generate audio
        output = interface.generate(
            text=text,
            speaker=speaker,
            temperature=temperature,
            repetition_penalty=repetition_penalty,
            max_length=4096
        )

        # Verify output
        if output.audio is None:
            raise ValueError("Model failed to generate audio. This may be due to input length constraints or early EOS token.")

        # Save and return output
        output_path = "output.wav"
        output.save(output_path)
        return output_path, None

    except Exception as e:
        # UI boundary: report the failure as text for the error box.
        return None, str(e)

# Build the Gradio UI: one row with two columns. The first column holds all
# inputs and the submit button; the second column holds the generated audio.
with gr.Blocks() as demo:
    gr.Markdown("# OuteTTS-0.2-500M Text-to-Speech Demo")

    # Created hidden; the .then() step on the submit button makes it visible
    # only when generate_tts returned a non-empty error message.
    error_box = gr.Textbox(label="Error Messages", visible=False)

    with gr.Row():
        with gr.Column():
            # Language selection, populated from the languages the interface reports
            language_dropdown = gr.Dropdown(
                choices=list(interface.languages),
                value="en",
                label="Interface Language"
            )

            # Speaker selection, initialised with the English speaker list
            speaker_dropdown = gr.Dropdown(
                choices=get_available_speakers("en"),
                value="male_1",
                label="Speaker Selection"
            )

            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter text here..."
            )

            # Sampling controls forwarded unchanged to generate_tts
            temperature = gr.Slider(
                0.1, 1.0,
                value=0.1,
                label="Temperature (lower = more stable tone, higher = more expressive)"
            )

            repetition_penalty = gr.Slider(
                0.5, 2.0,
                value=1.1,
                label="Repetition Penalty"
            )

            gr.Markdown("""
### Voice Cloning Guidelines:
- Use 10-15 seconds of clear, noise-free audio
- Provide accurate transcription
- Longer audio clips will reduce maximum output length
- Custom speaker overrides speaker selection
            """)

            # type="filepath" means the callback receives a path string
            reference_audio = gr.Audio(
                label="Reference Audio (for voice cloning)",
                type="filepath"
            )

            reference_text = gr.Textbox(
                label="Reference Transcription Text",
                placeholder="Enter exact transcription of reference audio"
            )

            submit_button = gr.Button("Generate Speech")

        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Audio",
                type="filepath"
            )

    # change_interface_language returns two updates (choices/value and
    # visibility), so speaker_dropdown is listed once per returned value.
    # NOTE(review): confirm the installed Gradio version applies both updates
    # when the same component appears twice in outputs.
    language_dropdown.change(
        fn=change_interface_language,
        inputs=[language_dropdown],
        outputs=[speaker_dropdown, speaker_dropdown]
    )

    # generate_tts returns (audio path, error message). The follow-up step
    # toggles the error box: shown when the message is non-empty, hidden
    # otherwise.
    submit_button.click(
        fn=generate_tts,
        inputs=[
            text_input,
            temperature,
            repetition_penalty,
            language_dropdown,
            speaker_dropdown,
            reference_audio,
            reference_text
        ],
        outputs=[audio_output, error_box]
    ).then(
        fn=lambda x: gr.update(visible=bool(x)),
        inputs=[error_box],
        outputs=[error_box]
    )

# Launch the app as soon as this module is executed.
demo.launch()