"""Gradio demo: Lezgian text-to-speech backed by a fine-tuned VITS model."""

import tempfile

import gradio as gr
import numpy as np
import torch
from scipy.io.wavfile import write
from transformers import VitsTokenizer, VitsModel, set_seed, pipeline


class CustomFlagging(gr.FlaggingCallback):
    """Flagging callback that prints flagged samples to stdout."""

    def setup(self, *args, **kwargs):
        """Accept and ignore the setup arguments; nothing needs preparing."""

    def flag(self, flag_data, flag_option=None, username=None):
        """Print the flagged data together with the selected flag option."""
        print(f"Аудио: {flag_data}, Сообщение: {flag_option}")


# Custom flagging objects.
# NOTE(review): neither `flagging_callback` nor `flagging_options` is passed
# to `gr.Interface` below, so they currently have no effect on the UI.
# Confirm whether they were meant to be wired in (the options are only
# meaningful with manual flagging).
flagging_callback = CustomFlagging()
flagging_options = [
    "Хорошая озвучка",
    "Слышен механический треск",
    "Не совпадает произношение букв",
    "Проглочены буквы",
]

# Fine-tuned model to load from the Hugging Face Hub.
model_name = "leks-forever/vits_lez_tts"
tokenizer = VitsTokenizer.from_pretrained(model_name)
model = VitsModel.from_pretrained(model_name)

# Synthesis settings are request-independent, so they are applied once here
# instead of on every call. A speaking rate below 1.0 slows the speech down
# slightly; a noise scale of 0 removes the random variation in the predicted
# speech.
model.speaking_rate = 0.9
model.noise_scale = 0

# Fixed seed so the same text always produces the same audio.
SEED = 900


def tts_function(input_text: str) -> str:
    """Synthesize speech for *input_text* and return the path to a WAV file.

    Args:
        input_text: Text typed by the user in the Gradio textbox.

    Returns:
        Path of a temporary ``.wav`` file holding the generated waveform.
        The file is created with ``delete=False`` so that Gradio can read it
        after this function returns; it is not removed here.

    Raises:
        gr.Error: If *input_text* is empty or contains only whitespace.
    """
    if not input_text or not input_text.strip():
        # Surface a readable message in the UI instead of letting the
        # tokenizer/model fail on an empty sequence.
        raise gr.Error("Введите текст для озвучки")

    # Re-seed on every request: part of the VITS forward pass is stochastic.
    set_seed(SEED)

    inputs = tokenizer(text=input_text, return_tensors="pt")
    # Inference only, so skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    waveform = outputs.waveform[0].cpu().float().numpy()

    # Reserve a unique file name, then close the handle before writing so the
    # file is never opened twice at the same time.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmpfile:
        wav_path = tmpfile.name

    # The sampling rate comes from the model configuration; running a second
    # text-to-speech pipeline just to read it doubled the inference cost.
    write(wav_path, rate=model.config.sampling_rate, data=waveform)
    return wav_path


interface = gr.Interface(
    fn=tts_function,
    inputs=gr.Textbox(label="Введите текст на лезгинском"),
    outputs=gr.Audio(label="Аудио"),
    title="Text-to-speech Лезги ЧIалал",
    # "auto" flags every submitted sample automatically; no flag button is
    # shown to the user in this mode.
    flagging_mode="auto",
)

# Launch the app.
interface.launch()