"""Gradio demo that turns text into speech with the Bark model."""

from scipy.io.wavfile import write as write_wav
from transformers import AutoProcessor, BarkModel
import gradio

# Single source of truth for the checkpoint so the processor and the model
# are always loaded from the same place.
MODEL_ID = "suno/bark-small"

# Loaded once at import time and reused by every call to generate_speech().
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = BarkModel.from_pretrained(MODEL_ID)


def generate_speech(
    text,
    voice_preset="v2/en_speaker_6",
    output_path="bark_generation.wav",
):
    """Synthesize ``text`` to a WAV file and return the file's path.

    Args:
        text: The text passed to the processor for synthesis.
        voice_preset: Voice preset identifier forwarded to the processor.
        output_path: Where the WAV file is written. Defaults to the
            historical fixed name, so existing callers are unaffected; pass
            a different path to avoid overwriting a previous generation.

    Returns:
        ``output_path``, i.e. the path of the WAV file that was written.
    """
    inputs = processor(text, voice_preset=voice_preset)
    waveform = model.generate(**inputs)

    # Move the result to host memory and drop singleton dimensions before
    # handing it to scipy's WAV writer.
    waveform = waveform.cpu().numpy().squeeze()

    # The sample rate is taken from the model's generation config rather
    # than hard-coded.
    sample_rate = model.generation_config.sample_rate
    write_wav(output_path, sample_rate, waveform)
    return output_path


# Example:
# generate_speech("Hello uh ... [clears throat], my dog is cute [laughter]")

iface = gradio.Interface(
    fn=generate_speech,
    inputs='text',
    outputs='audio',
    title='Text to Speech',
)

# share=True asks Gradio to create a shareable link for the app.
iface.launch(share=True)