Spaces:

ronniet
/

tts

Sleeping

App Files Files Community

ronniet committed on Oct 10, 2023

Commit

f45d8c1

1 Parent(s): 8ddbf4a

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -1

app.py CHANGED Viewed

@@ -28,4 +28,47 @@ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5Hif
 checkpoint = "microsoft/speecht5_tts"
 processor = SpeechT5Processor.from_pretrained(checkpoint)
 model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
-vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

+# Load the pretrained SpeechT5 processor and text-to-speech model from the
+# same checkpoint name, plus the HiFi-GAN vocoder passed to generate_speech.
 checkpoint = "microsoft/speecht5_tts"
 processor = SpeechT5Processor.from_pretrained(checkpoint)
 model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
+vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+def predict(text):
+    """Synthesize speech for *text* with SpeechT5 using one fixed speaker voice.
+
+    Args:
+        text: Text to speak. ``None``, empty, or whitespace-only input
+            returns an empty clip without running the model.
+
+    Returns:
+        A ``(sample_rate, samples)`` tuple where ``sample_rate`` is 16000
+        and ``samples`` is a NumPy ``int16`` array.
+    """
+    sample_rate = 16000
+    # Treat None like blank text so an empty submit never raises on .strip().
+    if text is None or not text.strip():
+        return (sample_rate, np.zeros(0).astype(np.int16))
+    inputs = processor(text=text, return_tensors="pt")
+    # Truncate the token ids to the model's configured maximum text length.
+    input_ids = inputs["input_ids"][..., :model.config.max_text_positions]
+    # Speaker embedding is read from a .npy file; unsqueeze adds a leading dimension.
+    speaker_embedding = np.load("cmu_us_bdl_arctic-wav-arctic_a0009.npy")
+    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)
+    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)
+    # Scale to the int16 range. Clipping first makes any sample outside
+    # [-1, 1] saturate instead of wrapping around; in-range samples are unchanged.
+    speech = (np.clip(speech.numpy(), -1.0, 1.0) * 32767).astype(np.int16)
+    return (sample_rate, speech)
+# Gradio UI: a single text input wired to predict, with the result shown by an
+# Audio component configured with type="numpy".
+demo = gr.Interface(fn=predict, inputs="text", outputs=gr.Audio(type="numpy"))
+demo.launch()