ronniet committed on
Commit
f45d8c1
·
1 Parent(s): 8ddbf4a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +44 -1
app.py CHANGED
@@ -28,4 +28,47 @@ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5Hif
28
  checkpoint = "microsoft/speecht5_tts"
29
  processor = SpeechT5Processor.from_pretrained(checkpoint)
30
  model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
31
- vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# The text processor and the acoustic model are loaded from the same
# checkpoint id, so it is named once and reused.
checkpoint = "microsoft/speecht5_tts"
processor = SpeechT5Processor.from_pretrained(checkpoint)
model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)

# The vocoder is loaded from its own, separate checkpoint id.
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
32
+
33
def predict(text):
    """Synthesize speech for ``text`` using one fixed speaker embedding.

    Args:
        text: The text to speak. ``None``, an empty string, or a
            whitespace-only string yields an empty clip without running
            the model.

    Returns:
        A ``(16000, samples)`` tuple, where ``samples`` is a NumPy
        ``int16`` array (zero-length for blank input). The first element
        is presumably the sample rate expected by the audio output
        component -- confirm against the Gradio ``Audio`` documentation.
    """
    sample_rate = 16000

    # Guard against None as well as blank strings so an empty request
    # returns silence instead of raising on ``text.strip()``.
    if not text or not text.strip():
        return (sample_rate, np.zeros(0, dtype=np.int16))

    inputs = processor(text=text, return_tensors="pt")

    # Limit input length: keep at most ``max_text_positions`` entries along
    # the last axis of the token ids.
    input_ids = inputs["input_ids"][..., :model.config.max_text_positions]

    # Speaker selection is not exposed to the caller; the same embedding
    # file, resolved relative to the working directory, is used every time.
    speaker_embedding = np.load("cmu_us_bdl_arctic-wav-arctic_a0009.npy")
    # unsqueeze(0) adds a leading axis (presumably the batch dimension).
    speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)

    speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)

    # Scale to the int16 range. Assumes the generated waveform lies in
    # [-1.0, 1.0]; values outside that range would wrap -- TODO confirm.
    pcm = (speech.numpy() * 32767).astype(np.int16)
    return (sample_rate, pcm)
67
+
68
# Web UI: one text input wired to ``predict``, with the result rendered by
# an audio component configured for NumPy output.
demo = gr.Interface(fn=predict, inputs="text", outputs=gr.Audio(type="numpy"))

# Start serving the interface.
demo.launch()