ronniet committed on
Commit
be0f3ee
·
1 Parent(s): 37fe72d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -9
app.py CHANGED
@@ -7,13 +7,17 @@ import numpy as np
7
  import torch
8
 
9
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 
10
 
11
 
12
  checkpoint = "microsoft/speecht5_tts"
13
- processor = SpeechT5Processor.from_pretrained(checkpoint)
14
- model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
15
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
16
 
 
 
 
17
  def tts(text):
18
  if len(text.strip()) == 0:
19
  return (16000, np.zeros(0).astype(np.int16))
@@ -22,7 +26,7 @@ def tts(text):
22
 
23
  # limit input length
24
  input_ids = inputs["input_ids"]
25
- input_ids = input_ids[..., :model.config.max_text_positions]
26
 
27
  # if speaker == "Surprise Me!":
28
  # # load one of the provided speaker embeddings at random
@@ -44,21 +48,26 @@ def tts(text):
44
 
45
  speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)
46
 
47
- speech = model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)
48
 
49
  speech = (speech.numpy() * 32767).astype(np.int16)
50
  return (16000, speech)
51
 
52
 
53
- captioner = pipeline(model="microsoft/git-base")
54
  # tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=False)
55
 
56
 
57
  def predict(image):
58
- text = captioner(image)[0]["generated_text"]
59
 
60
  # audio_output = "output.wav"
61
  # tts.tts_to_file(text, speaker=tts.speakers[0], language="en", file_path=audio_output)
 
 
 
 
 
62
  audio = tts(text)
63
 
64
  return text, audio
@@ -74,6 +83,3 @@ demo = gr.Interface(
74
  )
75
 
76
  demo.launch()
77
-
78
- # gr.Interface.load("models/ronniet/git-base-env").launch()
79
- # gr.Interface.load("models/microsoft/git-base").launch()
 
7
  import torch
8
 
9
  from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
10
+ from transformers import AutoProcessor, AutoModelForCausalLM
11
 
12
 
13
  checkpoint = "microsoft/speecht5_tts"
14
+ tts_processor = SpeechT5Processor.from_pretrained(checkpoint)
15
+ tts_model = SpeechT5ForTextToSpeech.from_pretrained(checkpoint)
16
  vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
17
 
18
+ ic_processor = AutoProcessor.from_pretrained("microsoft/git-base")
19
+ ic_model = AutoModelForCausalLM.from_pretrained("microsoft/git-base")
20
+
21
  def tts(text):
22
  if len(text.strip()) == 0:
23
  return (16000, np.zeros(0).astype(np.int16))
 
26
 
27
  # limit input length
28
  input_ids = inputs["input_ids"]
29
+ input_ids = input_ids[..., :tts_model.config.max_text_positions]
30
 
31
  # if speaker == "Surprise Me!":
32
  # # load one of the provided speaker embeddings at random
 
48
 
49
  speaker_embedding = torch.tensor(speaker_embedding).unsqueeze(0)
50
 
51
+ speech = tts_model.generate_speech(input_ids, speaker_embedding, vocoder=vocoder)
52
 
53
  speech = (speech.numpy() * 32767).astype(np.int16)
54
  return (16000, speech)
55
 
56
 
57
+ # captioner = pipeline(model="microsoft/git-base")
58
  # tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False, gpu=False)
59
 
60
 
61
  def predict(image):
62
+ # text = captioner(image)[0]["generated_text"]
63
 
64
  # audio_output = "output.wav"
65
  # tts.tts_to_file(text, speaker=tts.speakers[0], language="en", file_path=audio_output)
66
+
67
+ pixel_values = ic_processor(images=image, return_tensors="pt").pixel_values
68
+ text_ids = ic_model.generate(pixel_values=pixel_values, max_length=50)
69
+ text = ic_processor.batch_decode(text_ids, skip_special_tokens=True)[0]
70
+
71
  audio = tts(text)
72
 
73
  return text, audio
 
83
  )
84
 
85
  demo.launch()