JPLTedCas committed
Commit 2a0c00f · 1 Parent(s): 510bcda

Update app.py

Files changed (1):
  1. app.py +39 -20
app.py CHANGED
@@ -2,32 +2,49 @@ from transformers import pipeline
 import gradio as gr
 import time
 
-p = pipeline("automatic-speech-recognition",model="jonatasgrosman/wav2vec2-large-xlsr-53-spanish")
-pc = pipeline("automatic-speech-recognition",model="softcatala/wav2vec2-large-xlsr-catala")
-pe = pipeline("automatic-speech-recognition",model="jonatasgrosman/wav2vec2-large-xlsr-53-english")
-pj = pipeline("automatic-speech-recognition",model="jonatasgrosman/wav2vec2-large-xlsr-53-japanese")
-pf = pipeline("automatic-speech-recognition",model="jonatasgrosman/wav2vec2-large-xlsr-53-french")
+#p = pipeline("automatic-speech-recognition",model="jonatasgrosman/wav2vec2-large-xlsr-53-spanish")
 
+from transformers import WhisperProcessor, WhisperForConditionalGeneration
+from datasets import load_dataset
 
+# load model and processor
+processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
+model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
+model.config.forced_decoder_ids = None
+
+# load dummy dataset and read audio files
+ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+sample = ds[0]["audio"]
+input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features
+
+# generate token ids
+predicted_ids = model.generate(input_features)
+# decode token ids to text
+#transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False)
+#['<|startoftranscript|><|en|><|transcribe|><|notimestamps|> Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.<|endoftext|>']
+
+#transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+#[' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.']
 
 
 def transcribe(language,audio, state=""):#language="Spanish",
     time.sleep(1)
-    if language=="Spanish":
-        state=""
-        text = p(audio)["text"]
-    if language=="Catalan":
+    if language=="Multi":
         state=""
-        text = pc(audio)["text"]
-    if language=="English":
-        state=""
-        text = pe(audio)["text"]
-    if language=="French":
-        state=""
-        text = pf(audio)["text"]
-    if language=="Japanese":
-        state=""
-        text = pj(audio)["text"]
+        text = processor.batch_decode(predicted_ids, skip_special_tokens=False)
+
+#    if language=="Catalan":
+#        state=""
+#        text = pc(audio)["text"]
+#    if language=="English":
+#        state=""
+#        text = pe(audio)["text"]
+#    if language=="French":
+#        state=""
+#        text = pf(audio)["text"]
+#    if language=="Japanese":
+#        state=""
+#        text = pj(audio)["text"]
     state += text + " "
     #text2="Esto es loq ue te he entendido"
     return state, state
@@ -39,7 +56,9 @@ demo=gr.Interface(
     description="1)Select language 2)Click on 'record from microphone' and talk 3)Click on 'stop recording' 4)Click on submit 5)Before starting again, click on 'clear'",
 
     inputs=[
-        gr.Dropdown(["Spanish","Catalan","English", "French", "Japanese"],value="Spanish"),
+        #gr.Dropdown(["Spanish","Catalan","English", "French", "Japanese"],value="Spanish"),
+        gr.Dropdown(["Multi"],value="Multi"),
+
        #gr.Audio(source="microphone", type="filepath", streaming=True),
        gr.inputs.Audio(source="microphone", type="filepath"),
        "state"#,"language"