# TEDCAS offline speech recognition: a Gradio app that records audio from the
# microphone and transcribes it locally with OpenAI Whisper.
import time

import gradio as gr
import whisper

# transformers / datasets are only needed by the commented-out alternatives below.
from transformers import pipeline, WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset

# Alternative: Spanish-only wav2vec2 pipeline.
#p = pipeline("automatic-speech-recognition", model="jonatasgrosman/wav2vec2-large-xlsr-53-spanish")

# Load the multilingual Whisper "large" model once at startup.
model = whisper.load_model("large")

# --- Commented-out reference: Whisper large-v2 through the transformers API ---
# load model and processor
#processor = WhisperProcessor.from_pretrained("openai/whisper-large-v2")
#model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large-v2")
#model.config.forced_decoder_ids = None

# load dummy dataset and read audio files
#ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
#sample = ds[0]["audio"]
#input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features 

# generate token ids
#predicted_ids = model.generate(input_features)
# decode token ids to text
#transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False)
#['<|startoftranscript|><|en|><|transcribe|><|notimestamps|> Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.<|endoftext|>']

#transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
#[' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.']
# Earlier prototype: pick the Whisper model size from a dropdown and transcribe.
#def speech_to_text(tmp_filename, model_size):
#    model = whisper.load_model(model_size)
#    result = model.transcribe(tmp_filename)
#
#    return result["text"]


#gr.Interface(
#    fn=speech_to_text,
#    inputs=[
#        gr.Audio(source="microphone", type="filepath"),
#        gr.Dropdown(choices=["tiny", "base", "small", "medium", "large"]),
#        ],
#    outputs="text").launch()

def transcribe(language, audio, state=""):
    """Transcribe one recording with Whisper and append it to the running state."""
    time.sleep(1)
    text = ""  # avoids a NameError if the chosen language matches no branch below
    if language == "Multi":
        state = ""  # resetting here means only the latest recording is returned
        result = model.transcribe(audio)  # Whisper auto-detects the spoken language
        text = result["text"]
    if language == "Spanish":
        state = ""
        result = model.transcribe(audio, language="Spanish")  # assumed intent: force Spanish decoding
        text = result["text"]

#    if language=="Catalan":
#        state=""
#        text = pc(audio)["text"]
#    if language=="English":
#        state=""
#        text = pe(audio)["text"]
#    if language=="French":
#        state=""
#        text = pf(audio)["text"]
#    if language=="Japanese":
#        state=""
#        text = pj(audio)["text"]
    state += text + " "
    #text2 = "This is what I have understood from you"  # unused placeholder reply
    return state, state
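
# Quick check outside the UI (a sketch; "sample.wav" is a hypothetical local file):
#   running_text, _ = transcribe("Multi", "sample.wav")
#   print(running_text)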

# Gradio UI: a language dropdown, a microphone recorder, and a hidden state that
# carries the accumulated transcription between calls.
demo = gr.Interface(
    fn=transcribe,
    title="TEDCAS Offline Speech Recognition",
    description="1) Select a language. 2) Click 'Record from microphone' and talk. "
                "3) Click 'Stop recording'. 4) Click 'Submit'. 5) Click 'Clear' before starting again.",
    inputs=[
        #gr.Dropdown(["Spanish", "Catalan", "English", "French", "Japanese"], value="Spanish"),
        gr.Dropdown(["Multi", "Spanish"], value="Multi"),
        #gr.Audio(source="microphone", type="filepath", streaming=True),
        gr.Audio(source="microphone", type="filepath"),  # gr.Audio replaces the deprecated gr.inputs.Audio
        "state",  #,"language"
    ],
    outputs=[
        "textbox",
        "state",
    ],
    #live=True).launch()
)
demo.launch()
#demo.launch(auth=("TedCas", "Kike1234"))
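
# To run locally (a sketch; package versions unpinned, filename assumed to be app.py):
#   pip install gradio openai-whisper transformers datasets
#   python app.py
# Whisper also needs the ffmpeg binary available on the PATH.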