Spaces:
Sleeping
Sleeping
Add application file
Browse files- app.py +69 -0
- packages.txt +1 -0
- requirements.txt +6 -0
app.py
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import nltk
|
2 |
+
import librosa
|
3 |
+
import torch
|
4 |
+
import kenlm
|
5 |
+
import gradio as gr
|
6 |
+
from pyctcdecode import build_ctcdecoder
|
7 |
+
from transformers import Wav2Vec2Processor,Wav2Vec2ProcessorWithLM,Wav2Vec2ForCTC
|
8 |
+
|
9 |
+
# Fetch the "punkt" sentence-tokenizer data required by nltk.sent_tokenize,
# which is used below to restore sentence-initial capitalization.
nltk.download("punkt")
|
10 |
+
|
11 |
+
def return_processor_and_model(model_name):
    """Load the plain (no-LM) processor and CTC model for *model_name*.

    Both objects are fetched from the Hugging Face Hub (cached locally after
    the first download).
    """
    processor = Wav2Vec2Processor.from_pretrained(model_name)
    model = Wav2Vec2ForCTC.from_pretrained(model_name)
    return processor, model
|
13 |
+
|
14 |
+
def return_processor_and_modelWithLM(model_name):
    """Load the LM-backed processor (pyctcdecode beam search) and CTC model.

    Mirrors return_processor_and_model but yields a Wav2Vec2ProcessorWithLM,
    whose batch_decode performs beam-search CTC decoding with a KenLM model.
    """
    processor = Wav2Vec2ProcessorWithLM.from_pretrained(model_name)
    model = Wav2Vec2ForCTC.from_pretrained(model_name)
    return processor, model
|
16 |
+
|
17 |
+
def load_and_fix_data(input_file):
    """Load an audio file and return a 16 kHz mono waveform.

    Parameters
    ----------
    input_file : str
        Path to the audio file to transcribe.

    Returns
    -------
    numpy.ndarray
        1-D float32 waveform resampled to 16 kHz (the rate Wav2Vec2 expects).
    """
    # librosa.load downmixes to mono and resamples in one step. The original
    # version loaded at librosa's default 22050 Hz and then (a) had a dead
    # stereo branch that indexed axis 1, although librosa returns
    # (channels, samples) when stereo is kept, and (b) called
    # librosa.resample(speech, sr, 16000) with positional rate arguments,
    # which recent librosa versions reject (orig_sr/target_sr are
    # keyword-only).
    speech, _sample_rate = librosa.load(input_file, sr=16000, mono=True)
    return speech
|
24 |
+
|
25 |
+
def fix_transcription_casing(input_sentence):
    """Capitalize the first letter of each sentence in *input_sentence*.

    Sentences are found with nltk.sent_tokenize (needs the "punkt" data,
    downloaded at module import) and rejoined with single spaces.

    Parameters
    ----------
    input_sentence : str
        Text to re-case (typically a lowercased ASR transcription).

    Returns
    -------
    str
        The text with every sentence starting in uppercase.
    """
    sentences = nltk.sent_tokenize(input_sentence)
    # s[:1].upper() + s[1:] is equivalent to the original
    # s.replace(s[0], s[0].capitalize(), 1) for non-empty sentences, but it
    # cannot raise IndexError if the tokenizer ever yields an empty string.
    return " ".join(s[:1].upper() + s[1:] for s in sentences)
|
28 |
+
|
29 |
+
|
30 |
+
def predict_and_ctc_lm_decode(input_file, model_name):
    """Transcribe *input_file* with beam-search CTC decoding + language model.

    Parameters
    ----------
    input_file : str
        Path to the audio file.
    model_name : str
        Hugging Face model id that ships an LM-backed processor.

    Returns
    -------
    str
        Transcription with sentence-initial capitalization applied.
    """
    processor, model = return_processor_and_modelWithLM(model_name)
    speech = load_and_fix_data(input_file)

    input_values = processor(speech, return_tensors="pt", sampling_rate=16000).input_values
    # Inference only: no_grad avoids building the autograd graph.
    # Keep the batch dimension -- Wav2Vec2ProcessorWithLM.batch_decode expects
    # logits shaped (batch, time, vocab). The original indexed [0] here and
    # then called .numpy() on what was already a NumPy array, which raised
    # AttributeError at runtime.
    with torch.no_grad():
        logits = model(input_values).logits.cpu().numpy()

    pred = processor.batch_decode(logits).text

    transcribed_text = fix_transcription_casing(pred[0].lower())

    return transcribed_text
|
42 |
+
|
43 |
+
def predict_and_greedy_decode(input_file, model_name):
    """Transcribe *input_file* with plain greedy (argmax) CTC decoding.

    Parameters
    ----------
    input_file : str
        Path to the audio file.
    model_name : str
        Hugging Face model id for a Wav2Vec2 CTC checkpoint.

    Returns
    -------
    str
        Transcription with sentence-initial capitalization applied.
    """
    processor, model = return_processor_and_model(model_name)
    speech = load_and_fix_data(input_file)

    input_values = processor(speech, return_tensors="pt", sampling_rate=16000).input_values
    # Inference only: no_grad skips autograd bookkeeping and saves memory.
    with torch.no_grad():
        logits = model(input_values).logits

    # Greedy decoding: most probable token per frame, then CTC collapse.
    predicted_ids = torch.argmax(logits, dim=-1)
    pred = processor.batch_decode(predicted_ids)

    transcribed_text = fix_transcription_casing(pred[0].lower())

    return transcribed_text
|
56 |
+
|
57 |
+
def return_all_predictions(input_file, model_name):
    """Run both decoders on the same audio.

    Returns a pair: (beam-search + LM transcription, greedy transcription),
    matching the two Gradio output textboxes.
    """
    lm_text = predict_and_ctc_lm_decode(input_file, model_name)
    greedy_text = predict_and_greedy_decode(input_file, model_name)
    return lm_text, greedy_text
|
59 |
+
|
60 |
+
|
61 |
+
# Wire both decoders into a side-by-side Gradio demo: microphone/file audio
# input plus a model selector, and two text outputs comparing beam-search+LM
# decoding against greedy decoding.
# NOTE(review): both Dropdown choices are the identical model id -- presumably
# one was meant to be a different checkpoint; confirm with the author.
gr.Interface(return_all_predictions,
             inputs = [gr.inputs.Audio(source="microphone", type="filepath", label="Record/ Drop audio"), gr.inputs.Dropdown(["jonatasgrosman/wav2vec2-large-xlsr-53-spanish", "jonatasgrosman/wav2vec2-large-xlsr-53-spanish"], label="Model Name")],
             outputs = [gr.outputs.Textbox(label="Beam CTC decoding w/ LM"), gr.outputs.Textbox(label="Greedy decoding")],
             title="ASR using Wav2Vec2 & pyctcdecode in spanish",
             description = "Comparing greedy decoder with beam search CTC decoder, record/ drop your audio!",
             layout = "horizontal",
             examples = [["test1.wav", "jonatasgrosman/wav2vec2-large-xlsr-53-spanish"], ["test2.wav", "jonatasgrosman/wav2vec2-large-xlsr-53-spanish"]],
             theme="huggingface",
             enable_queue=True).launch()
|
packages.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
libsndfile1
|
requirements.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
nltk
|
2 |
+
transformers
|
3 |
+
torch
|
4 |
+
librosa
|
5 |
+
pyctcdecode
|
6 |
+
pypi-kenlm
|