mageec committed
Commit d1e96e8 · verified · 1 Parent(s): a89489e

Create app.py

Files changed (1): app.py (+83, -0)
app.py ADDED
@@ -0,0 +1,83 @@
+ import torch
+ import torch.nn.functional as F
+
+ from transformers import WhisperForConditionalGeneration, WhisperProcessor
+ from transformers.models.whisper.tokenization_whisper import LANGUAGES
+ from transformers.pipelines.audio_utils import ffmpeg_read
+
+ import gradio as gr
+
+
+ model_id = "mageec/whisper-tiny-hi-capstone"
+
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+
+ processor = WhisperProcessor.from_pretrained(model_id)
+ model = WhisperForConditionalGeneration.from_pretrained(model_id)
+ model.eval()
+ model.to(device)
+
+ sampling_rate = processor.feature_extractor.sampling_rate
+
+ # <|startoftranscript|> sits 106 places from the end of the special-token
+ # list: the 99 language tokens and the task/timestamp tokens follow it.
+ bos_token_id = processor.tokenizer.all_special_ids[-106]
+ # A batch of one sequence holding only <|startoftranscript|>, so the first
+ # token the model predicts is the language token.
+ decoder_input_ids = torch.tensor([[bos_token_id]]).to(device)
+
+
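+ # Sanity check: the -106 index assumes this fine-tune kept the stock
+ # multilingual Whisper tokenizer, where <|startoftranscript|> is followed
+ # by the 99 language tokens and the task/timestamp specials; fail fast if
+ # that layout ever changes.
+ assert processor.tokenizer.convert_ids_to_tokens(bos_token_id) == "<|startoftranscript|>"
+
+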
+ def process_audio_file(file):
+     with open(file, "rb") as f:
+         inputs = f.read()
+
+     # Decode any container/codec to a float32 array at the model's
+     # sampling rate (requires ffmpeg on the host).
+     audio = ffmpeg_read(inputs, sampling_rate)
+     return audio
+
+
+ def transcribe(microphone, file_upload):
+     warn_output = ""
+     if (microphone is not None) and (file_upload is not None):
+         warn_output = (
+             "WARNING: You've uploaded an audio file and used the microphone. "
+             "The recorded file from the microphone will be used and the uploaded audio will be discarded.\n"
+         )
+         file = microphone
+
+     elif (microphone is None) and (file_upload is None):
+         # The interface has two outputs, so the error branch must also return a pair.
+         return "ERROR: You have to either use the microphone or upload an audio file", None
+
+     elif microphone is not None:
+         file = microphone
+     else:
+         file = file_upload
+
+     audio_data = process_audio_file(file)
+
+     input_features = processor(
+         audio_data, sampling_rate=sampling_rate, return_tensors="pt"
+     ).input_features
+
+     # Single decoder step: logits has shape (1, 1, vocab_size).
+     with torch.no_grad():
+         logits = model(input_features.to(device), decoder_input_ids=decoder_input_ids).logits
+
+     pred_ids = torch.argmax(logits, dim=-1)
+     # Softmax probability assigned to the predicted language token.
+     probability = F.softmax(logits, dim=-1).max()
+
+     # Decodes to e.g. "<|hi|>"; strip the delimiters to get the code "hi".
+     lang_ids = processor.decode(pred_ids[0])
+     lang_ids = lang_ids.lstrip("<|").rstrip("|>")
+     language = LANGUAGES.get(lang_ids, "not detected")
+
+     # Surface the microphone/upload warning alongside the detected language.
+     return warn_output + language.capitalize(), probability.cpu().numpy()
+
+
+ # Gradio 4.x API: two optional audio inputs (microphone or file upload)
+ # and two outputs (detected language and its probability).
+ iface = gr.Interface(
+     fn=transcribe,
+     inputs=[
+         gr.Audio(sources=["microphone"], type="filepath", label="Microphone"),
+         gr.Audio(sources=["upload"], type="filepath", label="File Upload"),
+     ],
+     outputs=[
+         gr.Textbox(label="Language"),
+         gr.Number(label="Probability"),
+     ],
+     title="Whisper Language Identification",
+     description="Demo for language identification using the fine-tuned checkpoint [mageec/whisper-tiny-hi-capstone](https://huggingface.co/mageec/whisper-tiny-hi-capstone), based on OpenAI's [Whisper](https://huggingface.co/openai/whisper-tiny).",
+     allow_flagging="never",
+ )
+ iface.queue().launch()
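
For a quick local smoke test outside Gradio, transcribe() can be called directly with a file path; "sample.wav" below is a hypothetical placeholder for any audio file on disk, not a file in this repo:

    language, probability = transcribe(None, "sample.wav")
    print(language, probability)  # e.g. "Hindi" and the model's confidence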