sanchit-gandhi committed on
Commit
d3e0df2
·
1 Parent(s): f9dc7b0

use base64 encoding for faster file transfer

Browse files
Files changed (1) hide show
  1. app.py +11 -4
app.py CHANGED
@@ -1,6 +1,9 @@
 
 
1
  import gradio as gr
2
  import requests
3
  from transformers.models.whisper.tokenization_whisper import TO_LANGUAGE_CODE
 
4
 
5
 
6
  title = "Whisper JAX: The Fastest Whisper API ⚡️"
@@ -10,7 +13,7 @@ description = "Whisper JAX is an optimised implementation of the [Whisper model]
10
 
11
  API_URL = "https://whisper-jax.ngrok.io/generate/"
12
 
13
- article = "Whisper large-v2 model by OpenAI. Backend running JAX on a TPU v4-8 through the generous support of the [TRC](https://sites.research.google/trc/about/) programme."
14
 
15
  language_names = sorted(TO_LANGUAGE_CODE.keys())
16
  SAMPLING_RATE = 16000
@@ -56,7 +59,11 @@ def transcribe_audio(microphone, file_upload, task, return_timestamps):
56
 
57
  inputs = microphone if microphone is not None else file_upload
58
 
59
- inputs = {"array": inputs[1].tolist(), "sampling_rate": inputs[0]}
 
 
 
 
60
 
61
  text, timestamps = inference(inputs=inputs, task=task, return_timestamps=return_timestamps)
62
 
@@ -83,8 +90,8 @@ def transcribe_youtube(yt_url, task, return_timestamps):
83
  audio = gr.Interface(
84
  fn=transcribe_audio,
85
  inputs=[
86
- gr.inputs.Audio(source="microphone", optional=True),
87
- gr.inputs.Audio(source="upload", optional=True),
88
  gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
89
  gr.inputs.Checkbox(default=False, label="Return timestamps"),
90
  ],
 
1
+ import base64
2
+
3
  import gradio as gr
4
  import requests
5
  from transformers.models.whisper.tokenization_whisper import TO_LANGUAGE_CODE
6
+ from transformers.pipelines.audio_utils import ffmpeg_read
7
 
8
 
9
  title = "Whisper JAX: The Fastest Whisper API ⚡️"
 
13
 
14
  API_URL = "https://whisper-jax.ngrok.io/generate/"
15
 
16
+ article = "Whisper large-v2 model by OpenAI. Backend running JAX on a TPU v4-8 through the generous support of the [TRC](https://sites.research.google/trc/about/) programme. Whisper JAX code and Gradio demo by 🤗 Hugging Face."
17
 
18
  language_names = sorted(TO_LANGUAGE_CODE.keys())
19
  SAMPLING_RATE = 16000
 
59
 
60
  inputs = microphone if microphone is not None else file_upload
61
 
62
+ with open(inputs, "rb") as f:
63
+ inputs = f.read()
64
+
65
+ inputs = ffmpeg_read(inputs, SAMPLING_RATE)
66
+ inputs = {"array": base64.b64encode(inputs.tobytes()), "sampling_rate": SAMPLING_RATE}
67
 
68
  text, timestamps = inference(inputs=inputs, task=task, return_timestamps=return_timestamps)
69
 
 
90
  audio = gr.Interface(
91
  fn=transcribe_audio,
92
  inputs=[
93
+ gr.inputs.Audio(source="microphone", optional=True, type="filepath"),
94
+ gr.inputs.Audio(source="upload", optional=True, type="filepath"),
95
  gr.inputs.Radio(["transcribe", "translate"], label="Task", default="transcribe"),
96
  gr.inputs.Checkbox(default=False, label="Return timestamps"),
97
  ],