ANYANTUDRE committed on
Commit 21f74cc · 1 Parent(s): 78d1101

refactor code
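
Move the helper modules (goai_stt, goai_stt2, goai_tts, goai_tts2, goai_traduction, utils) into a goai_helpers/ package, rebuild app.py around three tabbed interfaces (speech-to-text, translation + speech synthesis, text-to-text translation), and delete the superseded app_2.py, top-level goai_stt2.py, and whisper_notebook.ipynb.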

app.py CHANGED
@@ -1,108 +1,65 @@
 import spaces
 import torch
 
 import gradio as gr
-from transformers import pipeline
-from transformers.pipelines.audio_utils import ffmpeg_read
 
-import tempfile
 import os
-import time
-import requests
-
 from languages import get_language_names
-from subtitle import text_output, subtitle_output
 
-from huggingface_hub import login
 
 auth_token = os.getenv('HF_SPACE_TOKEN')
 login(token=auth_token)
 
 
-try:
-    import spaces
-    USING_SPACES = True
-except ImportError:
-    USING_SPACES = False
-
-def gpu_decorator(func):
-    if USING_SPACES:
-        return spaces.GPU(func)
-    else:
-        return func
-
-
-device = 0 if torch.cuda.is_available() else "cpu"
-
-@gpu_decorator
-def transcribe(inputs, model, language, batch_size, chunk_length_s, stride_length_s):
-    if inputs is None:
-        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
-
-    pipe = pipeline(
-        task="automatic-speech-recognition",
-        model=model,
-        chunk_length_s=chunk_length_s,
-        stride_length_s=stride_length_s,
-        device=device,
-    )
 
-    # Whisper's full language ID mapping
-    lang_to_id = {
-        "en": 0, "zh": 1, "de": 2, "es": 3, "ru": 4, "ko": 5, "fr": 6, "ja": 7,
-        "pt": 8, "tr": 9, "pl": 10, "ca": 11, "nl": 12, "ar": 13, "sv": 14,
-        "it": 15, "id": 16, "hi": 17, "fi": 18, "vi": 19, "he": 20, "uk": 21,
-        "el": 22, "ms": 23, "cs": 24, "ro": 25, "da": 26, "hu": 27, "ta": 28,
-        "no": 29, "th": 30, "ur": 31, "hr": 32, "bg": 33, "lt": 34, "la": 35,
-        "mi": 36, "ml": 37, "cy": 38, "sk": 39, "te": 40, "fa": 41, "lv": 42,
-        "bn": 43, "sr": 44, "az": 45, "sl": 46, "kn": 47, "et": 48, "mk": 49,
-        "br": 50, "eu": 51, "is": 52, "hy": 53, "ne": 54, "mn": 55, "bs": 56,
-        "kk": 57, "sq": 58, "sw": 59, "gl": 60, "mr": 61, "pa": 62, "si": 63,
-        "km": 64, "sn": 65, "yo": 66, "so": 67, "af": 68, "oc": 69, "ka": 70,
-        "be": 71, "tg": 72, "sd": 73, "gu": 74, "am": 75, "yi": 76, "lo": 77,
-        "uz": 78, "fo": 79, "ht": 80, "ps": 81, "tk": 82, "nn": 83, "mt": 84,
-        "sa": 85, "lb": 86, "my": 87, "bo": 88, "tl": 89, "mg": 90, "as": 91,
-        "tt": 92, "haw": 93, "ln": 94, "ha": 95, "ba": 96, "jw": 97, "su": 98
-    }
-
-    forced_decoder_ids = None
-    if model.endswith(".en") == False and language in lang_to_id:
-        forced_decoder_ids = [[2, lang_to_id[language]]] # Setting forced decoder for language
-
-    generate_kwargs = {}
-    if forced_decoder_ids:
-        generate_kwargs["forced_decoder_ids"] = forced_decoder_ids
-
-    #if model.endswith(".en") == False:
-        #generate_kwargs["task"] = task
-
-    output = pipe(inputs, batch_size=batch_size, **generate_kwargs)
-
-    transcription_text = output['text']
-
-    transcription_file_path = "transcription.txt"
-    with open(transcription_file_path, "w") as f:
-        f.write(transcription_text)
-
-    return transcription_text, transcription_file_path
 
 demo = gr.Blocks(theme=gr.themes.Ocean())
 
-mf_transcribe = gr.Interface(
-    fn=transcribe,
     inputs=[
         gr.Audio(sources=["microphone", "upload"], type="filepath"),
         gr.Dropdown(
             choices=[
                 "ArissBandoss/whisper-small-mos",
-                #"openai/whisper-tiny",
-                #"openai/whisper-base",
-                #"openai/whisper-small",
-                #"openai/whisper-medium",
-                "openai/whisper-large",
-                #"openai/whisper-large-v1",
-                #"openai/whisper-large-v3", "openai/whisper-large-v3-turbo", "distil-whisper/distil-large-v3", "xaviviro/whisper-large-v3-catalan-finetuned-v2",
             ],
             value="ArissBandoss/whisper-small-mos",
             label="Model Name"
@@ -113,15 +70,62 @@ mf_transcribe = gr.Interface(
         gr.Slider(label="Stride Length (s)", minimum=1, maximum=30, value=1, step=0.1),
     ],
     outputs=[gr.Textbox(label="Output"), gr.File(label="Download Files")],
-    title="Whisper Large V3 Turbo: Transcribe Audio",
-    description=("Transcribe long-form microphone or audio inputs with the click of a button!"),
     flagging_mode="auto",
 )
 
 
 with demo:
     gr.TabbedInterface(
-        interface_list=[mf_transcribe],
         tab_names=["Microphone & Audio file"]
     )
 
 import spaces
 import torch
+import scipy
+import torchaudio
 
 import gradio as gr
+from transformers import pipeline, set_seed
+from huggingface_hub import login
 
 import os
 from languages import get_language_names
+from goai_helpers import goai_traduction, goai_stt, goai_stt2, goai_tts, goai_tts2
 
 
 auth_token = os.getenv('HF_SPACE_TOKEN')
 login(token=auth_token)
 
 
+# list all files in the ./exples_voix directory for the voice dropdown
+AUDIO_FILES = [f for f in os.listdir('./exples_voix') if os.path.isfile(os.path.join('./exples_voix', f))]
+
+DESCRIPTION = """<div style="display: flex; justify-content: space-between; align-items: center; flex-wrap: wrap;">
+    <div style="flex: 1; min-width: 250px;">
+        Ce modèle de traduction vers la <b>langue Mooré</b> a été développé from scratch par <b>GO AI CORP</b> et la version disponible en test est celle à 700 millions de paramètres.
+        <br><br>
+        Pour les détails techniques sur l'architecture du modèle, prendre attache avec nous via WhatsApp au <b>+226 66 62 83 03</b>.
+    </div>
+    <div style="flex-shrink: 0; min-width: 150px; text-align: center;">
+        <img src="https://github.com/ANYANTUDRE/Stage-IA-Selever-GO-AI-Corp/blob/main/img/goaicorp-logo2.jpg?raw=true" width="300px" style="max-width: 100%; height: auto;">
+    </div>
+</div>
+"""
+# Whisper's full language ID mapping
+LANG_TO_ID = {
+    "en": 0, "zh": 1, "de": 2, "es": 3, "ru": 4, "ko": 5, "fr": 6, "ja": 7,
+    "pt": 8, "tr": 9, "pl": 10, "ca": 11, "nl": 12, "ar": 13, "sv": 14,
+    "it": 15, "id": 16, "hi": 17, "fi": 18, "vi": 19, "he": 20, "uk": 21,
+    "el": 22, "ms": 23, "cs": 24, "ro": 25, "da": 26, "hu": 27, "ta": 28,
+    "no": 29, "th": 30, "ur": 31, "hr": 32, "bg": 33, "lt": 34, "la": 35,
+    "mi": 36, "ml": 37, "cy": 38, "sk": 39, "te": 40, "fa": 41, "lv": 42,
+    "bn": 43, "sr": 44, "az": 45, "sl": 46, "kn": 47, "et": 48, "mk": 49,
+    "br": 50, "eu": 51, "is": 52, "hy": 53, "ne": 54, "mn": 55, "bs": 56,
+    "kk": 57, "sq": 58, "sw": 59, "gl": 60, "mr": 61, "pa": 62, "si": 63,
+    "km": 64, "sn": 65, "yo": 66, "so": 67, "af": 68, "oc": 69, "ka": 70,
+    "be": 71, "tg": 72, "sd": 73, "gu": 74, "am": 75, "yi": 76, "lo": 77,
+    "uz": 78, "fo": 79, "ht": 80, "ps": 81, "tk": 82, "nn": 83, "mt": 84,
+    "sa": 85, "lb": 86, "my": 87, "bo": 88, "tl": 89, "mg": 90, "as": 91,
+    "tt": 92, "haw": 93, "ln": 94, "ha": 95, "ba": 96, "jw": 97, "su": 98
+}
 
 
 demo = gr.Blocks(theme=gr.themes.Ocean())
 
+goai_stt = gr.Interface(
+    fn=goai_stt2.transcribe,
     inputs=[
         gr.Audio(sources=["microphone", "upload"], type="filepath"),
         gr.Dropdown(
             choices=[
                 "ArissBandoss/whisper-small-mos",
+                "openai/whisper-large-v3-turbo",
             ],
             value="ArissBandoss/whisper-small-mos",
             label="Model Name"
         gr.Slider(label="Stride Length (s)", minimum=1, maximum=30, value=1, step=0.1),
     ],
     outputs=[gr.Textbox(label="Output"), gr.File(label="Download Files")],
+    examples=[["./audios/example1.mp3", "a ye ligdi"],
+              ["./audios/example2.mp3", "zoe nimbãanega"],
+              ["./audios/example3.mp3", "zãng-zãnga"],
+              ["./audios/example4.mp3", "yõk foto"]
+             ],
+    cache_examples=False,
+    title="Mooré ASR: Transcribe Audio",
+    description=DESCRIPTION,
     flagging_mode="auto",
 )
 
+goai_tts = gr.Interface(
+    fn=goai_tts2.goai_ttt_tts,
+    inputs=[
+        gr.Text(label="Texte à traduire", lines=2, value="Par cette ouverture, le centre se veut contribuer à la formation professionnelle des jeunes et des femmes, renforcer les capacités des acteurs du monde agricole, et contribuer à la lutte contre le chômage au Burkina Faso."),
+        gr.Dropdown(label="Voix", choices=AUDIO_FILES, value="exple_voix_masculine.wav"),
+        gr.Audio(label="Cloner votre voix (optionnel)", type="numpy", format="wav"),
+    ],
+    outputs=[
+        gr.Text(label="Texte traduit"),
+        gr.Audio(label="Audio original généré", format="wav"),
+        gr.Audio(label="Denoised Audio", format="wav"),
+        gr.Audio(label="Enhanced Audio", format="wav")
+    ],
+    examples=[["Ils vont bien, merci. Mon père travaille dur dans les champs et ma mère est toujours occupée à la maison.", "exple_voix_masculine.wav", None],
+              ["La finale s’est jouée en présence du Président du Faso, Ibrahim Traoré.", "exple_voix_feminine.wav", None],
+              ["Les enfants apprennent les danses traditionnelles de leurs ancêtres, jouent à des jeux traditionnels dans les rues et aident leurs parents dans les tâches quotidiennes.", "exple_voix_masculine.wav", None],
+              ["Ils achetèrent des troupeaux, firent construire des cases, parcoururent tout le pays pour offrir à leur mère et à leurs femmes les plus beaux bijoux, les plus belles étoffes.", "exple_voix_feminine.wav", None]
+             ],
+    cache_examples=False,
+    title="Démo des Modèles pour le Mooré: Traduction (Text-to-Text) et Synthèse Vocale (Text-to-Speech)",
+    description=DESCRIPTION,
+)
+
+goai_traduction = gr.Interface(
+    fn=goai_traduction.goai_traduction,
+    inputs=[
+        gr.Textbox(label="Texte", placeholder="Yaa sõama"),
+        gr.Dropdown(label="Langue source", choices=["fra_Latn", "mos_Latn"], value='fra_Latn'),
+        gr.Dropdown(label="Langue cible", choices=["fra_Latn", "mos_Latn"], value='mos_Latn')
+    ],
+    outputs=["text"],
+    examples=[["Yʋʋm a wãn la b kẽesd biig lekolle?", "mos_Latn", "fra_Latn"],
+              ["Zak-soab la kasma.", "mos_Latn", "fra_Latn"],
+              ["Le gouvernement avait pris des mesures louables par rapport à l’augmentation des prix de certaines denrées alimentaires.", "fra_Latn", "mos_Latn"],
+              ["Comme lors du match face à la Côte d’Ivoire, c’est sur un coup de pied arrêté que les Etalons encaissent leur but.", "fra_Latn", "mos_Latn"],
+             ],
+    cache_examples=False,
+    title="Traduction du Mooré: texte vers texte",
+    description=DESCRIPTION
+)
+
+
 with demo:
     gr.TabbedInterface(
+        interface_list=[goai_stt, goai_tts, goai_traduction],
         tab_names=["Microphone & Audio file"]
     )
 
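A note on the LANG_TO_ID table above: it indexes Whisper's language list directly and hands [[2, id]] to forced_decoder_ids, which presumes a specific decoder-prompt layout. A minimal sketch of the transformers-native alternative (checkpoint name chosen purely for illustration) derives the forced IDs from the processor instead:

from transformers import WhisperProcessor

# The processor computes the (position, token_id) pairs that pin the
# language and task tokens in Whisper's decoder prompt.
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
forced_decoder_ids = processor.get_decoder_prompt_ids(language="french", task="transcribe")
# These can then be passed through generate_kwargs, as the transcribe helper does.
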
app_2.py DELETED
@@ -1,140 +0,0 @@
-import os
-import torch
-import torchaudio
-import scipy
-
-import gradio as gr
-from transformers import set_seed, pipeline
-from datasets import load_dataset, Audio
-
-import goai_traduction, goai_stt, goai_stt2, goai_tts, goai_tts2
-
-#language_list = ['mos', 'fra', 'eng']
-
-# list all files in the ./audios directory for the dropdown
-audio_files = [f for f in os.listdir('./exples_voix') if os.path.isfile(os.path.join('./exples_voix', f))]
-
-# device
-device = 0 if torch.cuda.is_available() else "cpu"
-
-# description text for each tab
-description = """<div style="display: flex; justify-content: space-between; align-items: center; flex-wrap: wrap;">
-    <div style="flex: 1; min-width: 250px;">
-        Ce modèle de traduction vers la <b>langue Mooré</b> a été développé from scratch par <b>GO AI CORP</b> et la version disponible en test est celle à 700 millions de paramètres.
-        <br><br>
-        Pour les détails techniques sur l'architecture du modèle, prendre attache avec nous via WhatsApp au <b>+226 66 62 83 03</b>.
-    </div>
-    <div style="flex-shrink: 0; min-width: 150px; text-align: center;">
-        <img src="https://github.com/ANYANTUDRE/Stage-IA-Selever-GO-AI-Corp/blob/main/img/goaicorp-logo2.jpg?raw=true" width="300px" style="max-width: 100%; height: auto;">
-    </div>
-</div>
-"""
-
-# gradio app
-demo = gr.Blocks()
-
-goai_stt = gr.Interface(
-    fn=goai_stt.goai_stt,
-    inputs=[
-        #gr.Audio(sources=["microphone", "upload"], type="numpy")
-        gr.Audio(sources=["upload"], type="numpy")
-    ],
-    outputs="text",
-    examples=[["./audios/example1.mp3", "a ye ligdi"],
-              ["./audios/example2.mp3", "zoe nimbãanega"],
-              ["./audios/example3.mp3", "zãng-zãnga"],
-              ["./audios/example4.mp3", "yõk foto"]
-             ],
-    cache_examples=False,
-    title="Traduction du Mooré: texte vers texte",
-    description=description
-)
-
-
-goai_stt2 = gr.Interface(
-    fn=goai_stt2.goai_stt2,
-    inputs=[
-        #gr.Audio(sources=["microphone", "upload"], type="numpy")
-        gr.Audio(sources=["upload"], type="numpy")
-    ],
-    outputs="text",
-    examples=[["./audios/example1.mp3", "a ye ligdi"],
-              ["./audios/example2.mp3", "zoe nimbãanega"],
-              ["./audios/example3.mp3", "zãng-zãnga"],
-              ["./audios/example4.mp3", "yõk foto"]
-             ],
-    cache_examples=False,
-    title="Traduction du Mooré: texte vers texte",
-    description=description
-)
-
-
-goai_tts = gr.Interface(
-    fn=goai_tts.goai_tts,
-    inputs=[
-        gr.Text(label="Texte", placeholder="a ye ligdi")
-    ],
-    outputs=[
-        gr.Audio(label="Audio généré", type="numpy")
-    ],
-    examples=[["a ye ligdi"],
-              ["zoe nimbãanega "],
-              ["zãng-zãnga"],
-              ["yõk foto"]
-             ],
-    cache_examples=False,
-    title="Traduction du Mooré: texte vers texte",
-    description=description
-)
-
-goai_tts2 = gr.Interface(
-    fn=goai_tts2.goai_ttt_tts,
-    inputs=[
-        gr.Text(label="Texte à traduire", lines=2, value="Par cette ouverture, le centre se veut contribuer à la formation professionnelle des jeunes et des femmes, renforcer les capacités des acteurs du monde agricole, et contribuer à la lutte contre le chômage au Burkina Faso."),
-        gr.Dropdown(label="Voix", choices=audio_files, value="exple_voix_masculine.wav"),
-        gr.Audio(label="Cloner votre voix (optionel)", type="numpy", format="wav"),
-    ],
-    outputs=[
-        gr.Text(label="Texte traduit"),
-        gr.Audio(label="Audio original généré", format="wav"),
-        gr.Audio(label="Denoised Audio", format='wav'),
-        gr.Audio(label="Enhanced Audio", format='wav')
-    ],
-    examples=[["Ils vont bien, merci. Mon père travaille dur dans les champs et ma mère est toujours occupée à la maison.", "exple_voix_masculine.wav", None],
-              ["La finale s’est jouée en présence du Président du Faso, Ibrahim Traoré.", "exple_voix_feminine.wav", None],
-              ["Les enfants apprennent les danses traditionnelles de leurs ancêtres, jouent à des jeux traditionnels dans les rues et aident leurs parents dans les tâches quotidiennes.", "exple_voix_masculine.wav", None],
-              ["Ils achetèrent des troupeaux, firent construire des cases, parcoururent tout le pays pour offrir à leur mère et à leurs femmes les plus beaux bijoux, les plus belles étoffes.", "exple_voix_feminine.wav", None]
-             ],
-    cache_examples=False,
-    title="Démo des Modèles pour le Mooré: Traduction (Text-to-Text) et Synthèse Vocale (Text-to-Speech)",
-    description=description,
-)
-
-goai_traduction = gr.Interface(
-    fn=goai_traduction.goai_traduction,
-    inputs=[
-        gr.Textbox(label="Texte", placeholder="Yaa sõama"),
-        gr.Dropdown(label="Langue source", choices=["fra_Latn", "mos_Latn"], value='fra_Latn'),
-        gr.Dropdown(label="Langue cible", choices=["fra_Latn", "mos_Latn"], value='mos_Latn')
-    ],
-    outputs=["text"],
-    examples=[["Yʋʋm a wãn la b kẽesd biig lekolle?", "mos_Latn", "fra_Latn"],
-              ["Zak-soab la kasma.", "mos_Latn", "fra_Latn"],
-              ["Le gouvernement avait pris des mesures louables par rapport à l’augmentation des prix de certaines denrées alimentaires.", "fra_Latn", "mos_Latn"],
-              ["Comme lors du match face à la Côte d’Ivoire, c’est sur un coup de pied arrêté que les Etalons encaissent leur but.", "fra_Latn", "mos_Latn"],
-             ],
-    cache_examples=False,
-    title="Traduction du Mooré: texte vers texte",
-    description=description
-)
-
-
-with demo:
-    gr.TabbedInterface(
-        #[goai_traduction, goai_tts, goai_stt, goai_tts2],
-        #["Traduction", "Text-2-speech", "Speech-2-text", "Text-2-speech-2"],
-        [goai_tts2, goai_stt, goai_stt2],
-        ["Traduction et Synthèse vocale du Mooré", "Speech-2-text", "Speech-2-text-Whisper"],
-    )
-
-demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
goai_helpers/__init__.py ADDED
File without changes
goai_stt.py → goai_helpers/goai_stt.py RENAMED
File without changes
goai_helpers/goai_stt2.py ADDED
@@ -0,0 +1,41 @@
+import torch
+import spaces
+import gradio as gr
+from transformers import pipeline
+
+DEVICE = 0 if torch.cuda.is_available() else "cpu"
+
+
+@spaces.GPU
+def transcribe(inputs, model, language, batch_size, chunk_length_s, stride_length_s):
+    if inputs is None:
+        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
+
+    pipe = pipeline(
+        task="automatic-speech-recognition",
+        model=model,
+        chunk_length_s=chunk_length_s,
+        stride_length_s=stride_length_s,
+        device=DEVICE,
+    )
+
+
+    # NOTE: LANG_TO_ID (Whisper's language-to-ID mapping) is defined in app.py
+    # in this commit and is assumed to be in scope here.
+    forced_decoder_ids = None
+    if not model.endswith(".en") and language in LANG_TO_ID:
+        forced_decoder_ids = [[2, LANG_TO_ID[language]]]  # force the decoder to the selected language
+
+    generate_kwargs = {}
+    if forced_decoder_ids:
+        generate_kwargs["forced_decoder_ids"] = forced_decoder_ids
+
+    output = pipe(inputs, batch_size=batch_size, **generate_kwargs)
+
+    transcription_text = output['text']
+
+    transcription_file_path = "transcription.txt"
+    with open(transcription_file_path, "w") as f:
+        f.write(transcription_text)
+
+    return transcription_text, transcription_file_path
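
A minimal usage sketch for this helper (the audio path and parameter values below are hypothetical; in the Space, app.py's goai_stt interface supplies them from its widgets):

from goai_helpers.goai_stt2 import transcribe

# Hypothetical direct call mirroring the Gradio inputs. "mos" is not in
# LANG_TO_ID, so no forced decoder IDs are set for the Mooré checkpoint.
text, txt_path = transcribe(
    "sample.wav",                      # hypothetical audio file path
    model="ArissBandoss/whisper-small-mos",
    language="mos",
    batch_size=8,
    chunk_length_s=30,
    stride_length_s=1,
)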
goai_traduction.py → goai_helpers/goai_traduction.py RENAMED
File without changes
goai_tts.py → goai_helpers/goai_tts.py RENAMED
@@ -6,9 +6,11 @@ import numpy as np
 from transformers import pipeline, set_seed
 from huggingface_hub import login
 
-#auth_token = os.environ["HF_SPACE_TOKEN"]
+
 auth_token = os.getenv('HF_SPACE_TOKEN')
 login(token=auth_token)
+
+
 @spaces.GPU
 def goai_tts(texte):
     """
goai_tts2.py → goai_helpers/goai_tts2.py RENAMED
@@ -18,7 +18,7 @@ from TTS.tts.models.xtts import Xtts
 
 from resemble_enhance.enhancer.inference import denoise, enhance
 from flore200_codes import flores_codes
-from utils import download_file, diviser_phrases_moore, enhance_speech
+from goai_helpers.utils import download_file, diviser_phrases_moore, enhance_speech
 import goai_traduction
 
 # authentication
utils.py → goai_helpers/utils.py RENAMED
File without changes
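
After this commit the helper modules live in a single package; the layout implied by the adds and renames above is:

goai_helpers/
    __init__.py
    goai_stt.py
    goai_stt2.py
    goai_traduction.py
    goai_tts.py
    goai_tts2.py
    utils.py

One caveat: goai_helpers/goai_tts2.py still uses a bare `import goai_traduction` (an unchanged context line above), which may fail once the module sits inside the package unless the package directory is on sys.path; a package-qualified import, like the updated goai_helpers.utils line, would be the consistent fix.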
goai_stt2.py DELETED
@@ -1,32 +0,0 @@
-import torch
-
-import gradio as gr
-#import yt_dlp as youtube_dl
-from transformers import pipeline
-from transformers.pipelines.audio_utils import ffmpeg_read
-
-import spaces
-import tempfile
-import os
-
-
-MODEL_NAME = "ArissBandoss/whisper-small-mos"
-BATCH_SIZE = 8
-
-device = 0 if torch.cuda.is_available() else "cpu"
-
-pipe = pipeline(
-    task="automatic-speech-recognition",
-    model=MODEL_NAME,
-    chunk_length_s=30,
-    device=device,
-)
-
-@spaces.GPU
-def goai_stt2(inputs, task):
-    print(inputs)
-    if inputs is None:
-        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
-
-    text = pipe(inputs[1], batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
-    return text
 
whisper_notebook.ipynb DELETED
@@ -1,192 +0,0 @@
-{
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "provenance": [],
-      "gpuType": "T4"
-    },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
-    },
-    "language_info": {
-      "name": "python"
-    },
-    "accelerator": "GPU"
-  },
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "source": [
-        "# Whisper v3 is here!\n",
-        "\n",
-        "Whisper v3 is a new model open sourced by OpenAI. The model can do multilingual transcriptions and is quite impressive. For example, you can change from English to Spanish or Chinese in the middle of a sentence and it will work well!\n",
-        "\n",
-        "The model can be run in a free Google Colab instance and is integrated into `transformers` already, so switching can be a very smooth process if you already use the previous versions."
-      ],
-      "metadata": {
-        "id": "OXaUqiE-eyXM"
-      }
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "WFQeUT9EcIcK"
-      },
-      "outputs": [],
-      "source": [
-        "%%capture\n",
-        "!pip install git+https://github.com/huggingface/transformers gradio"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "source": [
-        "Let's use the high level `pipeline` from the `transformers` library to load the model."
-      ],
-      "metadata": {
-        "id": "sZONes21fHTA"
-      }
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "import torch\n",
-        "from transformers import pipeline\n",
-        "\n",
-        "pipe = pipeline(\"automatic-speech-recognition\",\n",
-        "                \"openai/whisper-large-v3\",\n",
-        "                torch_dtype=torch.float16,\n",
-        "                device=\"cuda:0\")"
-      ],
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "DvBdwMdPcr-Y",
-        "outputId": "47f32218-fd85-49ea-d880-d31577bcf9b8"
-      },
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
-            "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
-          ]
-        }
-      ]
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "pipe(\"https://cdn-media.huggingface.co/speech_samples/sample1.flac\")"
-      ],
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "GZFkIyhjc0Nc",
-        "outputId": "f1463431-3e08-4438-815f-b71e5e7a1503"
-      },
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "execute_result",
-          "data": {
-            "text/plain": [
-              "{'text': \" going along slushy country roads and speaking to damp audiences in draughty schoolrooms day after day for a fortnight he'll have to put in an appearance at some place of worship on sunday morning and he can come to us immediately afterwards\"}"
-            ]
-          },
-          "metadata": {},
-          "execution_count": 2
-        }
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "source": [
-        "Let's now build a quick Gradio demo where we can play with the model directly using our microphone! You can run this code in a Google Colab instance (or locally!) or just head to the <a href=\"https://huggingface.co/spaces/hf-audio/whisper-large-v3\" target=\"_blank\">Space</a> to play directly with it online."
-      ],
-      "metadata": {
-        "id": "pt3YtM_PfTQY"
-      }
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "import gradio as gr\n",
-        "\n",
-        "def transcribe(inputs):\n",
-        "    if inputs is None:\n",
-        "        raise gr.Error(\"No audio file submitted! Please record an audio before submitting your request.\")\n",
-        "\n",
-        "    text = pipe(inputs, generate_kwargs={\"task\": \"transcribe\"}, return_timestamps=True)[\"text\"]\n",
-        "    return text\n",
-        "\n",
-        "demo = gr.Interface(\n",
-        "    fn=transcribe,\n",
-        "    inputs=[\n",
-        "        gr.Audio(sources=[\"microphone\", \"upload\"], type=\"filepath\"),\n",
-        "    ],\n",
-        "    outputs=\"text\",\n",
-        "    title=\"Whisper Large V3: Transcribe Audio\",\n",
-        "    description=(\n",
-        "        \"Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the\"\n",
-        "        \" checkpoint [openai/whisper-large-v3](https://huggingface.co/openai/whisper-large-v3) and 🤗 Transformers to transcribe audio files\"\n",
-        "        \" of arbitrary length.\"\n",
-        "    ),\n",
-        "    allow_flagging=\"never\",\n",
-        ")\n",
-        "\n",
-        "demo.launch()\n"
-      ],
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 648
-        },
-        "id": "K0b2UZLVdIze",
-        "outputId": "bcff00e0-4fc8-4883-9ba4-480f5a6665f0"
-      },
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).\n",
-            "\n",
-            "Colab notebook detected. To show errors in colab notebook, set debug=True in launch()\n",
-            "Running on public URL: https://037dbdb04542aa1a29.gradio.live\n",
-            "\n",
-            "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n"
-          ]
-        },
-        {
-          "output_type": "display_data",
-          "data": {
-            "text/plain": [
-              "<IPython.core.display.HTML object>"
-            ],
-            "text/html": [
-              "<div><iframe src=\"https://037dbdb04542aa1a29.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
-            ]
-          },
-          "metadata": {}
-        },
-        {
-          "output_type": "execute_result",
-          "data": {
-            "text/plain": []
-          },
-          "metadata": {},
-          "execution_count": 4
-        }
-      ]
-    }
-  ]
-}