ANYANTUDRE committed on
Commit 21f74cc · 1 Parent(s): 78d1101

refactor code
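
Move the helper modules (goai_stt, goai_stt2, goai_tts, goai_tts2, goai_traduction, utils) into a goai_helpers/ package, rebuild app.py around three tabbed interfaces (speech-to-text, translation + speech synthesis, text-to-text translation), and delete the superseded app_2.py, top-level goai_stt2.py, and whisper_notebook.ipynb.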

app.py CHANGED
@@ -1,108 +1,65 @@
 import spaces
 import torch
 
 import gradio as gr
-from transformers import pipeline
-from transformers.pipelines.audio_utils import ffmpeg_read
 
-import tempfile
 import os
-import time
-import requests
-
 from languages import get_language_names
-from subtitle import text_output, subtitle_output
 
-from huggingface_hub import login
 
 auth_token = os.getenv('HF_SPACE_TOKEN')
 login(token=auth_token)
 
 
-try:
-    import spaces
-    USING_SPACES = True
-except ImportError:
-    USING_SPACES = False
-
-def gpu_decorator(func):
-    if USING_SPACES:
-        return spaces.GPU(func)
-    else:
-        return func
-
-
-device = 0 if torch.cuda.is_available() else "cpu"
-
-@gpu_decorator
-def transcribe(inputs, model, language, batch_size, chunk_length_s, stride_length_s):
-    if inputs is None:
-        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
-
-    pipe = pipeline(
-        task="automatic-speech-recognition",
-        model=model,
-        chunk_length_s=chunk_length_s,
-        stride_length_s=stride_length_s,
-        device=device,
-    )
 
-    # Whisper's full language ID mapping
-    lang_to_id = {
-        "en": 0, "zh": 1, "de": 2, "es": 3, "ru": 4, "ko": 5, "fr": 6, "ja": 7,
-        "pt": 8, "tr": 9, "pl": 10, "ca": 11, "nl": 12, "ar": 13, "sv": 14,
-        "it": 15, "id": 16, "hi": 17, "fi": 18, "vi": 19, "he": 20, "uk": 21,
-        "el": 22, "ms": 23, "cs": 24, "ro": 25, "da": 26, "hu": 27, "ta": 28,
-        "no": 29, "th": 30, "ur": 31, "hr": 32, "bg": 33, "lt": 34, "la": 35,
-        "mi": 36, "ml": 37, "cy": 38, "sk": 39, "te": 40, "fa": 41, "lv": 42,
-        "bn": 43, "sr": 44, "az": 45, "sl": 46, "kn": 47, "et": 48, "mk": 49,
-        "br": 50, "eu": 51, "is": 52, "hy": 53, "ne": 54, "mn": 55, "bs": 56,
-        "kk": 57, "sq": 58, "sw": 59, "gl": 60, "mr": 61, "pa": 62, "si": 63,
-        "km": 64, "sn": 65, "yo": 66, "so": 67, "af": 68, "oc": 69, "ka": 70,
-        "be": 71, "tg": 72, "sd": 73, "gu": 74, "am": 75, "yi": 76, "lo": 77,
-        "uz": 78, "fo": 79, "ht": 80, "ps": 81, "tk": 82, "nn": 83, "mt": 84,
-        "sa": 85, "lb": 86, "my": 87, "bo": 88, "tl": 89, "mg": 90, "as": 91,
-        "tt": 92, "haw": 93, "ln": 94, "ha": 95, "ba": 96, "jw": 97, "su": 98
-    }
-
-    forced_decoder_ids = None
-    if model.endswith(".en") == False and language in lang_to_id:
-        forced_decoder_ids = [[2, lang_to_id[language]]] # Setting forced decoder for language
-
-    generate_kwargs = {}
-    if forced_decoder_ids:
-        generate_kwargs["forced_decoder_ids"] = forced_decoder_ids
-
-    #if model.endswith(".en") == False:
-        #generate_kwargs["task"] = task
-
-    output = pipe(inputs, batch_size=batch_size, **generate_kwargs)
-
-    transcription_text = output['text']
-
-    transcription_file_path = "transcription.txt"
-    with open(transcription_file_path, "w") as f:
-        f.write(transcription_text)
-
-    return transcription_text, transcription_file_path
 
 demo = gr.Blocks(theme=gr.themes.Ocean())
 
-mf_transcribe = gr.Interface(
-    fn=transcribe,
     inputs=[
         gr.Audio(sources=["microphone", "upload"], type="filepath"),
         gr.Dropdown(
             choices=[
                 "ArissBandoss/whisper-small-mos",
-                #"openai/whisper-tiny",
-                #"openai/whisper-base",
-                #"openai/whisper-small",
-                #"openai/whisper-medium",
-                "openai/whisper-large",
-                #"openai/whisper-large-v1",
-                #"openai/whisper-large-v3", "openai/whisper-large-v3-turbo", "distil-whisper/distil-large-v3", "xaviviro/whisper-large-v3-catalan-finetuned-v2",
             ],
             value="ArissBandoss/whisper-small-mos",
             label="Model Name"
@@ -113,15 +70,62 @@ mf_transcribe = gr.Interface(
         gr.Slider(label="Stride Length (s)", minimum=1, maximum=30, value=1, step=0.1),
     ],
     outputs=[gr.Textbox(label="Output"), gr.File(label="Download Files")],
-    title="Whisper Large V3 Turbo: Transcribe Audio",
-    description=("Transcribe long-form microphone or audio inputs with the click of a button!"),
     flagging_mode="auto",
 )
 
 
 with demo:
     gr.TabbedInterface(
-        interface_list=[mf_transcribe],
         tab_names=["Microphone & Audio file"]
     )
 
 import spaces
 import torch
+import scipy
+import torchaudio
 
 import gradio as gr
+from transformers import pipeline, set_seed
+from huggingface_hub import login
 
 import os
 from languages import get_language_names
+from goai_helpers import goai_traduction, goai_stt, goai_stt2, goai_tts, goai_tts2
 
 
 auth_token = os.getenv('HF_SPACE_TOKEN')
 login(token=auth_token)
 
 
+# list all files in the ./exples_voix directory for the voice dropdown
+AUDIO_FILES = [f for f in os.listdir('./exples_voix') if os.path.isfile(os.path.join('./exples_voix', f))]
+
+DESCRIPTION = """<div style="display: flex; justify-content: space-between; align-items: center; flex-wrap: wrap;">
+    <div style="flex: 1; min-width: 250px;">
+        Ce modèle de traduction vers la <b>langue Mooré</b> a été développé from scratch par <b>GO AI CORP</b> et la version disponible en test est celle à 700 millions de paramètres.
+        <br><br>
+        Pour les détails techniques sur l'architecture du modèle, prendre attache avec nous via WhatsApp au <b>+226 66 62 83 03</b>.
+    </div>
+    <div style="flex-shrink: 0; min-width: 150px; text-align: center;">
+        <img src="https://github.com/ANYANTUDRE/Stage-IA-Selever-GO-AI-Corp/blob/main/img/goaicorp-logo2.jpg?raw=true" width="300px" style="max-width: 100%; height: auto;">
+    </div>
+</div>
+"""
+# Whisper's full language ID mapping
+LANG_TO_ID = {
+    "en": 0, "zh": 1, "de": 2, "es": 3, "ru": 4, "ko": 5, "fr": 6, "ja": 7,
+    "pt": 8, "tr": 9, "pl": 10, "ca": 11, "nl": 12, "ar": 13, "sv": 14,
+    "it": 15, "id": 16, "hi": 17, "fi": 18, "vi": 19, "he": 20, "uk": 21,
+    "el": 22, "ms": 23, "cs": 24, "ro": 25, "da": 26, "hu": 27, "ta": 28,
+    "no": 29, "th": 30, "ur": 31, "hr": 32, "bg": 33, "lt": 34, "la": 35,
+    "mi": 36, "ml": 37, "cy": 38, "sk": 39, "te": 40, "fa": 41, "lv": 42,
+    "bn": 43, "sr": 44, "az": 45, "sl": 46, "kn": 47, "et": 48, "mk": 49,
+    "br": 50, "eu": 51, "is": 52, "hy": 53, "ne": 54, "mn": 55, "bs": 56,
+    "kk": 57, "sq": 58, "sw": 59, "gl": 60, "mr": 61, "pa": 62, "si": 63,
+    "km": 64, "sn": 65, "yo": 66, "so": 67, "af": 68, "oc": 69, "ka": 70,
+    "be": 71, "tg": 72, "sd": 73, "gu": 74, "am": 75, "yi": 76, "lo": 77,
+    "uz": 78, "fo": 79, "ht": 80, "ps": 81, "tk": 82, "nn": 83, "mt": 84,
+    "sa": 85, "lb": 86, "my": 87, "bo": 88, "tl": 89, "mg": 90, "as": 91,
+    "tt": 92, "haw": 93, "ln": 94, "ha": 95, "ba": 96, "jw": 97, "su": 98
+}
 
 
 demo = gr.Blocks(theme=gr.themes.Ocean())
 
+goai_stt = gr.Interface(
+    fn=goai_stt2.transcribe,
     inputs=[
         gr.Audio(sources=["microphone", "upload"], type="filepath"),
         gr.Dropdown(
             choices=[
                 "ArissBandoss/whisper-small-mos",
+                "openai/whisper-large-v3-turbo",
             ],
             value="ArissBandoss/whisper-small-mos",
             label="Model Name"
         gr.Slider(label="Stride Length (s)", minimum=1, maximum=30, value=1, step=0.1),
     ],
     outputs=[gr.Textbox(label="Output"), gr.File(label="Download Files")],
+    examples=[["./audios/example1.mp3", "a ye ligdi"],
+              ["./audios/example2.mp3", "zoe nimbãanega"],
+              ["./audios/example3.mp3", "zãng-zãnga"],
+              ["./audios/example4.mp3", "yõk foto"]
+             ],
+    cache_examples=False,
+    title="Mooré ASR: Transcribe Audio",
+    description=DESCRIPTION,
     flagging_mode="auto",
 )
 
+goai_tts = gr.Interface(
+    fn=goai_tts2.goai_ttt_tts,
+    inputs=[
+        gr.Text(label="Texte à traduire", lines=2, value="Par cette ouverture, le centre se veut contribuer à la formation professionnelle des jeunes et des femmes, renforcer les capacités des acteurs du monde agricole, et contribuer à la lutte contre le chômage au Burkina Faso."),
+        gr.Dropdown(label="Voix", choices=AUDIO_FILES, value="exple_voix_masculine.wav"),
+        gr.Audio(label="Cloner votre voix (optionnel)", type="numpy", format="wav"),
+    ],
+    outputs=[
+        gr.Text(label="Texte traduit"),
+        gr.Audio(label="Audio original généré", format="wav"),
+        gr.Audio(label="Denoised Audio", format="wav"),
+        gr.Audio(label="Enhanced Audio", format="wav")
+    ],
+    examples=[["Ils vont bien, merci. Mon père travaille dur dans les champs et ma mère est toujours occupée à la maison.", "exple_voix_masculine.wav", None],
+              ["La finale s’est jouée en présence du Président du Faso, Ibrahim Traoré.", "exple_voix_feminine.wav", None],
+              ["Les enfants apprennent les danses traditionnelles de leurs ancêtres, jouent à des jeux traditionnels dans les rues et aident leurs parents dans les tâches quotidiennes.", "exple_voix_masculine.wav", None],
+              ["Ils achetèrent des troupeaux, firent construire des cases, parcoururent tout le pays pour offrir à leur mère et à leurs femmes les plus beaux bijoux, les plus belles étoffes.", "exple_voix_feminine.wav", None]
+             ],
+    cache_examples=False,
+    title="Démo des Modèles pour le Mooré: Traduction (Text-to-Text) et Synthèse Vocale (Text-to-Speech)",
+    description=DESCRIPTION,
+)
+
+goai_traduction = gr.Interface(
+    fn=goai_traduction.goai_traduction,
+    inputs=[
+        gr.Textbox(label="Texte", placeholder="Yaa sõama"),
+        gr.Dropdown(label="Langue source", choices=["fra_Latn", "mos_Latn"], value='fra_Latn'),
+        gr.Dropdown(label="Langue cible", choices=["fra_Latn", "mos_Latn"], value='mos_Latn')
+    ],
+    outputs=["text"],
+    examples=[["Yʋʋm a wãn la b kẽesd biig lekolle?", "mos_Latn", "fra_Latn"],
+              ["Zak-soab la kasma.", "mos_Latn", "fra_Latn"],
+              ["Le gouvernement avait pris des mesures louables par rapport à l’augmentation des prix de certaines denrées alimentaires.", "fra_Latn", "mos_Latn"],
+              ["Comme lors du match face à la Côte d’Ivoire, c’est sur un coup de pied arrêté que les Etalons encaissent leur but.", "fra_Latn", "mos_Latn"],
+             ],
+    cache_examples=False,
+    title="Traduction du Mooré: texte vers texte",
+    description=DESCRIPTION
+)
+
+
 with demo:
     gr.TabbedInterface(
+        interface_list=[goai_stt, goai_tts, goai_traduction],
         tab_names=["Microphone & Audio file"]
     )
 
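A note on the LANG_TO_ID table above: it indexes Whisper's language list directly and hands [[2, id]] to forced_decoder_ids, which presumes a specific decoder-prompt layout. A minimal sketch of the transformers-native alternative (checkpoint name chosen purely for illustration) derives the forced IDs from the processor instead:

from transformers import WhisperProcessor

# The processor computes the (position, token_id) pairs that pin the
# language and task tokens in Whisper's decoder prompt.
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
forced_decoder_ids = processor.get_decoder_prompt_ids(language="french", task="transcribe")
# These can then be passed through generate_kwargs, as the transcribe helper does.
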
app_2.py DELETED
@@ -1,140 +0,0 @@
-import os
-import torch
-import torchaudio
-import scipy
-
-import gradio as gr
-from transformers import set_seed, pipeline
-from datasets import load_dataset, Audio
-
-import goai_traduction, goai_stt, goai_stt2, goai_tts, goai_tts2
-
-#language_list = ['mos', 'fra', 'eng']
-
-# list all files in the ./audios directory for the dropdown
-audio_files = [f for f in os.listdir('./exples_voix') if os.path.isfile(os.path.join('./exples_voix', f))]
-
-# device
-device = 0 if torch.cuda.is_available() else "cpu"
-
-# description text for each tab
-description = """<div style="display: flex; justify-content: space-between; align-items: center; flex-wrap: wrap;">
-    <div style="flex: 1; min-width: 250px;">
-        Ce modèle de traduction vers la <b>langue Mooré</b> a été développé from scratch par <b>GO AI CORP</b> et la version disponible en test est celle à 700 millions de paramètres.
-        <br><br>
-        Pour les détails techniques sur l'architecture du modèle, prendre attache avec nous via WhatsApp au <b>+226 66 62 83 03</b>.
-    </div>
-    <div style="flex-shrink: 0; min-width: 150px; text-align: center;">
-        <img src="https://github.com/ANYANTUDRE/Stage-IA-Selever-GO-AI-Corp/blob/main/img/goaicorp-logo2.jpg?raw=true" width="300px" style="max-width: 100%; height: auto;">
-    </div>
-</div>
-"""
-
-# gradio app
-demo = gr.Blocks()
-
-goai_stt = gr.Interface(
-    fn=goai_stt.goai_stt,
-    inputs=[
-        #gr.Audio(sources=["microphone", "upload"], type="numpy")
-        gr.Audio(sources=["upload"], type="numpy")
-    ],
-    outputs="text",
-    examples=[["./audios/example1.mp3", "a ye ligdi"],
-              ["./audios/example2.mp3", "zoe nimbãanega"],
-              ["./audios/example3.mp3", "zãng-zãnga"],
-              ["./audios/example4.mp3", "yõk foto"]
-             ],
-    cache_examples=False,
-    title="Traduction du Mooré: texte vers texte",
-    description=description
-)
-
-
-goai_stt2 = gr.Interface(
-    fn=goai_stt2.goai_stt2,
-    inputs=[
-        #gr.Audio(sources=["microphone", "upload"], type="numpy")
-        gr.Audio(sources=["upload"], type="numpy")
-    ],
-    outputs="text",
-    examples=[["./audios/example1.mp3", "a ye ligdi"],
-              ["./audios/example2.mp3", "zoe nimbãanega"],
-              ["./audios/example3.mp3", "zãng-zãnga"],
-              ["./audios/example4.mp3", "yõk foto"]
-             ],
-    cache_examples=False,
-    title="Traduction du Mooré: texte vers texte",
-    description=description
-)
-
-
-goai_tts = gr.Interface(
-    fn=goai_tts.goai_tts,
-    inputs=[
-        gr.Text(label="Texte", placeholder="a ye ligdi")
-    ],
-    outputs=[
-        gr.Audio(label="Audio généré", type="numpy")
-    ],
-    examples=[["a ye ligdi"],
-              ["zoe nimbãanega "],
-              ["zãng-zãnga"],
-              ["yõk foto"]
-             ],
-    cache_examples=False,
-    title="Traduction du Mooré: texte vers texte",
-    description=description
-)
-
-goai_tts2 = gr.Interface(
-    fn=goai_tts2.goai_ttt_tts,
-    inputs=[
-        gr.Text(label="Texte à traduire", lines=2, value="Par cette ouverture, le centre se veut contribuer à la formation professionnelle des jeunes et des femmes, renforcer les capacités des acteurs du monde agricole, et contribuer à la lutte contre le chômage au Burkina Faso."),
-        gr.Dropdown(label="Voix", choices=audio_files, value="exple_voix_masculine.wav"),
-        gr.Audio(label="Cloner votre voix (optionel)", type="numpy", format="wav"),
-    ],
-    outputs=[
-        gr.Text(label="Texte traduit"),
-        gr.Audio(label="Audio original généré", format="wav"),
-        gr.Audio(label="Denoised Audio", format='wav'),
-        gr.Audio(label="Enhanced Audio", format='wav')
-    ],
-    examples=[["Ils vont bien, merci. Mon père travaille dur dans les champs et ma mère est toujours occupée à la maison.", "exple_voix_masculine.wav", None],
-              ["La finale s’est jouée en présence du Président du Faso, Ibrahim Traoré.", "exple_voix_feminine.wav", None],
-              ["Les enfants apprennent les danses traditionnelles de leurs ancêtres, jouent à des jeux traditionnels dans les rues et aident leurs parents dans les tâches quotidiennes.", "exple_voix_masculine.wav", None],
-              ["Ils achetèrent des troupeaux, firent construire des cases, parcoururent tout le pays pour offrir à leur mère et à leurs femmes les plus beaux bijoux, les plus belles étoffes.", "exple_voix_feminine.wav", None]
-             ],
-    cache_examples=False,
-    title="Démo des Modèles pour le Mooré: Traduction (Text-to-Text) et Synthèse Vocale (Text-to-Speech)",
-    description=description,
-)
-
-goai_traduction = gr.Interface(
-    fn=goai_traduction.goai_traduction,
-    inputs=[
-        gr.Textbox(label="Texte", placeholder="Yaa sõama"),
-        gr.Dropdown(label="Langue source", choices=["fra_Latn", "mos_Latn"], value='fra_Latn'),
-        gr.Dropdown(label="Langue cible", choices=["fra_Latn", "mos_Latn"], value='mos_Latn')
-    ],
-    outputs=["text"],
-    examples=[["Yʋʋm a wãn la b kẽesd biig lekolle?", "mos_Latn", "fra_Latn"],
-              ["Zak-soab la kasma.", "mos_Latn", "fra_Latn"],
-              ["Le gouvernement avait pris des mesures louables par rapport à l’augmentation des prix de certaines denrées alimentaires.", "fra_Latn", "mos_Latn"],
-              ["Comme lors du match face à la Côte d’Ivoire, c’est sur un coup de pied arrêté que les Etalons encaissent leur but.", "fra_Latn", "mos_Latn"],
-             ],
-    cache_examples=False,
-    title="Traduction du Mooré: texte vers texte",
-    description=description
-)
-
-
-with demo:
-    gr.TabbedInterface(
-        #[goai_traduction, goai_tts, goai_stt, goai_tts2],
-        #["Traduction", "Text-2-speech", "Speech-2-text", "Text-2-speech-2"],
-        [goai_tts2, goai_stt, goai_stt2],
-        ["Traduction et Synthèse vocale du Mooré", "Speech-2-text", "Speech-2-text-Whisper"],
-    )
-
-demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
goai_helpers/__init__.py ADDED
File without changes
goai_stt.py → goai_helpers/goai_stt.py RENAMED
File without changes
goai_helpers/goai_stt2.py ADDED
@@ -0,0 +1,41 @@
+import torch
+import spaces
+import gradio as gr
+from transformers import pipeline
+
+DEVICE = 0 if torch.cuda.is_available() else "cpu"
+
+
+@spaces.GPU
+def transcribe(inputs, model, language, batch_size, chunk_length_s, stride_length_s):
+    if inputs is None:
+        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
+
+    pipe = pipeline(
+        task="automatic-speech-recognition",
+        model=model,
+        chunk_length_s=chunk_length_s,
+        stride_length_s=stride_length_s,
+        device=DEVICE,
+    )
+
+
+    # NOTE: LANG_TO_ID (Whisper's language-to-ID mapping) is defined in app.py
+    # in this commit and is assumed to be in scope here.
+    forced_decoder_ids = None
+    if not model.endswith(".en") and language in LANG_TO_ID:
+        forced_decoder_ids = [[2, LANG_TO_ID[language]]]  # force the decoder to the selected language
+
+    generate_kwargs = {}
+    if forced_decoder_ids:
+        generate_kwargs["forced_decoder_ids"] = forced_decoder_ids
+
+    output = pipe(inputs, batch_size=batch_size, **generate_kwargs)
+
+    transcription_text = output['text']
+
+    transcription_file_path = "transcription.txt"
+    with open(transcription_file_path, "w") as f:
+        f.write(transcription_text)
+
+    return transcription_text, transcription_file_path
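
A minimal usage sketch for this helper (the audio path and parameter values below are hypothetical; in the Space, app.py's goai_stt interface supplies them from its widgets):

from goai_helpers.goai_stt2 import transcribe

# Hypothetical direct call mirroring the Gradio inputs. "mos" is not in
# LANG_TO_ID, so no forced decoder IDs are set for the Mooré checkpoint.
text, txt_path = transcribe(
    "sample.wav",                      # hypothetical audio file path
    model="ArissBandoss/whisper-small-mos",
    language="mos",
    batch_size=8,
    chunk_length_s=30,
    stride_length_s=1,
)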
goai_traduction.py → goai_helpers/goai_traduction.py RENAMED
File without changes
goai_tts.py → goai_helpers/goai_tts.py RENAMED
@@ -6,9 +6,11 @@ import numpy as np
 from transformers import pipeline, set_seed
 from huggingface_hub import login
 
-#auth_token = os.environ["HF_SPACE_TOKEN"]
+
 auth_token = os.getenv('HF_SPACE_TOKEN')
 login(token=auth_token)
+
+
 @spaces.GPU
 def goai_tts(texte):
     """
goai_tts2.py → goai_helpers/goai_tts2.py RENAMED
@@ -18,7 +18,7 @@ from TTS.tts.models.xtts import Xtts
 
 from resemble_enhance.enhancer.inference import denoise, enhance
 from flore200_codes import flores_codes
-from utils import download_file, diviser_phrases_moore, enhance_speech
+from goai_helpers.utils import download_file, diviser_phrases_moore, enhance_speech
 import goai_traduction
 
 # authentication
utils.py → goai_helpers/utils.py RENAMED
File without changes
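
After this commit the helper modules live in a single package; the layout implied by the adds and renames above is:

goai_helpers/
    __init__.py
    goai_stt.py
    goai_stt2.py
    goai_traduction.py
    goai_tts.py
    goai_tts2.py
    utils.py

One caveat: goai_helpers/goai_tts2.py still uses a bare `import goai_traduction` (an unchanged context line above), which may fail once the module sits inside the package unless the package directory is on sys.path; a package-qualified import, like the updated goai_helpers.utils line, would be the consistent fix.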
goai_stt2.py DELETED
@@ -1,32 +0,0 @@
-import torch
-
-import gradio as gr
-#import yt_dlp as youtube_dl
-from transformers import pipeline
-from transformers.pipelines.audio_utils import ffmpeg_read
-
-import spaces
-import tempfile
-import os
-
-
-MODEL_NAME = "ArissBandoss/whisper-small-mos"
-BATCH_SIZE = 8
-
-device = 0 if torch.cuda.is_available() else "cpu"
-
-pipe = pipeline(
-    task="automatic-speech-recognition",
-    model=MODEL_NAME,
-    chunk_length_s=30,
-    device=device,
-)
-
-@spaces.GPU
-def goai_stt2(inputs, task):
-    print(inputs)
-    if inputs is None:
-        raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
-
-    text = pipe(inputs[1], batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
-    return text
 
whisper_notebook.ipynb DELETED
@@ -1,192 +0,0 @@
-{
-  "nbformat": 4,
-  "nbformat_minor": 0,
-  "metadata": {
-    "colab": {
-      "provenance": [],
-      "gpuType": "T4"
-    },
-    "kernelspec": {
-      "name": "python3",
-      "display_name": "Python 3"
-    },
-    "language_info": {
-      "name": "python"
-    },
-    "accelerator": "GPU"
-  },
-  "cells": [
-    {
-      "cell_type": "markdown",
-      "source": [
-        "# Whisper v3 is here!\n",
-        "\n",
-        "Whisper v3 is a new model open sourced by OpenAI. The model can do multilingual transcriptions and is quite impressive. For example, you can change from English to Spanish or Chinese in the middle of a sentence and it will work well!\n",
-        "\n",
-        "The model can be run in a free Google Colab instance and is integrated into `transformers` already, so switching can be a very smooth process if you already use the previous versions."
-      ],
-      "metadata": {
-        "id": "OXaUqiE-eyXM"
-      }
-    },
-    {
-      "cell_type": "code",
-      "execution_count": null,
-      "metadata": {
-        "id": "WFQeUT9EcIcK"
-      },
-      "outputs": [],
-      "source": [
-        "%%capture\n",
-        "!pip install git+https://github.com/huggingface/transformers gradio"
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "source": [
-        "Let's use the high level `pipeline` from the `transformers` library to load the model."
-      ],
-      "metadata": {
-        "id": "sZONes21fHTA"
-      }
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "import torch\n",
-        "from transformers import pipeline\n",
-        "\n",
-        "pipe = pipeline(\"automatic-speech-recognition\",\n",
-        "                \"openai/whisper-large-v3\",\n",
-        "                torch_dtype=torch.float16,\n",
-        "                device=\"cuda:0\")"
-      ],
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "DvBdwMdPcr-Y",
-        "outputId": "47f32218-fd85-49ea-d880-d31577bcf9b8"
-      },
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stderr",
-          "text": [
-            "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
-            "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
-          ]
-        }
-      ]
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "pipe(\"https://cdn-media.huggingface.co/speech_samples/sample1.flac\")"
-      ],
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/"
-        },
-        "id": "GZFkIyhjc0Nc",
-        "outputId": "f1463431-3e08-4438-815f-b71e5e7a1503"
-      },
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "execute_result",
-          "data": {
-            "text/plain": [
-              "{'text': \" going along slushy country roads and speaking to damp audiences in draughty schoolrooms day after day for a fortnight he'll have to put in an appearance at some place of worship on sunday morning and he can come to us immediately afterwards\"}"
-            ]
-          },
-          "metadata": {},
-          "execution_count": 2
-        }
-      ]
-    },
-    {
-      "cell_type": "markdown",
-      "source": [
-        "Let's now build a quick Gradio demo where we can play with the model directly using our microphone! You can run this code in a Google Colab instance (or locally!) or just head to the <a href=\"https://huggingface.co/spaces/hf-audio/whisper-large-v3\" target=\"_blank\">Space</a> to play directly with it online."
-      ],
-      "metadata": {
-        "id": "pt3YtM_PfTQY"
-      }
-    },
-    {
-      "cell_type": "code",
-      "source": [
-        "import gradio as gr\n",
-        "\n",
-        "def transcribe(inputs):\n",
-        "    if inputs is None:\n",
-        "        raise gr.Error(\"No audio file submitted! Please record an audio before submitting your request.\")\n",
-        "\n",
-        "    text = pipe(inputs, generate_kwargs={\"task\": \"transcribe\"}, return_timestamps=True)[\"text\"]\n",
-        "    return text\n",
-        "\n",
-        "demo = gr.Interface(\n",
-        "    fn=transcribe,\n",
-        "    inputs=[\n",
-        "        gr.Audio(sources=[\"microphone\", \"upload\"], type=\"filepath\"),\n",
-        "    ],\n",
-        "    outputs=\"text\",\n",
-        "    title=\"Whisper Large V3: Transcribe Audio\",\n",
-        "    description=(\n",
-        "        \"Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the\"\n",
-        "        \" checkpoint [openai/whisper-large-v3](https://huggingface.co/openai/whisper-large-v3) and 🤗 Transformers to transcribe audio files\"\n",
-        "        \" of arbitrary length.\"\n",
-        "    ),\n",
-        "    allow_flagging=\"never\",\n",
-        ")\n",
-        "\n",
-        "demo.launch()\n"
-      ],
-      "metadata": {
-        "colab": {
-          "base_uri": "https://localhost:8080/",
-          "height": 648
-        },
-        "id": "K0b2UZLVdIze",
-        "outputId": "bcff00e0-4fc8-4883-9ba4-480f5a6665f0"
-      },
-      "execution_count": null,
-      "outputs": [
-        {
-          "output_type": "stream",
-          "name": "stdout",
-          "text": [
-            "Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).\n",
-            "\n",
-            "Colab notebook detected. To show errors in colab notebook, set debug=True in launch()\n",
-            "Running on public URL: https://037dbdb04542aa1a29.gradio.live\n",
-            "\n",
-            "This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)\n"
-          ]
-        },
-        {
-          "output_type": "display_data",
-          "data": {
-            "text/plain": [
-              "<IPython.core.display.HTML object>"
-            ],
-            "text/html": [
-              "<div><iframe src=\"https://037dbdb04542aa1a29.gradio.live\" width=\"100%\" height=\"500\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
-            ]
-          },
-          "metadata": {}
-        },
-        {
-          "output_type": "execute_result",
-          "data": {
-            "text/plain": []
-          },
-          "metadata": {},
-          "execution_count": 4
-        }
-      ]
-    }
-  ]
-}