import base64
import os
import gradio as gr
from transformers import pipeline
import numpy as np
import librosa
from datetime import datetime
from datasets import (
    load_dataset,
    concatenate_datasets,
    Dataset,
    DatasetDict,
    Features,
    Value,
    Audio,
)
# Hugging Face evaluation dataset
HF_DATASET_NAME = "BounharAbdelaziz/Moroccan-STT-Eval-Dataset"
# Model paths
MODEL_PATHS = {
    "NANO": "BounharAbdelaziz/Morocco-Darija-STT-tiny",
    "SMALL": "BounharAbdelaziz/Morocco-Darija-STT-small",
    "LARGE": "BounharAbdelaziz/Morocco-Darija-STT-large-v1.2",
}
# Access token for the models
STT_MODEL_TOKEN = os.environ.get("STT_MODEL_TOKEN")
# Access token for the evaluation dataset
STT_EVAL_DATASET_TOKEN = os.environ.get("STT_EVAL_DATASET_TOKEN")
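# Both tokens must be present in the environment before the app starts, e.g. as
# Hugging Face Space secrets. A hypothetical local setup (illustrative values only,
# not taken from this repo):
#   export STT_MODEL_TOKEN="hf_xxx"
#   export STT_EVAL_DATASET_TOKEN="hf_xxx"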
# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #
def encode_image_to_base64(image_path):
    with open(image_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode()
    return encoded_string
# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #
def create_html_image(image_path):
    img_base64 = encode_image_to_base64(image_path)
    html_string = f"""
    <div style="display: flex; justify-content: center; align-items: center; width: 100%; text-align: center;">
        <div style="max-width: 800px; margin: auto;">
            <img src="data:image/jpeg;base64,{img_base64}"
                 style="max-width: 75%; height: auto; display: block; margin: 0 auto; margin-top: 50px;"
                 alt="Displayed Image">
        </div>
    </div>
    """
    return html_string
# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #
def save_to_hf_dataset(audio_signal, model_choice, transcription):
    print("[INFO] Loading dataset...")
    dataset = load_dataset(HF_DATASET_NAME, token=STT_EVAL_DATASET_TOKEN)
    print("[INFO] Dataset loaded successfully.")
    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    new_entry = {
        "audio": [{"array": audio_signal, "sampling_rate": 16000}],
        "transcription": [transcription],
        "model_used": [model_choice],
        "timestamp": [timestamp],
    }
    new_dataset = Dataset.from_dict(
        new_entry,
        features=Features({
            "audio": Audio(sampling_rate=16000),
            "transcription": Value("string"),
            "model_used": Value("string"),
            "timestamp": Value("string"),
        })
    )
    print("[INFO] Adding the new entry to the dataset...")
    train_dataset = dataset["train"]
    updated_train_dataset = concatenate_datasets([train_dataset, new_dataset])
    dataset["train"] = updated_train_dataset
    print("[INFO] Pushing the updated dataset...")
    dataset.push_to_hub(HF_DATASET_NAME, token=STT_EVAL_DATASET_TOKEN)
    print("[INFO] Dataset updated and pushed successfully.")
# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #
def load_model(model_name):
    model_id = MODEL_PATHS[model_name.upper()]
    return pipeline("automatic-speech-recognition", model=model_id, token=STT_MODEL_TOKEN)
# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #
def process_audio(audio, model_choice, save_data):
    pipe = load_model(model_choice)
    # gr.Audio(type="numpy") provides a (sample_rate, samples) tuple
    audio_signal = audio[1]
    sample_rate = audio[0]
    audio_signal = audio_signal.astype(np.float32)
    # Normalize 16-bit PCM values to the [-1.0, 1.0] range expected by the pipeline
    if np.abs(audio_signal).max() > 1.0:
        audio_signal = audio_signal / 32768.0
    if sample_rate != 16000:
        print(f"[INFO] Resampling audio from {sample_rate}Hz to 16000Hz")
        audio_signal = librosa.resample(
            y=audio_signal,
            orig_sr=sample_rate,
            target_sr=16000
        )
    result = pipe(audio_signal)
    transcription = result["text"]
    if save_data:
        print("[INFO] Saving data to eval dataset...")
        save_to_hf_dataset(audio_signal, model_choice, transcription)
    return transcription
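# Usage sketch for process_audio (illustrative only): Gradio hands the callback a
# (sample_rate, samples) tuple when gr.Audio is configured with type="numpy", so a
# direct call could look like:
#
#   sr = 48000
#   samples = np.zeros(sr, dtype=np.int16)   # 1 second of silence
#   text = process_audio((sr, samples), "Small", save_data=False)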
# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #
def create_interface():
    with gr.Blocks(css="footer{display:none !important}") as app:
        base_path = os.path.dirname(__file__)
        local_image_path = os.path.join(base_path, 'logo_image.png')
        gr.HTML(create_html_image(local_image_path))
        gr.Markdown("# 🇲🇦 Moroccan Fast Speech-to-Text Transcription")
        gr.Markdown("⚠️ **Nota bene**: Make sure to click on **Stop** before hitting the **Transcribe** button")
        gr.Markdown("The **Large** model should be available soon. Stay tuned!")
        with gr.Row():
            model_choice = gr.Dropdown(
                choices=["Nano", "Small", "Large"],
                value="Small",
                label="Select one of the models"
            )
        with gr.Row():
            audio_input = gr.Audio(
                sources=["microphone"],
                type="numpy",
                label="Record Audio",
            )
        with gr.Row():
            save_data = gr.Checkbox(
                label="Contribute to the evaluation benchmark",
                value=True
            )
        submit_btn = gr.Button("Transcribe")
        output_text = gr.Textbox(label="Transcription")
        gr.Markdown("""
        ### Notice to our dearest users
        - By transcribing your audio, you're actively contributing to the development of a benchmark evaluation dataset for Moroccan speech-to-text models.
        - Your transcriptions will be logged into a dedicated Hugging Face dataset, playing a crucial role in advancing research and innovation in speech recognition for Moroccan dialects and languages.
        - Together, we're building tools that better understand and serve the unique linguistic landscape of Morocco.
        - We count on your **thoughtfulness and responsibility** when using the app. Thank you for your contribution!
        """)
        submit_btn.click(
            fn=process_audio,
            inputs=[audio_input, model_choice, save_data],
            outputs=output_text
        )
        gr.Markdown("<br/>")
    return app
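# The original file continues past this excerpt; a typical Gradio Space entry point
# would be a sketch along these lines (an assumption, not necessarily the actual code):
#
#   if __name__ == "__main__":
#       app = create_interface()
#       app.launch()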