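"""Gradio demo for Moroccan Darija speech-to-text.

Records microphone audio, transcribes it with one of the hosted
Morocco-Darija-STT models, and optionally appends the result to a
Hugging Face evaluation dataset.
"""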
import base64
import os
import gradio as gr
from transformers import pipeline
import numpy as np
import librosa
from datetime import datetime
from datasets import (
    load_dataset,
    concatenate_datasets,
    Dataset,
    Features,
    Value,
    Audio,
)
# Hugging Face evaluation dataset
HF_DATASET_NAME = "BounharAbdelaziz/Moroccan-STT-Eval-Dataset"

# Model repositories on the Hugging Face Hub
MODEL_PATHS = {
    "NANO": "BounharAbdelaziz/Morocco-Darija-STT-tiny",
    "SMALL": "BounharAbdelaziz/Morocco-Darija-STT-small",
    "LARGE": "BounharAbdelaziz/Morocco-Darija-STT-large-v1.2",
}

# Access token for the models
STT_MODEL_TOKEN = os.environ.get("STT_MODEL_TOKEN")

# Access token for the evaluation dataset
STT_EVAL_DATASET_TOKEN = os.environ.get("STT_EVAL_DATASET_TOKEN")
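# NOTE: both tokens are read from the environment (e.g. Space secrets); they
# are None when unset, in which case gated model downloads and dataset pushes
# will fail.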
# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #
def encode_image_to_base64(image_path):
    """Read an image file and return its contents as a base64-encoded string."""
    with open(image_path, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read()).decode()
    return encoded_string
# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #
def create_html_image(image_path):
    """Wrap a local image in centered HTML so Gradio can render it inline."""
    img_base64 = encode_image_to_base64(image_path)
    html_string = f"""
    <div style="display: flex; justify-content: center; align-items: center; width: 100%; text-align: center;">
        <div style="max-width: 800px; margin: auto;">
            <img src="data:image/jpeg;base64,{img_base64}"
                 style="max-width: 75%; height: auto; display: block; margin: 0 auto; margin-top: 50px;"
                 alt="Displayed Image">
        </div>
    </div>
    """
    return html_string
# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #
def save_to_hf_dataset(audio_signal, model_choice, transcription):
    """Append one (audio, transcription) pair to the evaluation dataset on the Hub."""
    print("[INFO] Loading dataset...")
    dataset = load_dataset(HF_DATASET_NAME, token=STT_EVAL_DATASET_TOKEN)
    print("[INFO] Dataset loaded successfully.")

    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    # Single-row batch: each column maps to a one-element list.
    new_entry = {
        "audio": [{"array": audio_signal, "sampling_rate": 16000}],
        "transcription": [transcription],
        "model_used": [model_choice],
        "timestamp": [timestamp],
    }
    new_dataset = Dataset.from_dict(
        new_entry,
        features=Features({
            "audio": Audio(sampling_rate=16000),
            "transcription": Value("string"),
            "model_used": Value("string"),
            "timestamp": Value("string"),
        })
    )

    print("[INFO] Adding the new entry to the dataset...")
    train_dataset = dataset["train"]
    updated_train_dataset = concatenate_datasets([train_dataset, new_dataset])
    dataset["train"] = updated_train_dataset

    print("[INFO] Pushing the updated dataset...")
    dataset.push_to_hub(HF_DATASET_NAME, token=STT_EVAL_DATASET_TOKEN)
    print("[INFO] Dataset updated and pushed successfully.")
# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #
def load_model(model_name):
    """Build an ASR pipeline for the selected model (reloaded on every call)."""
    model_id = MODEL_PATHS[model_name.upper()]
    return pipeline("automatic-speech-recognition", model=model_id, token=STT_MODEL_TOKEN)
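# Optional (a sketch, not wired into process_audio below): cache the loaded
# pipelines so repeated transcriptions with the same model don't re-download
# and re-instantiate the weights on every click.
from functools import lru_cache

@lru_cache(maxsize=None)
def load_model_cached(model_name):
    """Like load_model, but keeps one pipeline per model name in memory."""
    return load_model(model_name)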
# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #
def process_audio(audio, model_choice, save_data):
    """Transcribe a recorded clip and optionally save it to the eval dataset."""
    # Guard against the Transcribe button being clicked before recording stops.
    if audio is None:
        return "No audio recorded. Please record a clip and click Stop first."

    pipe = load_model(model_choice)

    # gr.Audio(type="numpy") yields a (sample_rate, samples) tuple.
    sample_rate, audio_signal = audio
    audio_signal = audio_signal.astype(np.float32)

    # Downmix stereo recordings to mono.
    if audio_signal.ndim > 1:
        audio_signal = audio_signal.mean(axis=1)

    # int16 PCM arrives in [-32768, 32767]; rescale to [-1.0, 1.0].
    if np.abs(audio_signal).max() > 1.0:
        audio_signal = audio_signal / 32768.0

    # Resample to the 16 kHz rate the models expect.
    if sample_rate != 16000:
        print(f"[INFO] Resampling audio from {sample_rate}Hz to 16000Hz")
        audio_signal = librosa.resample(
            y=audio_signal,
            orig_sr=sample_rate,
            target_sr=16000
        )

    result = pipe(audio_signal)
    transcription = result["text"]

    if save_data:
        print("[INFO] Saving data to eval dataset...")
        save_to_hf_dataset(audio_signal, model_choice, transcription)

    return transcription
# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #
def create_interface():
    """Assemble the Gradio Blocks UI."""
    with gr.Blocks(css="footer{display:none !important}") as app:
        base_path = os.path.dirname(__file__)
        local_image_path = os.path.join(base_path, 'logo_image.png')
        gr.HTML(create_html_image(local_image_path))

        gr.Markdown("# 🇲🇦 🚀 Moroccan Fast Speech-to-Text Transcription 🐘")
        gr.Markdown("⚠️ **Nota bene**: Make sure to click on **Stop** before hitting the **Transcribe** button")
        gr.Markdown("📌 The **Large** model should be available soon. Stay tuned!")
        with gr.Row():
            model_choice = gr.Dropdown(
                choices=["Nano", "Small", "Large"],
                value="Small",
                label="Select one of the models"
            )

        with gr.Row():
            audio_input = gr.Audio(
                sources=["microphone"],
                type="numpy",
                label="Record Audio",
            )

        with gr.Row():
            save_data = gr.Checkbox(
                label="Contribute to the evaluation benchmark",
                value=True
            )

        submit_btn = gr.Button("Transcribe 🔥")
        output_text = gr.Textbox(label="Transcription")

        gr.Markdown("""
        ### 📄📌 Notice to our dearest users 🤗
        - By transcribing your audio, you're actively contributing to the development of a benchmark evaluation dataset for Moroccan speech-to-text models.
        - Your transcriptions will be logged into a dedicated Hugging Face dataset, playing a crucial role in advancing research and innovation in speech recognition for Moroccan dialects and languages.
        - Together, we're building tools that better understand and serve the unique linguistic landscape of Morocco.
        - We count on your **thoughtfulness and responsibility** when using the app. Thank you for your contribution! 🌟
        """)

        submit_btn.click(
            fn=process_audio,
            inputs=[audio_input, model_choice, save_data],
            outputs=output_text
        )

        gr.Markdown("<br/>")

    return app
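# ---------------------------------------------------------------------------- #
# ---------------------------------------------------------------------------- #
# Entry point: a minimal launch block, assuming this file runs directly as the
# Space's main script; launch options are left at Gradio defaults.
if __name__ == "__main__":
    app = create_interface()
    app.launch()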