metadata
license: apache-2.0
language:
- bn
base_model:
- openai/whisper-small
pipeline_tag: automatic-speech-recognition
BengaliRegionalASR is trained on a Bengali regional dialect dataset: sha1779/Bengali_Regional_dataset
This model is trained on the Barishal regional data only. The dataset is taken from the ভাষা-বিচিত্রা: ASR for Regional Dialects competition.
Try the model
!pip install librosa torch torchaudio transformers
import os
import requests
import librosa
import torch
import numpy as np
from transformers import WhisperTokenizer, WhisperProcessor, WhisperFeatureExtractor, WhisperForConditionalGeneration
# Example: transcribe a single short clip with the fine-tuned Whisper
# checkpoint. The clip is downloaded to a temporary file, transcribed, and the
# temporary file is always removed afterwards.

# Define model and device
model_path_ = "sha1779/BengaliRegionalASR"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the preprocessing components and the model weights from the same repo.
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_path_)
tokenizer = WhisperTokenizer.from_pretrained(model_path_)  # not used below
processor = WhisperProcessor.from_pretrained(model_path_)
model = WhisperForConditionalGeneration.from_pretrained(model_path_).to(device)
# Pin the decoder prompt to Bengali transcription.
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="bengali", task="transcribe")

# Sample audio URL (the variable is named "mp3", but the file is a WAV)
mp3_url = "https://huggingface.co/sha1779/BengaliRegionalASR/resolve/main/Mp3/valid_barishal%20(1).wav"
local_audio_path = "temp_audio.wav"

try:
    # Download the audio file. The timeout keeps a stalled connection from
    # hanging the script forever.
    print("Downloading audio file...")
    response = requests.get(mp3_url, timeout=60)
    if response.status_code != 200:
        raise Exception(f"Failed to download file. HTTP status code: {response.status_code}")
    with open(local_audio_path, 'wb') as f:
        f.write(response.content)
    print("Download complete.")

    # Load and preprocess the audio. librosa.load already resamples to the
    # requested 16 kHz, so no separate resampling step is needed.
    print("Processing audio file...")
    speech_array, sampling_rate = librosa.load(local_audio_path, sr=16000)
    input_features = feature_extractor(
        speech_array, sampling_rate=sampling_rate, return_tensors="pt"
    ).input_features

    # Generate transcription
    print("Generating transcription...")
    predicted_ids = model.generate(inputs=input_features.to(device))[0]
    transcription = processor.decode(predicted_ids, skip_special_tokens=True)

    # Print the transcription
    print("Transcription:", transcription)
finally:
    # Clean up: delete the temporary audio file, including a partially
    # written one left behind by a failed download.
    if os.path.exists(local_audio_path):
        os.remove(local_audio_path)
        print("Temporary audio file deleted.")
For longer audio (more than 30 s)
import os
import requests
import librosa
import torch
import numpy as np
from transformers import WhisperTokenizer, WhisperProcessor, WhisperFeatureExtractor, WhisperForConditionalGeneration
# Example: transcribe a clip longer than 30 seconds by splitting it into
# overlapping 30-second chunks, transcribing each chunk, and joining the
# results. The downloaded temporary file is always removed afterwards.

# Define model and device
model_path_ = "sha1779/BengaliRegionalASR"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the preprocessing components and the model weights from the same repo.
feature_extractor = WhisperFeatureExtractor.from_pretrained(model_path_)
tokenizer = WhisperTokenizer.from_pretrained(model_path_)  # not used below
processor = WhisperProcessor.from_pretrained(model_path_)
model = WhisperForConditionalGeneration.from_pretrained(model_path_).to(device)
# Pin the decoder prompt to Bengali transcription.
model.config.forced_decoder_ids = processor.get_decoder_prompt_ids(language="bengali", task="transcribe")

# Remote audio file URL (the variable is named "mp3", but the file is a WAV)
mp3_url = "https://huggingface.co/sha1779/BengaliRegionalASR/resolve/main/Mp3/valid_barishal%20(1).wav"
local_audio_path = "temp_audio.wav"

try:
    # Download the audio file. The timeout keeps a stalled connection from
    # hanging the script forever.
    response = requests.get(mp3_url, timeout=60)
    if response.status_code != 200:
        raise Exception(f"Failed to download file. HTTP status code: {response.status_code}")
    with open(local_audio_path, 'wb') as f:
        f.write(response.content)

    # Load audio. librosa.load already resamples to the requested 16 kHz, so
    # the chunks need no further resampling.
    speech_array, sampling_rate = librosa.load(local_audio_path, sr=16000)

    # Define chunk parameters
    chunk_duration = 30  # seconds
    overlap = 5  # seconds
    chunk_size = int(chunk_duration * sampling_rate)
    overlap_size = int(overlap * sampling_rate)
    step_size = chunk_size - overlap_size

    # Split audio into chunks that overlap by `overlap` seconds.
    chunks = []
    for start in range(0, len(speech_array), step_size):
        chunks.append(speech_array[start : start + chunk_size])
        if start + chunk_size >= len(speech_array):
            # This chunk already reaches the end of the audio. A further chunk
            # would contain only samples that are inside this one, so its
            # text would simply be repeated.
            break

    # Process and transcribe each chunk
    transcriptions = []
    for chunk in chunks:
        input_features = feature_extractor(
            chunk, sampling_rate=sampling_rate, return_tensors="pt"
        ).input_features
        # Generate transcription
        predicted_ids = model.generate(inputs=input_features.to(device))[0]
        transcriptions.append(processor.decode(predicted_ids, skip_special_tokens=True))

    # Combine and print the transcriptions.
    # NOTE: consecutive chunks share `overlap` seconds of audio, so words
    # spoken inside an overlap may appear twice in the joined text.
    print(" ".join(transcriptions))
finally:
    # Clean up temporary file even if download, loading, or decoding failed.
    if os.path.exists(local_audio_path):
        os.remove(local_audio_path)
Evaluation
Word Error Rate 0.65 %