sedemkofi commited on
Commit
2df44cf
·
verified ·
1 Parent(s): 135910c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -104
app.py CHANGED
@@ -1,4 +1,4 @@
1
- import os
2
  import numpy as np
3
  import librosa
4
  import pickle
@@ -6,8 +6,6 @@ from jiwer import wer, cer
6
  import tensorflow as tf
7
  from io import BytesIO
8
  import soundfile as sf
9
- from flask import Flask, request, jsonify
10
- from werkzeug.utils import secure_filename
11
 
12
  class TwiTranscriptionModel:
13
  def __init__(self, encoder_model, decoder_model, char_tokenizer, max_length=50):
@@ -52,8 +50,10 @@ class TwiTranscriptionModel:
52
 
53
  return transcriptions
54
 
 
55
  def load_model():
56
  try:
 
57
  with open('twi_transcription_model.pkl', 'rb') as f:
58
  model_data = pickle.load(f)
59
  return TwiTranscriptionModel(
@@ -63,7 +63,7 @@ def load_model():
63
  model_data['max_length']
64
  )
65
  except Exception as e:
66
- print(f"Error loading model: {str(e)}")
67
  return None
68
 
69
  def extract_mfcc(audio_data, sr=16000, n_mfcc=13):
@@ -88,103 +88,84 @@ def calculate_error_rates(reference, hypothesis):
88
  except Exception as e:
89
  return None, None
90
 
91
- # Flask Application
92
- app = Flask(__name__)
93
-
94
- # Configure upload folder
95
- UPLOAD_FOLDER = 'uploads'
96
- ALLOWED_EXTENSIONS = {'wav', 'mp3', 'ogg'}
97
-
98
- app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
99
- os.makedirs(UPLOAD_FOLDER, exist_ok=True)
100
-
101
- # Load the model
102
- MODEL = load_model()
103
-
104
- def allowed_file(filename):
105
- return '.' in filename and \
106
- filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
107
-
108
- @app.route('/transcribe', methods=['POST'])
109
- def transcribe_audio():
110
- # Check if the model is loaded
111
- if MODEL is None:
112
- return jsonify({
113
- 'status': 'error',
114
- 'message': 'Speech recognition model not loaded'
115
- }), 500
116
-
117
- # Check if file is present
118
- if 'audio' not in request.files:
119
- return jsonify({
120
- 'status': 'error',
121
- 'message': 'No audio file uploaded'
122
- }), 400
123
-
124
- audio_file = request.files['audio']
125
-
126
- # Check if filename is empty
127
- if audio_file.filename == '':
128
- return jsonify({
129
- 'status': 'error',
130
- 'message': 'No selected file'
131
- }), 400
132
-
133
- # Check if file is allowed
134
- if not allowed_file(audio_file.filename):
135
- return jsonify({
136
- 'status': 'error',
137
- 'message': 'Invalid file type. Allowed types: wav, mp3, ogg'
138
- }), 400
139
-
140
- try:
141
- # Save file temporarily
142
- filename = secure_filename(audio_file.filename)
143
- filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
144
- audio_file.save(filepath)
145
-
146
- # Load audio file
147
- audio_data, sr = librosa.load(filepath, sr=None)
148
- if len(audio_data.shape) > 1:
149
- audio_data = np.mean(audio_data, axis=1)
150
-
151
- # Remove temporary file
152
- os.remove(filepath)
153
-
154
- # Extract features
155
- mfcc_features = extract_mfcc(audio_data, sr)
156
- mfcc_features = np.expand_dims(mfcc_features, axis=0)
157
-
158
- # Get transcription
159
- transcription = MODEL.predict(mfcc_features)[0]
160
-
161
- # Optional reference text for error calculation
162
- reference_text = request.form.get('reference_text', None)
163
- response_data = {
164
- 'status': 'success',
165
- 'transcription': transcription,
166
- 'audio_details': {
167
- 'sample_rate': int(sr),
168
- 'duration': float(len(audio_data) / sr)
169
- }
170
- }
171
-
172
- # Calculate error metrics if reference text is provided
173
- if reference_text:
174
- error_wer, error_cer = calculate_error_rates(reference_text, transcription)
175
- if error_wer is not None and error_cer is not None:
176
- response_data['error_metrics'] = {
177
- 'word_error_rate': round(float(error_wer), 4),
178
- 'character_error_rate': round(float(error_cer), 4)
179
- }
180
-
181
- return jsonify(response_data)
182
-
183
- except Exception as e:
184
- return jsonify({
185
- 'status': 'error',
186
- 'message': str(e)
187
- }), 500
188
-
189
- if __name__ == '__main__':
190
- app.run(debug=True)
 
1
+ import streamlit as st
2
  import numpy as np
3
  import librosa
4
  import pickle
 
6
  import tensorflow as tf
7
  from io import BytesIO
8
  import soundfile as sf
 
 
9
 
10
  class TwiTranscriptionModel:
11
  def __init__(self, encoder_model, decoder_model, char_tokenizer, max_length=50):
 
50
 
51
  return transcriptions
52
 
53
+ @st.cache_resource
54
  def load_model():
55
  try:
56
+ # Modify this path if your model is stored differently in Hugging Face
57
  with open('twi_transcription_model.pkl', 'rb') as f:
58
  model_data = pickle.load(f)
59
  return TwiTranscriptionModel(
 
63
  model_data['max_length']
64
  )
65
  except Exception as e:
66
+ st.error(f"Error loading model: {str(e)}")
67
  return None
68
 
69
  def extract_mfcc(audio_data, sr=16000, n_mfcc=13):
 
88
  except Exception as e:
89
  return None, None
90
 
91
def main():
    """Streamlit entry point: upload a Twi audio clip, transcribe it with the
    cached encoder/decoder model, and optionally score the transcription
    against a user-supplied reference text (WER/CER)."""
    st.set_page_config(
        page_title="Twi Speech Recognition",
        page_icon="🎤",
        layout="wide"
    )

    # Load (and cache, via @st.cache_resource on load_model) the trained model.
    model = load_model()
    if model is None:
        st.error("Failed to load model. Please check model file.")
        return

    st.title("Twi Speech Transcription")
    st.write("Upload an audio file to transcribe Twi speech")

    # File uploader — restricted to the formats librosa can decode here.
    audio_file = st.file_uploader("Choose an audio file", type=['wav', 'mp3', 'ogg'])

    # Optional reference transcription used to compute error metrics below.
    reference_text = st.text_area("Reference text (optional)",
                                  help="Enter the correct transcription to calculate error rates")

    if audio_file is not None:
        if st.button("Transcribe"):
            with st.spinner("Processing audio... This may take a moment."):
                try:
                    # Decode the uploaded stream at its native sample rate.
                    audio_data, sr = librosa.load(audio_file, sr=None)
                    if len(audio_data.shape) > 1:
                        # Downmix multi-channel audio to mono.
                        audio_data = np.mean(audio_data, axis=1)

                    # Extract MFCC features and add a batch dimension.
                    mfcc_features = extract_mfcc(audio_data, sr)
                    mfcc_features = np.expand_dims(mfcc_features, axis=0)

                    # Run the seq2seq model; predict returns one string per batch item.
                    transcription = model.predict(mfcc_features)[0]

                    st.success("Transcription completed!")

                    # BUG FIX: librosa.load consumed the UploadedFile stream,
                    # leaving its pointer at EOF — without rewinding, st.audio
                    # would read an empty buffer and play nothing.
                    audio_file.seek(0)
                    # NOTE(review): format is hard-coded to wav even for
                    # mp3/ogg uploads — consider deriving it from audio_file.type.
                    st.audio(audio_file, format='audio/wav')

                    st.write("### Transcription:")
                    st.write(transcription)

                    st.write("### Audio Details:")
                    st.json({
                        'sample_rate': int(sr),
                        'duration': float(len(audio_data) / sr)
                    })

                    # Error metrics — only when a reference was provided and
                    # calculate_error_rates succeeded (it returns (None, None) on failure).
                    if reference_text:
                        error_wer, error_cer = calculate_error_rates(reference_text, transcription)
                        if error_wer is not None and error_cer is not None:
                            st.write("### Error Metrics:")
                            st.json({
                                'word_error_rate': round(float(error_wer), 4),
                                'character_error_rate': round(float(error_cer), 4)
                            })

                except Exception as e:
                    # UI boundary: surface any processing failure to the user
                    # instead of crashing the Streamlit session.
                    st.error(f"Error processing audio: {str(e)}")

if __name__ == "__main__":
    main()
162
+
163
+ # Requirements for Hugging Face (create a requirements.txt)
164
+ """
165
+ streamlit
166
+ numpy
167
+ librosa
168
+ tensorflow
169
+ jiwer
170
+ soundfile
171
+ """