sedemkofi commited on
Commit
2df44cf
·
verified ·
1 Parent(s): 135910c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -104
app.py CHANGED
@@ -1,4 +1,4 @@
1
- import os
2
  import numpy as np
3
  import librosa
4
  import pickle
@@ -6,8 +6,6 @@ from jiwer import wer, cer
6
  import tensorflow as tf
7
  from io import BytesIO
8
  import soundfile as sf
9
- from flask import Flask, request, jsonify
10
- from werkzeug.utils import secure_filename
11
 
12
  class TwiTranscriptionModel:
13
  def __init__(self, encoder_model, decoder_model, char_tokenizer, max_length=50):
@@ -52,8 +50,10 @@ class TwiTranscriptionModel:
52
 
53
  return transcriptions
54
 
 
55
  def load_model():
56
  try:
 
57
  with open('twi_transcription_model.pkl', 'rb') as f:
58
  model_data = pickle.load(f)
59
  return TwiTranscriptionModel(
@@ -63,7 +63,7 @@ def load_model():
63
  model_data['max_length']
64
  )
65
  except Exception as e:
66
- print(f"Error loading model: {str(e)}")
67
  return None
68
 
69
  def extract_mfcc(audio_data, sr=16000, n_mfcc=13):
@@ -88,103 +88,84 @@ def calculate_error_rates(reference, hypothesis):
88
  except Exception as e:
89
  return None, None
90
 
91
- # Flask Application
92
- app = Flask(__name__)
93
-
94
- # Configure upload folder
95
- UPLOAD_FOLDER = 'uploads'
96
- ALLOWED_EXTENSIONS = {'wav', 'mp3', 'ogg'}
97
-
98
- app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
99
- os.makedirs(UPLOAD_FOLDER, exist_ok=True)
100
-
101
- # Load the model
102
- MODEL = load_model()
103
-
104
- def allowed_file(filename):
105
- return '.' in filename and \
106
- filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
107
-
108
- @app.route('/transcribe', methods=['POST'])
109
- def transcribe_audio():
110
- # Check if the model is loaded
111
- if MODEL is None:
112
- return jsonify({
113
- 'status': 'error',
114
- 'message': 'Speech recognition model not loaded'
115
- }), 500
116
-
117
- # Check if file is present
118
- if 'audio' not in request.files:
119
- return jsonify({
120
- 'status': 'error',
121
- 'message': 'No audio file uploaded'
122
- }), 400
123
-
124
- audio_file = request.files['audio']
125
-
126
- # Check if filename is empty
127
- if audio_file.filename == '':
128
- return jsonify({
129
- 'status': 'error',
130
- 'message': 'No selected file'
131
- }), 400
132
-
133
- # Check if file is allowed
134
- if not allowed_file(audio_file.filename):
135
- return jsonify({
136
- 'status': 'error',
137
- 'message': 'Invalid file type. Allowed types: wav, mp3, ogg'
138
- }), 400
139
-
140
- try:
141
- # Save file temporarily
142
- filename = secure_filename(audio_file.filename)
143
- filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
144
- audio_file.save(filepath)
145
-
146
- # Load audio file
147
- audio_data, sr = librosa.load(filepath, sr=None)
148
- if len(audio_data.shape) > 1:
149
- audio_data = np.mean(audio_data, axis=1)
150
-
151
- # Remove temporary file
152
- os.remove(filepath)
153
-
154
- # Extract features
155
- mfcc_features = extract_mfcc(audio_data, sr)
156
- mfcc_features = np.expand_dims(mfcc_features, axis=0)
157
-
158
- # Get transcription
159
- transcription = MODEL.predict(mfcc_features)[0]
160
-
161
- # Optional reference text for error calculation
162
- reference_text = request.form.get('reference_text', None)
163
- response_data = {
164
- 'status': 'success',
165
- 'transcription': transcription,
166
- 'audio_details': {
167
- 'sample_rate': int(sr),
168
- 'duration': float(len(audio_data) / sr)
169
- }
170
- }
171
-
172
- # Calculate error metrics if reference text is provided
173
- if reference_text:
174
- error_wer, error_cer = calculate_error_rates(reference_text, transcription)
175
- if error_wer is not None and error_cer is not None:
176
- response_data['error_metrics'] = {
177
- 'word_error_rate': round(float(error_wer), 4),
178
- 'character_error_rate': round(float(error_cer), 4)
179
- }
180
-
181
- return jsonify(response_data)
182
-
183
- except Exception as e:
184
- return jsonify({
185
- 'status': 'error',
186
- 'message': str(e)
187
- }), 500
188
-
189
- if __name__ == '__main__':
190
- app.run(debug=True)
 
1
+ import streamlit as st
2
  import numpy as np
3
  import librosa
4
  import pickle
 
6
  import tensorflow as tf
7
  from io import BytesIO
8
  import soundfile as sf
 
 
9
 
10
  class TwiTranscriptionModel:
11
  def __init__(self, encoder_model, decoder_model, char_tokenizer, max_length=50):
 
50
 
51
  return transcriptions
52
 
53
+ @st.cache_resource
54
  def load_model():
55
  try:
56
+ # Modify this path if your model is stored differently in Hugging Face
57
  with open('twi_transcription_model.pkl', 'rb') as f:
58
  model_data = pickle.load(f)
59
  return TwiTranscriptionModel(
 
63
  model_data['max_length']
64
  )
65
  except Exception as e:
66
+ st.error(f"Error loading model: {str(e)}")
67
  return None
68
 
69
  def extract_mfcc(audio_data, sr=16000, n_mfcc=13):
 
88
  except Exception as e:
89
  return None, None
90
 
91
def main():
    """Streamlit entry point: upload a Twi audio clip, transcribe it with the
    cached encoder/decoder model, and optionally score the transcription
    against a user-supplied reference text (WER/CER)."""
    st.set_page_config(
        page_title="Twi Speech Recognition",
        page_icon="🎤",
        layout="wide"
    )

    # Load (and cache, via @st.cache_resource on load_model) the trained model.
    model = load_model()
    if model is None:
        st.error("Failed to load model. Please check model file.")
        return

    st.title("Twi Speech Transcription")
    st.write("Upload an audio file to transcribe Twi speech")

    # File uploader — restricted to the formats librosa can decode here.
    audio_file = st.file_uploader("Choose an audio file", type=['wav', 'mp3', 'ogg'])

    # Optional reference transcription used to compute error metrics below.
    reference_text = st.text_area("Reference text (optional)",
                                  help="Enter the correct transcription to calculate error rates")

    if audio_file is not None:
        if st.button("Transcribe"):
            with st.spinner("Processing audio... This may take a moment."):
                try:
                    # Decode the uploaded stream at its native sample rate.
                    audio_data, sr = librosa.load(audio_file, sr=None)
                    if len(audio_data.shape) > 1:
                        # Downmix multi-channel audio to mono.
                        audio_data = np.mean(audio_data, axis=1)

                    # Extract MFCC features and add a batch dimension.
                    mfcc_features = extract_mfcc(audio_data, sr)
                    mfcc_features = np.expand_dims(mfcc_features, axis=0)

                    # Run the seq2seq model; predict returns one string per batch item.
                    transcription = model.predict(mfcc_features)[0]

                    st.success("Transcription completed!")

                    # BUG FIX: librosa.load consumed the UploadedFile stream,
                    # leaving its pointer at EOF — without rewinding, st.audio
                    # would read an empty buffer and play nothing.
                    audio_file.seek(0)
                    # NOTE(review): format is hard-coded to wav even for
                    # mp3/ogg uploads — consider deriving it from audio_file.type.
                    st.audio(audio_file, format='audio/wav')

                    st.write("### Transcription:")
                    st.write(transcription)

                    st.write("### Audio Details:")
                    st.json({
                        'sample_rate': int(sr),
                        'duration': float(len(audio_data) / sr)
                    })

                    # Error metrics — only when a reference was provided and
                    # calculate_error_rates succeeded (it returns (None, None) on failure).
                    if reference_text:
                        error_wer, error_cer = calculate_error_rates(reference_text, transcription)
                        if error_wer is not None and error_cer is not None:
                            st.write("### Error Metrics:")
                            st.json({
                                'word_error_rate': round(float(error_wer), 4),
                                'character_error_rate': round(float(error_cer), 4)
                            })

                except Exception as e:
                    # UI boundary: surface any processing failure to the user
                    # instead of crashing the Streamlit session.
                    st.error(f"Error processing audio: {str(e)}")

if __name__ == "__main__":
    main()
162
+
163
+ # Requirements for Hugging Face (create a requirements.txt)
164
+ """
165
+ streamlit
166
+ numpy
167
+ librosa
168
+ tensorflow
169
+ jiwer
170
+ soundfile
171
+ """