sedemkofi committed on
Commit
e2e2e8b
·
verified ·
1 Parent(s): 2e1e1ee

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +113 -55
app.py CHANGED
@@ -7,12 +7,48 @@ import json
7
  from io import BytesIO
8
  import base64
9
 
10
- # Set page config
11
- st.set_page_config(page_title="Twi Speech API")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- # Get request method and body
14
- request_method = st.experimental_get_query_params().get("_stcore_method", ["GET"])[0]
15
- request_json = st.experimental_get_query_params().get("_stcore_body", ["{}"])[0]
 
 
 
 
 
 
 
 
 
 
16
 
17
  @st.cache_resource
18
  def get_model():
@@ -26,51 +62,81 @@ def get_model():
26
  model_data['max_length']
27
  )
28
  except Exception as e:
 
29
  return None
30
 
31
- def process_request():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  model = get_model()
33
 
34
  if model is None:
35
- return {
36
  'error': 'Failed to load model',
37
  'status': 'error'
38
- }
 
39
 
40
- try:
41
- # Parse request body
42
- if request_method == "POST":
 
 
43
  try:
44
- body = json.loads(request_json)
45
  except json.JSONDecodeError:
46
- return {
47
  'error': 'Invalid JSON data',
48
  'status': 'error'
49
- }
50
- else:
51
- return {
52
- 'error': 'Method not allowed. Use POST request.',
53
- 'status': 'error'
54
- }
55
-
56
- # Get audio data
57
- audio_base64 = body.get('audio')
58
- reference_text = body.get('reference_text')
59
-
60
- if not audio_base64:
61
- return {
62
- 'error': 'No audio data provided',
63
- 'status': 'error'
64
- }
65
 
66
- # Process audio
67
- try:
68
- audio_bytes = base64.b64decode(audio_base64)
69
- audio_data, sr = librosa.load(BytesIO(audio_bytes), sr=None)
70
 
71
- if len(audio_data.shape) > 1:
72
- audio_data = np.mean(audio_data, axis=1)
 
 
 
 
73
 
 
 
 
 
74
  # Extract features
75
  mfcc_features = extract_mfcc(audio_data, sr)
76
  mfcc_features = np.expand_dims(mfcc_features, axis=0)
@@ -96,27 +162,19 @@ def process_request():
96
  'word_error_rate': round(float(error_wer), 4),
97
  'character_error_rate': round(float(error_cer), 4)
98
  }
99
-
100
- return response
101
 
102
  except Exception as e:
103
- return {
104
- 'error': f'Error processing audio: {str(e)}',
105
  'status': 'error'
106
- }
107
-
108
- except Exception as e:
109
- return {
110
- 'error': str(e),
111
  'status': 'error'
112
- }
113
-
114
- # Main execution
115
- if "api" in st.experimental_get_query_params():
116
- response = process_request()
117
- st.write(json.dumps(response))
118
- else:
119
- st.write(json.dumps({
120
- 'error': 'Please use the API endpoint with ?api=true',
121
- 'status': 'error'
122
- }))
 
7
  from io import BytesIO
8
  import base64
9
 
10
+ class TwiTranscriptionModel:
11
+ def __init__(self, encoder_model, decoder_model, char_tokenizer, max_length=50):
12
+ self.encoder_model = encoder_model
13
+ self.decoder_model = decoder_model
14
+ self.char_tokenizer = char_tokenizer
15
+ self.max_length = max_length
16
+ self.sos_token = '<sos>'
17
+ self.eos_token = '<eos>'
18
+ self.sos_index = char_tokenizer.word_index[self.sos_token]
19
+ self.eos_index = char_tokenizer.word_index[self.eos_token]
20
+
21
+ def predict(self, audio_features):
22
+ batch_size = audio_features.shape[0]
23
+ transcriptions = []
24
+
25
+ for i in range(batch_size):
26
+ states_value = self.encoder_model.predict(
27
+ audio_features[i:i+1],
28
+ verbose=0
29
+ )
30
+ target_seq = np.array([[self.sos_index]])
31
+ decoded_chars = []
32
+
33
+ for _ in range(self.max_length):
34
+ output_tokens, h, c = self.decoder_model.predict(
35
+ [target_seq] + states_value,
36
+ verbose=0
37
+ )
38
 
39
+ sampled_token_index = np.argmax(output_tokens[0, -1, :])
40
+ sampled_char = self.char_tokenizer.index_word.get(sampled_token_index, '')
41
+
42
+ if sampled_char == self.eos_token or len(decoded_chars) > self.max_length:
43
+ break
44
+
45
+ decoded_chars.append(sampled_char)
46
+ target_seq = np.array([[sampled_token_index]])
47
+ states_value = [h, c]
48
+
49
+ transcriptions.append(''.join(decoded_chars))
50
+
51
+ return transcriptions
52
 
53
  @st.cache_resource
54
  def get_model():
 
62
  model_data['max_length']
63
  )
64
  except Exception as e:
65
+ st.error(f"Error loading model: {str(e)}")
66
  return None
67
 
68
def extract_mfcc(audio_data, sr=16000, n_mfcc=13, max_length=1000):
    """Extract a fixed-size MFCC feature matrix from a mono waveform.

    Args:
        audio_data: 1-D audio signal.
        sr: sample rate of ``audio_data``; resampled to 16 kHz if different.
        n_mfcc: number of MFCC coefficients per frame.
        max_length: number of time frames to pad/truncate to. Previously a
            hard-coded constant; now a parameter (default unchanged) so the
            width can follow the model's expected input shape.

    Returns:
        Array of shape ``(max_length, n_mfcc)`` — time frames along axis 0.
    """
    # The model operates on 16 kHz audio, so normalize the rate first.
    if sr != 16000:
        audio_data = librosa.resample(y=audio_data, orig_sr=sr, target_sr=16000)

    mfcc = librosa.feature.mfcc(y=audio_data, sr=16000, n_mfcc=n_mfcc)

    # Truncate or zero-pad along the time axis to a fixed width.
    if mfcc.shape[1] > max_length:
        mfcc = mfcc[:, :max_length]
    else:
        mfcc = np.pad(mfcc, ((0, 0), (0, max_length - mfcc.shape[1])), mode='constant')

    # Transpose to (time, features) as the downstream model expects.
    return mfcc.T
81
+
82
def calculate_error_rates(reference, hypothesis):
    """Compute (WER, CER) for ``hypothesis`` against ``reference``.

    Returns ``(None, None)`` when either metric cannot be computed, so
    callers can degrade gracefully instead of failing the whole request.
    """
    try:
        return wer(reference, hypothesis), cer(reference, hypothesis)
    except Exception:
        # Best-effort: a metric failure must not break the transcription response.
        return None, None
89
+
90
def process_audio_bytes(audio_bytes):
    """Decode raw audio file bytes into a mono waveform.

    Args:
        audio_bytes: encoded audio file contents (e.g. WAV) as ``bytes``.

    Returns:
        Tuple ``(audio_data, sr)``: 1-D float waveform and its native
        sample rate (``sr=None`` preserves the file's own rate).

    Raises:
        Exception: wrapping any decode failure with a descriptive message.
    """
    try:
        audio_data, sr = librosa.load(BytesIO(audio_bytes), sr=None)
        # Downmix multi-channel audio to mono by averaging channels.
        if len(audio_data.shape) > 1:
            audio_data = np.mean(audio_data, axis=1)
        return audio_data, sr
    except Exception as e:
        # Chain the cause so the underlying decode error's traceback is
        # preserved instead of being discarded by the re-raise.
        raise Exception(f"Error processing audio: {str(e)}") from e
98
+
99
+ # Set page config
100
+ st.set_page_config(page_title="Twi Speech API")
101
+
102
+ def main():
103
  model = get_model()
104
 
105
  if model is None:
106
+ st.write(json.dumps({
107
  'error': 'Failed to load model',
108
  'status': 'error'
109
+ }))
110
+ return
111
 
112
+ # Check if this is an API request
113
+ if "api" in st.query_params:
114
+ try:
115
+ # Get the request body
116
+ body_data = st.query_params.get("data", "{}")
117
  try:
118
+ data = json.loads(body_data)
119
  except json.JSONDecodeError:
120
+ st.write(json.dumps({
121
  'error': 'Invalid JSON data',
122
  'status': 'error'
123
+ }))
124
+ return
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
+ audio_base64 = data.get('audio')
127
+ reference_text = data.get('reference_text')
 
 
128
 
129
+ if not audio_base64:
130
+ st.write(json.dumps({
131
+ 'error': 'No audio data provided',
132
+ 'status': 'error'
133
+ }))
134
+ return
135
 
136
+ # Process audio
137
+ audio_bytes = base64.b64decode(audio_base64)
138
+ audio_data, sr = process_audio_bytes(audio_bytes)
139
+
140
  # Extract features
141
  mfcc_features = extract_mfcc(audio_data, sr)
142
  mfcc_features = np.expand_dims(mfcc_features, axis=0)
 
162
  'word_error_rate': round(float(error_wer), 4),
163
  'character_error_rate': round(float(error_cer), 4)
164
  }
165
+
166
+ st.write(json.dumps(response))
167
 
168
  except Exception as e:
169
+ st.write(json.dumps({
170
+ 'error': str(e),
171
  'status': 'error'
172
+ }))
173
+ else:
174
+ st.write(json.dumps({
175
+ 'error': 'Please use the API endpoint with ?api=true',
 
176
  'status': 'error'
177
+ }))
178
+
179
+ if __name__ == "__main__":
180
+ main()