Update app.py
app.py
CHANGED
@@ -6,10 +6,51 @@ from jiwer import wer, cer
 import json
 from io import BytesIO
 import base64
+import tensorflow as tf
 
-
+class TwiTranscriptionModel:
+    def __init__(self, encoder_model, decoder_model, char_tokenizer, max_length=50):
+        self.encoder_model = encoder_model
+        self.decoder_model = decoder_model
+        self.char_tokenizer = char_tokenizer
+        self.max_length = max_length
+        self.sos_token = '<sos>'
+        self.eos_token = '<eos>'
+        self.sos_index = char_tokenizer.word_index[self.sos_token]
+        self.eos_index = char_tokenizer.word_index[self.eos_token]
+
+    def predict(self, audio_features):
+        batch_size = audio_features.shape[0]
+        transcriptions = []
+
+        for i in range(batch_size):
+            states_value = self.encoder_model.predict(
+                audio_features[i:i+1],
+                verbose=0
+            )
+            target_seq = np.array([[self.sos_index]])
+            decoded_chars = []
+
+            for _ in range(self.max_length):
+                output_tokens, h, c = self.decoder_model.predict(
+                    [target_seq] + states_value,
+                    verbose=0
+                )
+
+                sampled_token_index = np.argmax(output_tokens[0, -1, :])
+                sampled_char = self.char_tokenizer.index_word.get(sampled_token_index, '')
+
+                if sampled_char == self.eos_token or len(decoded_chars) > self.max_length:
+                    break
+
+                decoded_chars.append(sampled_char)
+                target_seq = np.array([[sampled_token_index]])
+                states_value = [h, c]
+
+            transcriptions.append(''.join(decoded_chars))
+
+        return transcriptions
 
-# Add this at the top of your file
 class ChunkedUploader:
     def __init__(self):
         if 'chunks' not in st.session_state:
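The class added above runs greedy, character-level seq2seq decoding: the encoder turns one clip's MFCC features into LSTM states, and the decoder is fed one character index at a time, starting from <sos>, until it emits <eos> or hits max_length. The pickled encoder and decoder themselves are not shown in this commit; a hypothetical pair of Keras inference models that would satisfy the interface the loop relies on (encoder states returned as a list, decoder returning a softmax over characters plus new states) might look like the sketch below, where the layer sizes and vocabulary size are assumptions:

    from tensorflow.keras.layers import Dense, Embedding, Input, LSTM
    from tensorflow.keras.models import Model

    latent_dim = 256   # assumed hidden size
    vocab_size = 60    # assumed character-vocabulary size

    # Encoder: MFCC sequence -> initial decoder states [h, c]
    encoder_inputs = Input(shape=(1000, 13))   # matches the extract_mfcc output added in the next hunk
    _, state_h, state_c = LSTM(latent_dim, return_state=True)(encoder_inputs)
    encoder_model = Model(encoder_inputs, [state_h, state_c])

    # Decoder: previous character index + states -> next-character distribution + new states
    decoder_inputs = Input(shape=(1,))
    state_h_in = Input(shape=(latent_dim,))
    state_c_in = Input(shape=(latent_dim,))
    x = Embedding(vocab_size, latent_dim)(decoder_inputs)
    x, h, c = LSTM(latent_dim, return_sequences=True, return_state=True)(
        x, initial_state=[state_h_in, state_c_in]
    )
    output_tokens = Dense(vocab_size, activation='softmax')(x)
    decoder_model = Model([decoder_inputs, state_h_in, state_c_in], [output_tokens, h, c])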
@@ -38,10 +79,63 @@ class ChunkedUploader:
             del st.session_state.chunks[upload_id]
             return complete_data
 
+@st.cache_resource
+def get_model():
+    try:
+        model_path = 'twi_transcription_model.pkl'
+        st.write(f"Looking for model at: {model_path}")  # Debug info
+
+        with open(model_path, 'rb') as f:
+            model_data = pickle.load(f)
+
+        model = TwiTranscriptionModel(
+            model_data['encoder_model'],
+            model_data['decoder_model'],
+            model_data['char_tokenizer'],
+            model_data['max_length']
+        )
+        st.write("Model loaded successfully")  # Debug info
+        return model
+    except Exception as e:
+        st.error(f"Error loading model: {str(e)}")
+        return None
+
+def extract_mfcc(audio_data, sr=16000, n_mfcc=13):
+    if sr != 16000:
+        audio_data = librosa.resample(y=audio_data, orig_sr=sr, target_sr=16000)
+
+    mfcc = librosa.feature.mfcc(y=audio_data, sr=16000, n_mfcc=n_mfcc)
+
+    max_length = 1000  # Adjust based on your model's requirements
+    if mfcc.shape[1] > max_length:
+        mfcc = mfcc[:, :max_length]
+    else:
+        mfcc = np.pad(mfcc, ((0, 0), (0, max_length - mfcc.shape[1])), mode='constant')
+
+    return mfcc.T
+
+def calculate_error_rates(reference, hypothesis):
+    try:
+        error_wer = wer(reference, hypothesis)
+        error_cer = cer(reference, hypothesis)
+        return error_wer, error_cer
+    except Exception as e:
+        return None, None
+
+def process_audio_bytes(audio_bytes):
+    try:
+        audio_data, sr = librosa.load(BytesIO(audio_bytes), sr=None)
+        if len(audio_data.shape) > 1:
+            audio_data = np.mean(audio_data, axis=1)
+        return audio_data, sr
+    except Exception as e:
+        raise Exception(f"Error processing audio: {str(e)}")
+
 def main():
-
-    chunked_uploader = ChunkedUploader()
+    st.set_page_config(page_title="Twi Speech API")
 
+    # Initialize model
+    model = get_model()
     if model is None:
         st.write(json.dumps({
             'error': 'Failed to load model',
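calculate_error_rates() above is a thin wrapper over jiwer's wer and cer; both return 0.0 on an exact match and the fraction of word or character edits otherwise. A quick illustration with made-up strings (not from the app):

    from jiwer import wer, cer

    reference = "me din de kofi"
    hypothesis = "me din kofi"

    print(wer(reference, hypothesis))   # 1 deleted word out of 4 -> 0.25
    print(cer(reference, hypothesis))   # character error rate for the same pair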
@@ -49,6 +143,9 @@ def main():
         }))
         return
 
+    # Initialize chunked uploader
+    chunked_uploader = ChunkedUploader()
+
     # Check if this is an API request
     if "api" in st.query_params:
         try:
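Taken together, once the API branch has reassembled the uploaded audio bytes (that handling sits outside these hunks), the new helpers are presumably combined roughly as in the sketch below; the function name and the batch handling are illustrative, not part of the diff:

    import numpy as np

    def transcribe_bytes(model, audio_bytes):
        # Sketch: decode one audio payload with the helpers added in this commit.
        audio_data, sr = process_audio_bytes(audio_bytes)  # raw bytes -> mono float array + rate
        features = extract_mfcc(audio_data, sr)            # (1000, 13) MFCC matrix at 16 kHz
        batch = np.expand_dims(features, axis=0)           # TwiTranscriptionModel.predict expects a batch
        return model.predict(batch)[0]                     # greedy character-level decoding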
|