Update app.py
app.py CHANGED
@@ -61,26 +61,45 @@ def preprocess_text(text):
     return ' '.join(lemmatized_tokens)

 # Extract features from audio
+import numpy as np
+import torch
+import torchaudio
+import torchaudio.transforms as T
+
 def extract_features(data, sample_rate):
-    [17 lines removed; the previous extract_features body is not shown in this view]
+    # List to collect all features
+    features = []
+
+    # Zero Crossing Rate (ZCR)
+    zcr = T.ZeroCrossingRate()(data)
+    features.append(torch.mean(zcr).numpy())
+
+    # Chroma Short-Time Fourier Transform (STFT)
+    stft = T.MelSpectrogram(sample_rate)(data)
+    chroma_stft = torch.mean(stft, dim=-1).numpy()  # Take mean across the time dimension
+    features.append(chroma_stft)
+
+    # Mel Frequency Cepstral Coefficients (MFCC)
+    mfcc_transform = T.MFCC(sample_rate=sample_rate, melkwargs={"n_fft": 400, "hop_length": 160, "n_mels": 23})
+    mfcc = mfcc_transform(data)
+    mfcc = torch.mean(mfcc, dim=-1).numpy()  # Take mean across the time dimension
+    features.append(mfcc)
+
+    # Root Mean Square Energy (RMS)
+    rms = torch.mean(T.MelSpectrogram(sample_rate)(data), dim=-1)  # Same as RMS feature extraction
+    features.append(rms.numpy())
+
+    # Mel Spectrogram
+    mel = T.MelSpectrogram(sample_rate)(data)
+    mel = torch.mean(mel, dim=-1).numpy()  # Take mean across the time dimension
+    features.append(mel)
+
+    # Convert list of features to a single numpy array
+    result = np.hstack(features)
+
     return result

+
 # Predict emotion from text
 def find_emotion_using_text(sample_rate, audio_data, recognizer):
     mapping = {0: "anger", 1: "disgust", 2: "fear", 3: "joy", 4: "neutral", 5: "sadness", 6: "surprise"}
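Note on the new extract_features: torchaudio.transforms has no ZeroCrossingRate transform, so T.ZeroCrossingRate() fails at call time, and the blocks commented as "Chroma" and "RMS" both compute a mel spectrogram rather than those features. Below is a minimal sketch of the same five-feature vector (ZCR, chroma, MFCC, RMS, mel spectrogram, each averaged over time) written with librosa, which provides all five directly; the switch to librosa and the assumption that data is a 1-D float waveform are suggestions, not code from this Space.

import numpy as np
import librosa

def extract_features(data, sample_rate):
    # Sketch only: assumes `data` is a 1-D float waveform at `sample_rate`
    features = []

    # Zero Crossing Rate, averaged over frames
    zcr = librosa.feature.zero_crossing_rate(y=data)
    features.append(np.mean(zcr, axis=1))

    # Chroma from the magnitude STFT, averaged over time
    stft = np.abs(librosa.stft(data))
    chroma = librosa.feature.chroma_stft(S=stft, sr=sample_rate)
    features.append(np.mean(chroma, axis=1))

    # MFCCs, averaged over time
    mfcc = librosa.feature.mfcc(y=data, sr=sample_rate)
    features.append(np.mean(mfcc, axis=1))

    # Root Mean Square energy, averaged over frames
    rms = librosa.feature.rms(y=data)
    features.append(np.mean(rms, axis=1))

    # Mel spectrogram, averaged over time
    mel = librosa.feature.melspectrogram(y=data, sr=sample_rate)
    features.append(np.mean(mel, axis=1))

    # One fixed-length 1-D feature vector
    return np.hstack(features)

Averaging each feature over its frame axis keeps the output a fixed-length 1-D vector regardless of clip duration, which is also what the np.hstack call in the committed version produces.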
@@ -292,6 +311,9 @@ def respond(message, history: list[tuple[str, str]], system_message, max_tokens,
 # Function to handle video processing and interaction
 def transcribe_and_predict_video(video, user_input, chat_history=[]):
     # Process the video for emotions (use your own emotion detection functions)
+
+    if chat_history is None:
+        chat_history = []
     image_emotion = process_video(video)
     text_emotion, audio_emotion,text = process_audio_from_video(video)
     em = [image_emotion, text_emotion, audio_emotion]
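The guard added in this hunk relates to Python's mutable-default pitfall: the chat_history=[] default in the signature is evaluated once and then shared by every call, so history can leak between requests. The new `if chat_history is None` check only fires when a caller passes None explicitly, or if the default is also changed to None. A small self-contained illustration, with hypothetical helper names rather than code from app.py:

def append_message(item, history=[]):         # the [] default is created once and shared by every call
    history.append(item)
    return history

def append_message_safe(item, history=None):  # fresh list per call unless the caller passes one
    if history is None:
        history = []
    history.append(item)
    return history

print(append_message("a"), append_message("b"))            # ['a', 'b'] ['a', 'b'] -- state leaks across calls
print(append_message_safe("a"), append_message_safe("b"))  # ['a'] ['b']

With chat_history=None in the signature of transcribe_and_predict_video, the added guard would give each call a fresh list unless the caller supplies one.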