Garvitj committed (verified)
Commit d125e9a · Parent(s): 1d52ab6

Update app.py

Files changed (1):
  1. app.py (+39 -17)
app.py CHANGED
@@ -61,26 +61,45 @@ def preprocess_text(text):
     return ' '.join(lemmatized_tokens)
 
 # Extract features from audio
+import numpy as np
+import torch
+import torchaudio
+import torchaudio.transforms as T
+
 def extract_features(data, sample_rate):
-    result = np.array([])
-    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
-    result = np.hstack((result, zcr))
-
-    stft = np.abs(librosa.stft(data))
-    chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
-    result = np.hstack((result, chroma_stft))
-
-    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
-    result = np.hstack((result, mfcc))
-
-    rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
-    result = np.hstack((result, rms))
-
-    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)
-    result = np.hstack((result, mel))
-
+    features = []  # Collect all features; assumes `data` is a mono 1-D waveform tensor
+
+    # Zero Crossing Rate (ZCR): torchaudio has no ZCR transform, so count sign changes directly
+    signs = torch.sign(data)
+    zcr = torch.mean((signs[1:] != signs[:-1]).float())
+    features.append(zcr.numpy())
+
+    # Mel spectrogram, computed once and reused below
+    mel_spec = T.MelSpectrogram(sample_rate=sample_rate)(data)
+
+    # Chroma stand-in: torchaudio has no stable chroma transform, so reuse the mel spectrogram
+    chroma_stft = torch.mean(mel_spec, dim=-1).numpy()  # Mean across the time dimension
+    features.append(chroma_stft)
+
+    # Mel Frequency Cepstral Coefficients (MFCC); n_mfcc must not exceed n_mels
+    mfcc_transform = T.MFCC(sample_rate=sample_rate, n_mfcc=20, melkwargs={"n_fft": 400, "hop_length": 160, "n_mels": 23})
+    mfcc = mfcc_transform(data)
+    mfcc = torch.mean(mfcc, dim=-1).numpy()  # Mean across the time dimension
+    features.append(mfcc)
+
+    # Root Mean Square Energy (RMS) of the raw waveform
+    rms = torch.sqrt(torch.mean(data ** 2))
+    features.append(rms.numpy())
+
+    # Mel spectrogram feature: mean across the time dimension
+    mel = torch.mean(mel_spec, dim=-1).numpy()
+    features.append(mel)
+
+    # Concatenate all features into a single numpy array
+    result = np.hstack(features)
     return result
 
+
 # Predict emotion from text
 def find_emotion_using_text(sample_rate, audio_data, recognizer):
     mapping = {0: "anger", 1: "disgust", 2: "fear", 3: "joy", 4: "neutral", 5: "sadness", 6: "surprise"}
@@ -292,6 +311,9 @@ def respond(message, history: list[tuple[str, str]], system_message, max_tokens,
 # Function to handle video processing and interaction
 def transcribe_and_predict_video(video, user_input, chat_history=[]):
     # Process the video for emotions (use your own emotion detection functions)
+
+    if chat_history is None:
+        chat_history = []
     image_emotion = process_video(video)
     text_emotion, audio_emotion,text = process_audio_from_video(video)
     em = [image_emotion, text_emotion, audio_emotion]
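
Usage note: a minimal sketch of how the reworked extract_features might be called after loading audio with torchaudio; the file name, the mono downmix, and the final print are illustrative assumptions, not part of app.py.

    import torch
    import torchaudio

    # Hypothetical example: "speech.wav" is a placeholder path
    waveform, sample_rate = torchaudio.load("speech.wav")  # waveform shape: (channels, samples)
    mono = torch.mean(waveform, dim=0)                      # downmix to the 1-D tensor extract_features expects
    features = extract_features(mono, sample_rate)          # 1-D numpy feature vector
    print(features.shape)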