Browse files
@@ -61,26 +61,45 @@ def preprocess_text(text):
61 |
return ' '.join(lemmatized_tokens)
62 |
63 |
# Extract features from audio
64 |
def extract_features(data, sample_rate):
65 |
66 |
67 |
68 |
69 |
70 |
71 |
72 |
73 |
74 |
75 |
76 |
77 |
78 |
79 |
80 |
81 |
82 |
return result
83 |
84 |
# Predict emotion from text
85 |
def find_emotion_using_text(sample_rate, audio_data, recognizer):
86 |
mapping = {0: "anger", 1: "disgust", 2: "fear", 3: "joy", 4: "neutral", 5: "sadness", 6: "surprise"}
@@ -292,6 +311,9 @@ def respond(message, history: list[tuple[str, str]], system_message, max_tokens,
292 |
# Function to handle video processing and interaction
293 |
def transcribe_and_predict_video(video, user_input, chat_history=[]):
294 |
# Process the video for emotions (use your own emotion detection functions)
295 |
image_emotion = process_video(video)
296 |
text_emotion, audio_emotion,text = process_audio_from_video(video)
297 |
em = [image_emotion, text_emotion, audio_emotion]
61 |
return ' '.join(lemmatized_tokens)
62 |
63 |
# Extract features from audio
64 |
import numpy as np
65 |
import torch
66 |
import torchaudio
67 |
import torchaudio.transforms as T
68 |
69 |
def extract_features(data, sample_rate):
    """Build a flat feature vector describing an audio waveform.

    The vector is the concatenation, in this order, of:

    1. zero-crossing rate,
    2. time-averaged mel spectrogram (fills the "chroma" slot, exactly as
       the previous revision of this function did),
    3. time-averaged MFCCs,
    4. root-mean-square energy,
    5. time-averaged mel spectrogram.

    Args:
        data: Waveform samples with time as the last axis. May be a torch
            tensor or anything ``torch.as_tensor`` accepts (for example a
            NumPy array). Presumably mono 1-D audio -- TODO confirm
            against the callers; extra leading axes are flattened into the
            output.
        sample_rate: Sampling rate of ``data`` in Hz.

    Returns:
        A 1-D ``numpy.ndarray`` holding all features.

    Raises:
        ValueError: If ``data`` contains no samples.
    """
    # Work on a detached CPU float tensor so the torchaudio transforms
    # (whose buffers live on the CPU) and the final .numpy() calls are safe.
    waveform = torch.atleast_1d(
        torch.as_tensor(data, dtype=torch.float32).detach().cpu()
    )
    if waveform.numel() == 0:
        raise ValueError("extract_features() needs a non-empty waveform")

    # Zero Crossing Rate (ZCR)
    # torchaudio.transforms offers no ZeroCrossingRate transform, so count
    # the sign changes between neighbouring samples directly.
    if waveform.shape[-1] > 1:
        is_negative = waveform < 0
        sign_changes = is_negative[..., 1:] != is_negative[..., :-1]
        zcr = sign_changes.float().mean(dim=-1)
    else:
        # A single sample cannot cross zero.
        zcr = torch.zeros(waveform.shape[:-1])

    # Mel Spectrogram, averaged across the time dimension. Computed once and
    # reused below instead of re-running the same transform three times.
    mel = T.MelSpectrogram(sample_rate=sample_rate)(waveform).mean(dim=-1)

    # Chroma Short-Time Fourier Transform (STFT)
    # NOTE(review): the previous revision filled this slot with the mean mel
    # spectrogram rather than a true chromagram; that is preserved here so
    # the slot keeps its size. Swap in a real chroma feature if the
    # downstream model expects one.
    chroma_stft = mel

    # Mel Frequency Cepstral Coefficients (MFCC)
    # n_mfcc must not exceed n_mels (23); the torchaudio default of 40 makes
    # the constructor raise, so request 20 coefficients explicitly.
    mfcc_transform = T.MFCC(
        sample_rate=sample_rate,
        n_mfcc=20,
        melkwargs={"n_fft": 400, "hop_length": 160, "n_mels": 23},
    )
    mfcc = mfcc_transform(waveform).mean(dim=-1)

    # Root Mean Square Energy (RMS): sqrt of the mean squared amplitude.
    rms = waveform.pow(2).mean(dim=-1).sqrt()

    # Flatten every feature so scalars and vectors stack into one 1-D array.
    features = [zcr, chroma_stft, mfcc, rms, mel]
    result = np.hstack([feature.numpy().reshape(-1) for feature in features])

    return result
101 |
102 |
103 |
# Predict emotion from text
104 |
def find_emotion_using_text(sample_rate, audio_data, recognizer):
105 |
mapping = {0: "anger", 1: "disgust", 2: "fear", 3: "joy", 4: "neutral", 5: "sadness", 6: "surprise"}
311 |
# Function to handle video processing and interaction
312 |
def transcribe_and_predict_video(video, user_input, chat_history=[]):
313 |
# Process the video for emotions (use your own emotion detection functions)
314 |
315 |
if chat_history is None:
316 |
chat_history = []
317 |
image_emotion = process_video(video)
318 |
text_emotion, audio_emotion,text = process_audio_from_video(video)
319 |
em = [image_emotion, text_emotion, audio_emotion]