Garvitj committed
Commit 4a21329 · verified · 1 Parent(s): cc320e4

Update app.py

Files changed (1)
  1. app.py +230 -53
app.py CHANGED
@@ -1,63 +1,240 @@
  import gradio as gr
- from transformers import AutoModelForCausalLM, AutoTokenizer
- import tempfile
  import numpy as np
+ import cv2
+ import librosa
+ import speech_recognition as sr
+ import tempfile
+ import wave
+ import os
+ import tensorflow as tf
+ from tensorflow.keras.preprocessing.text import tokenizer_from_json
+ from tensorflow.keras.models import load_model, model_from_json
+ from sklearn.preprocessing import StandardScaler
+ from tensorflow.keras.preprocessing.sequence import pad_sequences
+ import nltk
+ from nltk.corpus import stopwords
+ from nltk.stem import WordNetLemmatizer
+ import pickle
+ import json
+ from tensorflow.keras.preprocessing.image import img_to_array, load_img
+ from collections import Counter
+ from pydub import AudioSegment
+ import ffmpeg
+
+ nltk.download('punkt')      # Tokenizer
+ nltk.download('wordnet')    # WordNet lemmatizer
+ nltk.download('stopwords')  # Stopwords
+
+ # Load the text model
+ with open('model_architecture_for_text_emotion_updated_json.json', 'r') as json_file:
+     model_json = json_file.read()
+ text_model = model_from_json(model_json)
+ text_model.load_weights("model_for_text_emotion_updated(1).keras")
+
+ # Load the encoder and scaler for audio
+ with open('encoder.pkl', 'rb') as file:
+     encoder = pickle.load(file)
+ with open('scaler.pkl', 'rb') as file:
+     scaler = pickle.load(file)
+
+ # Load the tokenizer for text
+ with open('tokenizer.json') as json_file:
+     tokenizer_json = json.load(json_file)
+     tokenizer = tokenizer_from_json(tokenizer_json)
+
+ # Load the audio model
+ audio_model = load_model('my_model.h5')
+
+ # Load the image model
+ image_model = load_model('model_emotion.h5')
+
+ # Initialize NLTK
+ lemmatizer = WordNetLemmatizer()
+ stop_words = set(stopwords.words('english'))
+
+ # Preprocess text function
+ def preprocess_text(text):
+     tokens = nltk.word_tokenize(text.lower())
+     tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
+     lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
+     return ' '.join(lemmatized_tokens)
+
+ # Extract features from audio
+ def extract_features(data, sample_rate):
+     result = np.array([])
+     zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
+     result = np.hstack((result, zcr))
+
+     stft = np.abs(librosa.stft(data))
+     chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
+     result = np.hstack((result, chroma_stft))
+
+     mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
+     result = np.hstack((result, mfcc))
+
+     rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
+     result = np.hstack((result, rms))
+
+     mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)
+     result = np.hstack((result, mel))
+
+     return result
+
+ # Predict emotion from text
+ def find_emotion_using_text(sample_rate, audio_data, recognizer):
+     mapping = {0: "anger", 1: "disgust", 2: "fear", 3: "joy", 4: "neutral", 5: "sadness", 6: "surprise"}
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio_file:
+         temp_audio_path = temp_audio_file.name
+
+     with wave.open(temp_audio_path, 'w') as wf:
+         wf.setnchannels(1)
+         wf.setsampwidth(2)
+         wf.setframerate(sample_rate)
+         wf.writeframes(audio_data.tobytes())
+
+     with sr.AudioFile(temp_audio_path) as source:
+         audio_record = recognizer.record(source)
+         text = recognizer.recognize_google(audio_record)
+         pre_text = preprocess_text(text)
+         title_seq = tokenizer.texts_to_sequences([pre_text])
+         padded_title_seq = pad_sequences(title_seq, maxlen=35, padding='post', truncating='post')
+         inp1 = np.array(padded_title_seq)
+         text_prediction = text_model.predict(inp1)
+
+     os.remove(temp_audio_path)
+     max_index = text_prediction.argmax()
+     return mapping[max_index]
+
+ # Predict emotion from audio
+ def predict_emotion(audio_data):
+     sample_rate, data = audio_data
+     data = data.flatten()
+
+     if data.dtype != np.float32:
+         data = data.astype(np.float32)
+     data = data / np.max(np.abs(data))
+
+     features = extract_features(data, sample_rate)
+     features = np.expand_dims(features, axis=0)
+
+     if features.ndim == 3:
+         features = np.squeeze(features, axis=2)
+     elif features.ndim != 2:
+         raise ValueError("Features array has unexpected dimensions.")
+
+     scaled_features = scaler.transform(features)
+     scaled_features = np.expand_dims(scaled_features, axis=2)
+
+     prediction = audio_model.predict(scaled_features)
+     emotion_index = np.argmax(prediction)
+
+     num_classes = len(encoder.categories_[0])
+     emotion_array = np.zeros((1, num_classes))
+     emotion_array[0, emotion_index] = 1
+
+     emotion_label = encoder.inverse_transform(emotion_array)[0]
+     return emotion_label
+
+ def preprocess_image(image):
+     image = load_img(image, target_size=(48, 48), color_mode="grayscale")
+     image = img_to_array(image)
+     image = np.expand_dims(image, axis=0)
+     image = image / 255.0
+     return image
+
+ # Predict emotion from image
+ def predict_emotion_from_image(image):
+     preprocessed_image = preprocess_image(image)
+     prediction = image_model.predict(preprocessed_image)
+     emotion_index = np.argmax(prediction)
+
+     mapping = {0: "anger", 1: "disgust", 2: "fear", 3: "joy", 4: "neutral", 5: "sadness", 6: "surprise"}
+     return mapping[emotion_index]
+
+ def process_video(video_path):
+     cap = cv2.VideoCapture(video_path)
+     frame_rate = cap.get(cv2.CAP_PROP_FPS)
+
+     frame_count = 0
+     predictions = []
+
+     while cap.isOpened():
+         ret, frame = cap.read()
+         if not ret:
+             break
+
+         # Process every nth frame (to speed up processing)
+         if frame_count % int(frame_rate) == 0:
+             # Convert frame to grayscale as required by the image model
+             frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
+             frame = cv2.resize(frame, (48, 48))  # Resize to match model input size
+             frame = img_to_array(frame)
+             frame = np.expand_dims(frame, axis=0) / 255.0
+
+             # Predict emotion
+             prediction = image_model.predict(frame)
+             predictions.append(np.argmax(prediction))
+
+         frame_count += 1
+
+     cap.release()
+     cv2.destroyAllWindows()
+
+     # Find the most common prediction
+     most_common_emotion = Counter(predictions).most_common(1)[0][0]
+     mapping = {0: "anger", 1: "disgust", 2: "fear", 3: "joy", 4: "neutral", 5: "sadness", 6: "surprise"}
+     return mapping[most_common_emotion]
+
+

- # Initialize LLaMA Model for Question Answering
- llama_model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1")
- llama_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
+ # Process audio from video and predict emotions
+ def process_audio_from_video(video_path):
+     audio_path = video_path.replace(".mp4", ".wav")

- # Updated transcribe_and_predict_video function from your code
+     try:
+         # Extract audio using FFmpeg
+         ffmpeg.input(video_path).output(audio_path, format='wav', acodec='pcm_s16le', ac=1, ar='16000').run(overwrite_output=True)
+
+         recognizer = sr.Recognizer()
+
+         with sr.AudioFile(audio_path) as source:
+             audio_record = recognizer.record(source)
+             text = recognizer.recognize_google(audio_record)
+             pre_text = preprocess_text(text)
+             title_seq = tokenizer.texts_to_sequences([pre_text])
+             padded_title_seq = pad_sequences(title_seq, maxlen=35, padding='post', truncating='post')
+             inp1 = np.array(padded_title_seq)
+             text_prediction = text_model.predict(inp1)
+
+         max_index = text_prediction.argmax()
+         text_emotion = {0: "anger", 1: "disgust", 2: "fear", 3: "joy", 4: "neutral", 5: "sadness", 6: "surprise"}[max_index]
+
+         # Load audio with pydub for NumPy conversion
+         audio_segment = AudioSegment.from_wav(audio_path)
+         sound_array = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
+
+         # Predict emotion from audio
+         audio_emotion = predict_emotion((16000, sound_array))
+
+         os.remove(audio_path)
+
+     except Exception as e:
+         print(f"Error processing audio: {e}")
+         audio_emotion = "Error in audio processing"
+
+     return text_emotion, audio_emotion
+
+ # Main function to handle video emotion recognition
  def transcribe_and_predict_video(video):
-     # Process video frames for image-based emotion recognition
      image_emotion = process_video(video)
-
-     # Process audio for text and audio-based emotion recognition
      text_emotion, audio_emotion = process_audio_from_video(video)
-
-     # Determine the overall emotion (could be based on majority vote or some other logic)
-     overall_emotion = Counter([text_emotion, audio_emotion, image_emotion]).most_common(1)[0][0]
-
-     return overall_emotion
-
- # Emotion-aware Question Answering with LLM
- def emotion_aware_qa(question, video):
-     # Get the emotion from the video (this uses the emotion detection you already implemented)
-     detected_emotion = transcribe_and_predict_video(video)
-
-     # Create a custom response context based on the detected emotion
-     if detected_emotion == 'joy':
-         emotion_context = "You're in a good mood! Let's keep the positivity going."
-     elif detected_emotion == 'sadness':
-         emotion_context = "It seems like you're feeling a bit down. Let me help with that."
-     elif detected_emotion == 'anger':
-         emotion_context = "I sense some frustration. Let's work through it together."
-     elif detected_emotion == 'fear':
-         emotion_context = "It sounds like you're anxious. How can I assist in calming things down?"
-     elif detected_emotion == 'neutral':
-         emotion_context = "You're feeling neutral. How can I help you today?"
-     else:
-         emotion_context = "You're in an uncertain emotional state. Let me guide you."
-
-     # Prepare the prompt for LLaMA, including emotion context and user question
-     prompt = f"{emotion_context} User asks: {question}"
-
-     # Tokenize and generate response from LLaMA
-     inputs = llama_tokenizer(prompt, return_tensors="pt")
-     outputs = llama_model.generate(inputs['input_ids'], max_length=150)
-     answer = llama_tokenizer.decode(outputs[0], skip_special_tokens=True)
-
-     return answer
-
- # Create Gradio interface to interact with the LLM and video emotion detection
- def gradio_interface(question, video):
-     response = emotion_aware_qa(question, video)
-     return response
-
- iface = gr.Interface(fn=gradio_interface,
-                      inputs=["text", gr.Video()],
+     return f"Text Emotion: {text_emotion}, Audio Emotion: {audio_emotion}, Image Emotion: {image_emotion}"
+
+ # Create Gradio interface
+ iface = gr.Interface(fn=transcribe_and_predict_video,
+                      inputs=gr.Video(),
                       outputs="text",
-                      title="Emotion-Aware Question Answering",
-                      description="Ask a question and get an emotion-aware response based on the video.")
+                      title="Multimodal Emotion Recognition from Video",
+                      description="Upload a video to get text, audio, and image emotion predictions.")

  iface.launch()
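
The updated transcribe_and_predict_video now returns the three per-modality labels as one string instead of the fused label the previous version computed. If a single overall emotion is still wanted, the removed Counter-based majority vote can be applied on top of the three labels. The snippet below is a rough sketch, not part of this commit; overall_emotion is a hypothetical helper and the sample labels are placeholders.

from collections import Counter

# Hypothetical helper (not in app.py): fuse the three per-modality labels.
def overall_emotion(text_emotion, audio_emotion, image_emotion):
    # Majority vote across modalities; on a tie, the label seen first wins.
    return Counter([text_emotion, audio_emotion, image_emotion]).most_common(1)[0][0]

print(overall_emotion("joy", "neutral", "joy"))  # -> joy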