ParthCodes committed
Commit aa0a778 · verified · 1 Parent(s): 02454e2

Update app.py

Files changed (1)
  1. app.py +310 -9
app.py CHANGED
@@ -1,22 +1,323 @@
+import math
+import os
+from io import BytesIO
 import gradio as gr
 import cv2
+from PIL import Image
+import requests
+from transformers import pipeline
+from pydub import AudioSegment
+from faster_whisper import WhisperModel
+import joblib
+import mediapipe as mp
+import numpy as np
+import pandas as pd
 import moviepy.editor as mpe
-from moviepy.editor import VideoFileClip

-def process(video_path):
-    print(video_path)
-
+theme = gr.themes.Base(
+    primary_hue="cyan",
+    secondary_hue="blue",
+    neutral_hue="slate",
+)
+
+model = WhisperModel("small", device="cpu", compute_type="int8")
+
+body_lang_model = joblib.load('body_language.pkl')
+
+mp_holistic = mp.solutions.holistic
+holistic = mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5)
+
+mp_face_mesh = mp.solutions.face_mesh
+face_mesh = mp_face_mesh.FaceMesh(min_detection_confidence=0.5, min_tracking_confidence=0.5)
+
+API_KEY = os.getenv('HF_API_KEY')
+
+pipe1 = pipeline("image-classification", model="dima806/facial_emotions_image_detection")
+pipe2 = pipeline("text-classification", model="SamLowe/roberta-base-go_emotions")
+AUDIO_API_URL = "https://api-inference.huggingface.co/models/ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
+headers = {"Authorization": "Bearer " + API_KEY + ""}
+
+def extract_frames(video_path):
     clip = mpe.VideoFileClip(video_path)
     clip.write_videofile('mp4file.mp4', fps=60)
-
+
     cap = cv2.VideoCapture('mp4file.mp4')
     fps = int(cap.get(cv2.CAP_PROP_FPS))
     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
     interval = int(fps/2)
     print(interval, total_frames)
-    return interval, total_frames

-demo = gr.Interface(fn=process, inputs=gr.Video(format='mp4'), outputs=["textbox", "textbox"], title="Video Frame Counter")
+    result = []
+    distract_count = 0
+    total_count = 0
+    output_list = []
+
+    for i in range(0, total_frames, interval):
+        total_count += 1
+        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
+        ret, frame = cap.read()
+
+        if ret:
+            image = cv2.cvtColor(cv2.flip(frame, 1), cv2.COLOR_BGR2RGB)
+            image.flags.writeable = False
+            results = face_mesh.process(image)
+            image.flags.writeable = True
+            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
+
+            img_h, img_w, img_c = image.shape
+            face_3d = []
+            face_2d = []
+
+            flag = False
+
+            if results.multi_face_landmarks:
+                for face_landmarks in results.multi_face_landmarks:
+                    for idx, lm in enumerate(face_landmarks.landmark):
+                        if idx == 33 or idx == 263 or idx == 1 or idx == 61 or idx == 291 or idx == 199:
+                            if idx == 1:
+                                nose_2d = (lm.x * img_w, lm.y * img_h)
+                                nose_3d = (lm.x * img_w, lm.y * img_h, lm.z * 3000)
+
+                            x, y = int(lm.x * img_w), int(lm.y * img_h)
+                            face_2d.append([x, y])
+                            face_3d.append([x, y, lm.z])
+                    face_2d = np.array(face_2d, dtype=np.float64)
+                    face_3d = np.array(face_3d, dtype=np.float64)
+                    focal_length = 1 * img_w
+                    cam_matrix = np.array([ [focal_length, 0, img_h / 2],
+                                            [0, focal_length, img_w / 2],
+                                            [0, 0, 1]])
+                    dist_matrix = np.zeros((4, 1), dtype=np.float64)
+                    success, rot_vec, trans_vec = cv2.solvePnP(face_3d, face_2d, cam_matrix, dist_matrix)
+                    rmat, jac = cv2.Rodrigues(rot_vec)
+                    angles, mtxR, mtxQ, Qx, Qy, Qz = cv2.RQDecomp3x3(rmat)
+                    x = angles[0] * 360
+                    y = angles[1] * 360
+                    z = angles[2] * 360
+
+                    if y < -7 or y > 7 or x < -7 or x > 7:
+                        flag = True
+                    else:
+                        flag = False
+
+            if flag == True:
+                distract_count += 1
+
+            image2 = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            results2 = holistic.process(image2)
+
+            pose = results2.pose_landmarks.landmark
+            pose_row = list(np.array([[landmark.x, landmark.y, landmark.z, landmark.visibility] for landmark in pose]).flatten())
+
+            face = results2.face_landmarks.landmark
+            face_row = list(np.array([[landmark.x, landmark.y, landmark.z, landmark.visibility] for landmark in face]).flatten())
+
+            row = pose_row+face_row
+
+            X = pd.DataFrame([row])
+            body_language_class = body_lang_model.predict(X)[0]
+            body_language_prob = body_lang_model.predict_proba(X)[0]
+
+            output_dict = {}
+            for class_name, prob in zip(body_lang_model.classes_, body_language_prob):
+                output_dict[class_name] = prob
+
+            output_list.append(output_dict)
+
+            pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+            response = pipe1(pil_image)
+
+            temp = {}
+            for ele in response:
+                label, score = ele.values()
+                temp[label] = score
+            result.append(temp)
+
+    distraction_rate = distract_count/total_count
+
+    total_bad_prob = 0
+    total_good_prob = 0
+
+    for output_dict in output_list:
+        total_bad_prob += output_dict['Bad']
+        total_good_prob += output_dict['Good']
+
+    num_frames = len(output_list)
+    avg_bad_prob = total_bad_prob / num_frames
+    avg_good_prob = total_good_prob / num_frames
+
+    final_output = {'Bad': avg_bad_prob, 'Good': avg_good_prob}
+
+    cap.release()
+
+    video_emotion_totals = {}
+    emotion_totals = { 'admiration': 0.0, 'amusement': 0.0, 'angry': 0.0, 'annoyance': 0.0, 'approval': 0.0, 'caring': 0.0, 'confusion': 0.0, 'curiosity': 0.0, 'desire': 0.0, 'disappointment': 0.0, 'disapproval': 0.0, 'disgust': 0.0, 'embarrassment': 0.0, 'excitement': 0.0, 'fear': 0.0, 'gratitude': 0.0, 'grief': 0.0, 'happy': 0.0, 'love': 0.0, 'nervousness': 0.0, 'optimism': 0.0, 'pride': 0.0, 'realization': 0.0, 'relief': 0.0, 'remorse': 0.0, 'sad': 0.0, 'surprise': 0.0, 'neutral': 0.0 }
+    counter = 0
+    for ele in result:
+        for emotion in ele.keys():
+            emotion_totals[emotion] += ele.get(emotion)
+        counter += 1
+
+    for emotion in emotion_totals:
+        emotion_totals[emotion] /= counter
+        if (emotion_totals[emotion]) > 0.0:
+            video_emotion_totals[emotion] = emotion_totals[emotion]
+
+    return video_emotion_totals, result, final_output, distraction_rate
+
+
+def analyze_sentiment(text):
+    response = pipe2(text)
+    sentiment_results = {}
+    for ele in response:
+        label, score = ele.values()
+        sentiment_results[label] = score
+    return sentiment_results
+
+
+def video_to_audio(input_video):
+
+    video_emotion_totals, frames_sentiments, body_language, distraction_rate = extract_frames(input_video)
+    print("Total Video Emotions ... Done")
+    print("Video Frame Sentiment ... Done")
+    print("Body Language ... Done")
+    print("Distraction Rate ... Done")
+
+    cap = cv2.VideoCapture(input_video)
+    fps = int(cap.get(cv2.CAP_PROP_FPS))
+    audio = AudioSegment.from_file(input_video)
+    audio_binary = audio.export(format="wav").read()
+    audio_bytesio = BytesIO(audio_binary)
+    audio_bytesio2 = BytesIO(audio_binary)
+
+    response = requests.post(AUDIO_API_URL, headers=headers, data=audio_bytesio)
+    formatted_response = {}
+    for ele in response.json():
+        score, label = ele.values()
+        formatted_response[label] = score
+
+    print("Speech Sentiments ... Done")
+
+    segments, info = model.transcribe(audio_bytesio2, beam_size=5)
+
+    transcript = ''
+    video_sentiment_final = []
+    final_output = []
+
+    for segment in segments:
+        transcript = transcript + segment.text + " "
+        transcript_segment_sentiment = analyze_sentiment(segment.text)
+
+        emotion_totals = {
+            'admiration': 0.0,
+            'amusement': 0.0,
+            'angry': 0.0,
+            'annoyance': 0.0,
+            'approval': 0.0,
+            'caring': 0.0,
+            'confusion': 0.0,
+            'curiosity': 0.0,
+            'desire': 0.0,
+            'disappointment': 0.0,
+            'disapproval': 0.0,
+            'disgust': 0.0,
+            'embarrassment': 0.0,
+            'excitement': 0.0,
+            'fear': 0.0,
+            'gratitude': 0.0,
+            'grief': 0.0,
+            'happy': 0.0,
+            'love': 0.0,
+            'nervousness': 0.0,
+            'optimism': 0.0,
+            'pride': 0.0,
+            'realization': 0.0,
+            'relief': 0.0,
+            'remorse': 0.0,
+            'sad': 0.0,
+            'surprise': 0.0,
+            'neutral': 0.0
+        }
+
+        counter = 0
+        for i in range(math.ceil(segment.start), math.floor(segment.end)):
+            for emotion in frames_sentiments[i].keys():
+                emotion_totals[emotion] += frames_sentiments[i].get(emotion)
+            counter += 1
+
+        for emotion in emotion_totals:
+            emotion_totals[emotion] /= counter
+
+        video_sentiment_final.append(emotion_totals)
+
+        video_segment_sentiment = {key: value for key, value in emotion_totals.items() if value != 0.0}
+
+        segment_finals = {segment.id: (segment.text, segment.start, segment.end, transcript_segment_sentiment, video_segment_sentiment)}
+        final_output.append(segment_finals)
+
+    total_transcript_sentiment = {key: value for key, value in analyze_sentiment(transcript).items() if value >= 0.01}
+    print("Full Transcript Sentiments ... Done")
+
+    emotion_finals = {
+        'admiration': 0.0,
+        'amusement': 0.0,
+        'angry': 0.0,
+        'annoyance': 0.0,
+        'approval': 0.0,
+        'caring': 0.0,
+        'confusion': 0.0,
+        'curiosity': 0.0,
+        'desire': 0.0,
+        'disappointment': 0.0,
+        'disapproval': 0.0,
+        'disgust': 0.0,
+        'embarrassment': 0.0,
+        'excitement': 0.0,
+        'fear': 0.0,
+        'gratitude': 0.0,
+        'grief': 0.0,
+        'happy': 0.0,
+        'love': 0.0,
+        'nervousness': 0.0,
+        'optimism': 0.0,
+        'pride': 0.0,
+        'realization': 0.0,
+        'relief': 0.0,
+        'remorse': 0.0,
+        'sad': 0.0,
+        'surprise': 0.0,
+        'neutral': 0.0
+    }
+
+    for i in range(0, video_sentiment_final.__len__()-1):
+        for emotion in video_sentiment_final[i].keys():
+            emotion_finals[emotion] += video_sentiment_final[i].get(emotion)
+
+    for emotion in emotion_finals:
+        emotion_finals[emotion] /= video_sentiment_final.__len__()
+
+    emotion_finals = {key: value for key, value in emotion_finals.items() if value != 0.0}
+
+    print("Video Frame (Mapping & AVG.) ... Done")
+    print("\nProcessing Completed!!\n")
+
+    payload = {
+        'from': 'gradio',
+        'total_video_emotions': video_emotion_totals,
+        'emotions_final': emotion_finals,
+        'body_language': body_language,
+        'distraction_rate': distraction_rate,
+        'formatted_response': formatted_response,
+        'total_transcript_sentiment': total_transcript_sentiment
+    }
+
+    print(payload)
+
+    response = requests.post('http://127.0.0.1:5000/interview', json=payload)
+
+
+with gr.Blocks(theme=theme, css=".gradio-container { background: rgba(255, 255, 255, 0.2) !important; box-shadow: 0 8px 32px 0 rgba( 31, 38, 135, 0.37 ) !important; backdrop-filter: blur( 10px ) !important; -webkit-backdrop-filter: blur( 10px ) !important; border-radius: 10px !important; border: 1px solid rgba( 0, 0, 0, 0.5 ) !important;}") as Video:
+    input_video = gr.Video(sources=["upload", "webcam"], format='mp4')
+    input_video.stop_recording(fn=video_to_audio, inputs=input_video)

-if __name__ == "__main__":
-    demo.launch()
+Video.launch()
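
A minimal, self-contained sketch of the head-pose gate that extract_frames applies, for trying the ±7° yaw/pitch threshold in isolation: six FaceMesh landmarks are passed to cv2.solvePnP, the rotation is decomposed with cv2.RQDecomp3x3, and a frame counts as distracted when either angle leaves the band. The helper name is_distracted and the standalone FaceMesh instance are illustrative and not part of this commit; it assumes mediapipe and opencv-python are installed.

import cv2
import mediapipe as mp
import numpy as np

face_mesh = mp.solutions.face_mesh.FaceMesh(min_detection_confidence=0.5, min_tracking_confidence=0.5)

def is_distracted(frame_bgr, threshold_deg=7.0):
    """Return True when the estimated head pitch or yaw exceeds the threshold (illustrative helper)."""
    image = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
    results = face_mesh.process(image)
    if not results.multi_face_landmarks:
        return False  # no face detected: the commit's loop also leaves flag = False in this case

    h, w, _ = frame_bgr.shape
    face_2d, face_3d = [], []
    for idx, lm in enumerate(results.multi_face_landmarks[0].landmark):
        if idx in (33, 263, 1, 61, 291, 199):  # eye corners, nose tip, mouth corners, chin
            face_2d.append([lm.x * w, lm.y * h])
            face_3d.append([lm.x * w, lm.y * h, lm.z])

    face_2d = np.array(face_2d, dtype=np.float64)
    face_3d = np.array(face_3d, dtype=np.float64)
    # Camera matrix mirrors the commit: focal length set to the image width
    cam_matrix = np.array([[w, 0, h / 2],
                           [0, w, w / 2],
                           [0, 0, 1]], dtype=np.float64)
    dist_coeffs = np.zeros((4, 1), dtype=np.float64)
    ok, rot_vec, _ = cv2.solvePnP(face_3d, face_2d, cam_matrix, dist_coeffs)
    if not ok:
        return False
    rmat, _ = cv2.Rodrigues(rot_vec)
    angles, *_ = cv2.RQDecomp3x3(rmat)
    pitch, yaw = angles[0] * 360, angles[1] * 360  # same scaling the commit uses
    return abs(pitch) > threshold_deg or abs(yaw) > threshold_deg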
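
The speech-emotion step can likewise be exercised on its own: pydub pulls the audio track, exports an in-memory WAV, and the raw bytes are posted to the same hosted inference endpoint the commit uses. The helper name speech_emotions is illustrative; HF_API_KEY is assumed to be set in the environment, and the response is assumed to be the usual list of label/score objects.

import os
from io import BytesIO

import requests
from pydub import AudioSegment

AUDIO_API_URL = "https://api-inference.huggingface.co/models/ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
headers = {"Authorization": "Bearer " + os.getenv("HF_API_KEY", "")}

def speech_emotions(video_path):
    """Return {label: score} for the video's audio track (illustrative helper)."""
    audio = AudioSegment.from_file(video_path)      # pydub uses ffmpeg to read the container
    wav_bytes = audio.export(format="wav").read()   # in-memory WAV, no temp file on disk
    response = requests.post(AUDIO_API_URL, headers=headers, data=BytesIO(wav_bytes))
    response.raise_for_status()
    return {item["label"]: item["score"] for item in response.json()}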
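
And a small sketch of the transcription step: faster-whisper accepts a file-like object, so the same WAV bytes can be transcribed without touching disk. The generator name transcribe_wav is illustrative.

from io import BytesIO
from faster_whisper import WhisperModel

model = WhisperModel("small", device="cpu", compute_type="int8")

def transcribe_wav(wav_bytes, beam_size=5):
    """Yield (start, end, text) for each recognised segment (illustrative helper)."""
    # segments is a lazy generator; iterating it runs the actual decoding
    segments, info = model.transcribe(BytesIO(wav_bytes), beam_size=beam_size)
    for segment in segments:
        yield segment.start, segment.end, segment.text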