ParthCodes committed
Update app.py
app.py
CHANGED
@@ -1,22 +1,323 @@
+import math
+import os
+from io import BytesIO
 import gradio as gr
 import cv2
+from PIL import Image
+import requests
+from transformers import pipeline
+from pydub import AudioSegment
+from faster_whisper import WhisperModel
+import joblib
+import mediapipe as mp
+import numpy as np
+import pandas as pd
 import moviepy.editor as mpe
-from moviepy.editor import VideoFileClip
 
-
-
-
+theme = gr.themes.Base(
+    primary_hue="cyan",
+    secondary_hue="blue",
+    neutral_hue="slate",
+)
+
+model = WhisperModel("small", device="cpu", compute_type="int8")
+
+body_lang_model = joblib.load('body_language.pkl')
+
+mp_holistic = mp.solutions.holistic
+holistic = mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5)
+
+mp_face_mesh = mp.solutions.face_mesh
+face_mesh = mp_face_mesh.FaceMesh(min_detection_confidence=0.5, min_tracking_confidence=0.5)
+
+API_KEY = os.getenv('HF_API_KEY')
+
+pipe1 = pipeline("image-classification", model="dima806/facial_emotions_image_detection")
+pipe2 = pipeline("text-classification", model="SamLowe/roberta-base-go_emotions")
+AUDIO_API_URL = "https://api-inference.huggingface.co/models/ehcalabres/wav2vec2-lg-xlsr-en-speech-emotion-recognition"
+headers = {"Authorization": "Bearer " + API_KEY + ""}
+
+def extract_frames(video_path):
     clip = mpe.VideoFileClip(video_path)
     clip.write_videofile('mp4file.mp4', fps=60)
-
+
     cap = cv2.VideoCapture('mp4file.mp4')
     fps = int(cap.get(cv2.CAP_PROP_FPS))
     total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
     interval = int(fps/2)
     print(interval, total_frames)
-    return interval, total_frames
 
-
+    result = []
+    distract_count = 0
+    total_count = 0
+    output_list = []
+
+    for i in range(0, total_frames, interval):
+        total_count += 1
+        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
+        ret, frame = cap.read()
+
+        if ret:
+            image = cv2.cvtColor(cv2.flip(frame, 1), cv2.COLOR_BGR2RGB)
+            image.flags.writeable = False
+            results = face_mesh.process(image)
+            image.flags.writeable = True
+            image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
+
+            img_h, img_w, img_c = image.shape
+            face_3d = []
+            face_2d = []
+
+            flag = False
+
+            if results.multi_face_landmarks:
+                for face_landmarks in results.multi_face_landmarks:
+                    for idx, lm in enumerate(face_landmarks.landmark):
+                        if idx == 33 or idx == 263 or idx == 1 or idx == 61 or idx == 291 or idx == 199:
+                            if idx == 1:
+                                nose_2d = (lm.x * img_w, lm.y * img_h)
+                                nose_3d = (lm.x * img_w, lm.y * img_h, lm.z * 3000)
+
+                            x, y = int(lm.x * img_w), int(lm.y * img_h)
+                            face_2d.append([x, y])
+                            face_3d.append([x, y, lm.z])
+                    face_2d = np.array(face_2d, dtype=np.float64)
+                    face_3d = np.array(face_3d, dtype=np.float64)
+                    focal_length = 1 * img_w
+                    cam_matrix = np.array([ [focal_length, 0, img_h / 2],
+                                            [0, focal_length, img_w / 2],
+                                            [0, 0, 1]])
+                    dist_matrix = np.zeros((4, 1), dtype=np.float64)
+                    success, rot_vec, trans_vec = cv2.solvePnP(face_3d, face_2d, cam_matrix, dist_matrix)
+                    rmat, jac = cv2.Rodrigues(rot_vec)
+                    angles, mtxR, mtxQ, Qx, Qy, Qz = cv2.RQDecomp3x3(rmat)
+                    x = angles[0] * 360
+                    y = angles[1] * 360
+                    z = angles[2] * 360
+
+                    if y < -7 or y > 7 or x < -7 or x > 7:
+                        flag = True
+                    else:
+                        flag = False
+
+            if flag == True:
+                distract_count += 1
+
+            image2 = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
+            results2 = holistic.process(image2)
+
+            pose = results2.pose_landmarks.landmark
+            pose_row = list(np.array([[landmark.x, landmark.y, landmark.z, landmark.visibility] for landmark in pose]).flatten())
+
+            face = results2.face_landmarks.landmark
+            face_row = list(np.array([[landmark.x, landmark.y, landmark.z, landmark.visibility] for landmark in face]).flatten())
+
+            row = pose_row+face_row
+
+            X = pd.DataFrame([row])
+            body_language_class = body_lang_model.predict(X)[0]
+            body_language_prob = body_lang_model.predict_proba(X)[0]
+
+            output_dict = {}
+            for class_name, prob in zip(body_lang_model.classes_, body_language_prob):
+                output_dict[class_name] = prob
+
+            output_list.append(output_dict)
+
+            pil_image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
+            response = pipe1(pil_image)
+
+            temp = {}
+            for ele in response:
+                label, score = ele.values()
+                temp[label] = score
+            result.append(temp)
+
+    distraction_rate = distract_count/total_count
+
+    total_bad_prob = 0
+    total_good_prob = 0
+
+    for output_dict in output_list:
+        total_bad_prob += output_dict['Bad']
+        total_good_prob += output_dict['Good']
+
+    num_frames = len(output_list)
+    avg_bad_prob = total_bad_prob / num_frames
+    avg_good_prob = total_good_prob / num_frames
+
+    final_output = {'Bad': avg_bad_prob, 'Good': avg_good_prob}
+
+    cap.release()
+
+    video_emotion_totals = {}
+    emotion_totals = { 'admiration': 0.0, 'amusement': 0.0, 'angry': 0.0, 'annoyance': 0.0, 'approval': 0.0, 'caring': 0.0, 'confusion': 0.0, 'curiosity': 0.0, 'desire': 0.0, 'disappointment': 0.0, 'disapproval': 0.0, 'disgust': 0.0, 'embarrassment': 0.0, 'excitement': 0.0, 'fear': 0.0, 'gratitude': 0.0, 'grief': 0.0, 'happy': 0.0, 'love': 0.0, 'nervousness': 0.0, 'optimism': 0.0, 'pride': 0.0, 'realization': 0.0, 'relief': 0.0, 'remorse': 0.0, 'sad': 0.0, 'surprise': 0.0, 'neutral': 0.0 }
+    counter = 0
+    for ele in result:
+        for emotion in ele.keys():
+            emotion_totals[emotion] += ele.get(emotion)
+        counter += 1
+
+    for emotion in emotion_totals:
+        emotion_totals[emotion] /= counter
+        if (emotion_totals[emotion]) > 0.0:
+            video_emotion_totals[emotion] = emotion_totals[emotion]
+
+    return video_emotion_totals, result, final_output, distraction_rate
+
+
+def analyze_sentiment(text):
+    response = pipe2(text)
+    sentiment_results = {}
+    for ele in response:
+        label, score = ele.values()
+        sentiment_results[label] = score
+    return sentiment_results
+
+
+def video_to_audio(input_video):
+
+    video_emotion_totals, frames_sentiments, body_language, distraction_rate = extract_frames(input_video)
+    print("Total Video Emotions ... Done")
+    print("Video Frame Sentiment ... Done")
+    print("Body Language ... Done")
+    print("Distraction Rate ... Done")
+
+    cap = cv2.VideoCapture(input_video)
+    fps = int(cap.get(cv2.CAP_PROP_FPS))
+    audio = AudioSegment.from_file(input_video)
+    audio_binary = audio.export(format="wav").read()
+    audio_bytesio = BytesIO(audio_binary)
+    audio_bytesio2 = BytesIO(audio_binary)
+
+    response = requests.post(AUDIO_API_URL, headers=headers, data=audio_bytesio)
+    formatted_response = {}
+    for ele in response.json():
+        score, label = ele.values()
+        formatted_response[label] = score
+
+    print("Speech Sentiments ... Done")
+
+    segments, info = model.transcribe(audio_bytesio2, beam_size=5)
+
+    transcript = ''
+    video_sentiment_final = []
+    final_output = []
+
+    for segment in segments:
+        transcript = transcript + segment.text + " "
+        transcript_segment_sentiment = analyze_sentiment(segment.text)
+
+        emotion_totals = {
+            'admiration': 0.0,
+            'amusement': 0.0,
+            'angry': 0.0,
+            'annoyance': 0.0,
+            'approval': 0.0,
+            'caring': 0.0,
+            'confusion': 0.0,
+            'curiosity': 0.0,
+            'desire': 0.0,
+            'disappointment': 0.0,
+            'disapproval': 0.0,
+            'disgust': 0.0,
+            'embarrassment': 0.0,
+            'excitement': 0.0,
+            'fear': 0.0,
+            'gratitude': 0.0,
+            'grief': 0.0,
+            'happy': 0.0,
+            'love': 0.0,
+            'nervousness': 0.0,
+            'optimism': 0.0,
+            'pride': 0.0,
+            'realization': 0.0,
+            'relief': 0.0,
+            'remorse': 0.0,
+            'sad': 0.0,
+            'surprise': 0.0,
+            'neutral': 0.0
+        }
+
+        counter = 0
+        for i in range(math.ceil(segment.start), math.floor(segment.end)):
+            for emotion in frames_sentiments[i].keys():
+                emotion_totals[emotion] += frames_sentiments[i].get(emotion)
+            counter += 1
+
+        for emotion in emotion_totals:
+            emotion_totals[emotion] /= counter
+
+        video_sentiment_final.append(emotion_totals)
+
+        video_segment_sentiment = {key: value for key, value in emotion_totals.items() if value != 0.0}
+
+        segment_finals = {segment.id: (segment.text, segment.start, segment.end, transcript_segment_sentiment, video_segment_sentiment)}
+        final_output.append(segment_finals)
+
+    total_transcript_sentiment = {key: value for key, value in analyze_sentiment(transcript).items() if value >= 0.01}
+    print("Full Transcript Sentiments ... Done")
+
+    emotion_finals = {
+        'admiration': 0.0,
+        'amusement': 0.0,
+        'angry': 0.0,
+        'annoyance': 0.0,
+        'approval': 0.0,
+        'caring': 0.0,
+        'confusion': 0.0,
+        'curiosity': 0.0,
+        'desire': 0.0,
+        'disappointment': 0.0,
+        'disapproval': 0.0,
+        'disgust': 0.0,
+        'embarrassment': 0.0,
+        'excitement': 0.0,
+        'fear': 0.0,
+        'gratitude': 0.0,
+        'grief': 0.0,
+        'happy': 0.0,
+        'love': 0.0,
+        'nervousness': 0.0,
+        'optimism': 0.0,
+        'pride': 0.0,
+        'realization': 0.0,
+        'relief': 0.0,
+        'remorse': 0.0,
+        'sad': 0.0,
+        'surprise': 0.0,
+        'neutral': 0.0
+    }
+
+    for i in range(0, video_sentiment_final.__len__()-1):
+        for emotion in video_sentiment_final[i].keys():
+            emotion_finals[emotion] += video_sentiment_final[i].get(emotion)
+
+    for emotion in emotion_finals:
+        emotion_finals[emotion] /= video_sentiment_final.__len__()
+
+    emotion_finals = {key: value for key, value in emotion_finals.items() if value != 0.0}
+
+    print("Video Frame (Mapping & AVG.) ... Done")
+    print("\nProcessing Completed!!\n")
+
+    payload = {
+        'from': 'gradio',
+        'total_video_emotions': video_emotion_totals,
+        'emotions_final': emotion_finals,
+        'body_language': body_language,
+        'distraction_rate': distraction_rate,
+        'formatted_response': formatted_response,
+        'total_transcript_sentiment': total_transcript_sentiment
+    }
+
+    print(payload)
+
+    response = requests.post('http://127.0.0.1:5000/interview', json=payload)
+
+
+with gr.Blocks(theme=theme, css=".gradio-container { background: rgba(255, 255, 255, 0.2) !important; box-shadow: 0 8px 32px 0 rgba( 31, 38, 135, 0.37 ) !important; backdrop-filter: blur( 10px ) !important; -webkit-backdrop-filter: blur( 10px ) !important; border-radius: 10px !important; border: 1px solid rgba( 0, 0, 0, 0.5 ) !important;}") as Video:
+    input_video = gr.Video(sources=["upload", "webcam"], format='mp4')
+    input_video.stop_recording(fn=video_to_audio, inputs=input_video)
 
-
-demo.launch()
+Video.launch()
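The least obvious part of this change is the distraction check inside extract_frames(): six FaceMesh landmarks are fed to cv2.solvePnP, the rotation vector is converted with cv2.Rodrigues, cv2.RQDecomp3x3 yields Euler-style angles, and a frame is counted as distracted when the scaled pitch or yaw leaves a ±7 band. Below is the same logic condensed into a standalone helper for clarity; the landmark indices, camera matrix, and threshold come from the committed code, while the helper name and the refactoring are only illustrative, not part of the commit.

# Condensed restatement of the head-pose check used in extract_frames().
# The function name and structure are illustrative, not part of the commit.
import cv2
import numpy as np

LANDMARK_IDS = (33, 263, 1, 61, 291, 199)  # eye corners, nose tip, mouth corners, chin

def is_distracted(face_landmarks, img_w, img_h, threshold=7.0):
    """Return True when the head is turned more than `threshold` (scaled units) off-center."""
    face_2d, face_3d = [], []
    for idx in LANDMARK_IDS:
        lm = face_landmarks.landmark[idx]
        x, y = int(lm.x * img_w), int(lm.y * img_h)
        face_2d.append([x, y])
        face_3d.append([x, y, lm.z])
    face_2d = np.array(face_2d, dtype=np.float64)
    face_3d = np.array(face_3d, dtype=np.float64)

    # Same pinhole camera approximation as the committed code.
    focal_length = img_w
    cam_matrix = np.array([[focal_length, 0, img_h / 2],
                           [0, focal_length, img_w / 2],
                           [0, 0, 1]])
    dist_matrix = np.zeros((4, 1), dtype=np.float64)

    # Solve the 2D-3D correspondence, then recover pitch (x) and yaw (y).
    _, rot_vec, _ = cv2.solvePnP(face_3d, face_2d, cam_matrix, dist_matrix)
    rmat, _ = cv2.Rodrigues(rot_vec)
    angles, *_ = cv2.RQDecomp3x3(rmat)
    pitch, yaw = angles[0] * 360, angles[1] * 360
    return abs(pitch) > threshold or abs(yaw) > threshold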
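video_to_audio() ends by POSTing its aggregated results to http://127.0.0.1:5000/interview, a backend that is not included in this commit. A minimal sketch of a receiver for that request, assuming a Flask app listening on port 5000; the endpoint implementation is hypothetical, only the field names are taken from the payload built above.

# Hypothetical receiver for the payload POSTed by video_to_audio().
# The real backend is not part of this Space; this only illustrates the request shape.
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/interview', methods=['POST'])
def interview():
    data = request.get_json()
    # Keys mirror the payload dict built at the end of video_to_audio().
    print(data.get('from'))                        # 'gradio'
    print(data.get('distraction_rate'))            # fraction of sampled frames flagged as distracted
    print(data.get('body_language'))               # {'Bad': avg_prob, 'Good': avg_prob}
    print(data.get('total_video_emotions'))        # averaged per-frame facial emotion scores
    print(data.get('total_transcript_sentiment'))  # go_emotions scores for the full transcript
    return jsonify({'status': 'received'})

if __name__ == '__main__':
    app.run(port=5000)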