Upload main files
- AudioClassifier.py +54 -0
- FacePosition.py +82 -0
- app.py +39 -0
- cursor_movement_model.pkl +3 -0
AudioClassifier.py
ADDED
@@ -0,0 +1,54 @@
"""
AudioClassifier class

Author: HenryAreiza
Date: 08/09/2023
"""

from scipy.io import wavfile
from scipy.signal import decimate
from transformers import pipeline


class AudioClassifier:
    """
    A class for classifying audio commands using a pre-trained model.

    This class provides functionality for classifying audio commands based on
    a pre-trained audio classification model.

    Attributes:
        vocab (list): Vocabulary of valid commands.
        pipe: The Hugging Face Transformers pipeline for audio classification.
    """

    def __init__(self):
        """
        Initializes the AudioClassifier class.
        """
        self.vocab = ["left", "right", "up", "down", "go", "follow",
                      "on", "off", "one", "two", "three", "stop"]

        # Load the audio classification pipeline
        self.pipe = pipeline("audio-classification", model="0xb1/wav2vec2-base-finetuned-speech_commands-v0.02")

    def predict(self, audio_path):
        """
        Classify an audio recording into a command label.

        Args:
            audio_path (str): Path to the input audio file.

        Returns:
            result (str): The classified command label.
        """
        # Read the recording and downsample it by a factor of 3
        # (presumably 48 kHz microphone audio -> the 16 kHz the model expects)
        _, audio = wavfile.read(audio_path)
        audio = decimate(audio, 3)

        # Keep the top-scoring label from the pipeline
        result = self.pipe(audio)[0]["label"]

        # Map out-of-vocabulary predictions to 'unknown'
        if result not in self.vocab:
            result = 'unknown'

        return result
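A minimal usage sketch for the class above (the file name command.wav is a placeholder; predict() expects a path to a WAV recording, presumably captured at 48 kHz given the fixed decimation factor of 3):

from AudioClassifier import AudioClassifier

classifier = AudioClassifier()
label = classifier.predict("command.wav")  # hypothetical recording of e.g. "go"
print(label)                               # a vocab entry, or 'unknown'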
FacePosition.py
ADDED
@@ -0,0 +1,82 @@
"""
FacePosition class

Author: HenryAreiza
Date: 08/09/2023
"""

import os
import cv2
import pickle
import numpy as np
import mediapipe as mp


class FacePosition:
    """
    A class for controlling the cursor based on head movements.

    This class provides functionality for detecting a face using
    the MediaPipe library and controlling the cursor's movement accordingly.

    Attributes:
        movement (list): List of classes corresponding to the predicted movement.
        images (list): List of images associated with each class.
        cursor_model: The machine learning model for gesture prediction.
        face_detection: The MediaPipe Face Detection component.
    """

    def __init__(self):
        """
        Initializes the FacePosition class.
        """
        self.movement = ['Center', 'Up', 'Right/Up', 'Right', 'Right/Down', 'Down', 'Left/Down', 'Left', 'Left/Up']
        self.images = [cv2.imread(os.path.join('media', str(i)+'.png')) for i in range(9)]

        # Load the cursor movement model
        with open('cursor_movement_model.pkl', 'rb') as f:
            self.cursor_model = pickle.load(f)

        # Initialize the MediaPipe Face Detection component
        self.face_detection = mp.solutions.face_detection.FaceDetection(min_detection_confidence=0.5)

    def predict(self, frame):
        """
        Predict the cursor movement class from the head position.

        Args:
            frame (numpy.ndarray): Input RGB frame from the webcam.

        Returns:
            result (list): The predicted class image and label.
        """
        # Perform face detection
        results = self.face_detection.process(frame)

        # Read the reference and landmarks from the detected face
        if results.detections:
            for detection in results.detections:
                reference = [[detection.location_data.relative_bounding_box.xmin,
                              detection.location_data.relative_bounding_box.ymin],
                             [detection.location_data.relative_bounding_box.width,
                              detection.location_data.relative_bounding_box.height]]
                keypoints = []
                for key_point in detection.location_data.relative_keypoints:
                    keypoints.append([key_point.x, key_point.y])
                break

            # Transform the lists into numpy arrays
            reference = np.array(reference)
            keypoints = np.array(keypoints)

            # Remove the offset from the keypoints and scale by the box size
            keypoints = (keypoints - reference[0]) / reference[1]

            # Recognize the head position
            prediction = self.cursor_model.predict(keypoints.reshape((1, -1)))[0]

            return [self.images[prediction], self.movement[prediction]]

        else:
            return [self.images[0], self.movement[0]]
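A minimal sketch of driving the class above from a webcam outside Gradio (camera index 0 is an assumption; OpenCV captures BGR frames, so the frame is converted to the RGB layout that gr.Image delivers and MediaPipe expects):

import cv2
from FacePosition import FacePosition

controller = FacePosition()
cap = cv2.VideoCapture(0)  # default webcam (assumption)
ok, frame_bgr = cap.read()
cap.release()

if ok:
    frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
    image, label = controller.predict(frame_rgb)
    print(label)  # e.g. 'Center', 'Up', 'Right/Down', ...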
app.py
ADDED
@@ -0,0 +1,39 @@
import gradio as gr
from FacePosition import FacePosition
from AudioClassifier import AudioClassifier


# Create an instance of the FacePosition class
movement_controller = FacePosition()

cursor_movement = gr.Interface(
    fn=movement_controller.predict,
    inputs=gr.Image(source='webcam', streaming=True),
    outputs=['image', 'text'],
    live=True,
    title='Cursor movement controller',
    description="This space provides functionality for detecting a face using the MediaPipe library and controlling the cursor's movement accordingly."
)


# Create an instance of the AudioClassifier class
audio_classifier = AudioClassifier()

audio_commands = gr.Interface(
    fn=audio_classifier.predict,
    inputs=gr.Audio(source="microphone", type="filepath", streaming=True),
    outputs="text",
    live=True,
    title='Speech commands recognition (mouse actions)',
    description='This space provides functionality for classifying audio commands associated with mouse actions, based on a pre-trained audio classification model.'
)


demo = gr.TabbedInterface([cursor_movement, audio_commands],
                          title='Hands-free Cursor Application',
                          tab_names=['Cursor movement controller', 'Speech commands recognition'],
                          theme=gr.themes.Soft())


if __name__ == "__main__":
    demo.launch()
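Note that the Space itself only displays the predicted image/label and the recognized command; it does not move a real cursor. A client-side sketch of how the labels could drive the OS cursor, using pyautogui (not part of this commit; the step size is a tuning assumption):

import pyautogui  # not used by this Space; a client-side assumption

STEP = 20  # pixels moved per prediction (tuning assumption)
OFFSETS = {
    'Center': (0, 0), 'Up': (0, -STEP), 'Right/Up': (STEP, -STEP),
    'Right': (STEP, 0), 'Right/Down': (STEP, STEP), 'Down': (0, STEP),
    'Left/Down': (-STEP, STEP), 'Left': (-STEP, 0), 'Left/Up': (-STEP, -STEP),
}

def apply_movement(label):
    """Translate a FacePosition label into relative cursor motion."""
    dx, dy = OFFSETS.get(label, (0, 0))
    pyautogui.moveRel(dx, dy)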
cursor_movement_model.pkl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3751a31fbe1163000ffc0ae0e230430475ad150412947a16aef3ebdfb6792d4d
size 1696
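The pickle is stored via Git LFS, so only this pointer appears in the diff. From its use in FacePosition.predict(), it behaves like a scikit-learn-style classifier over a flattened vector of the six normalized MediaPipe keypoints (12 features) with integer classes 0 through 8, and the small file size (1696 bytes) is consistent with a simple estimator. A sketch of how a compatible replacement could be trained (the estimator choice and the placeholder data are assumptions; the original training data is not in this commit):

import pickle
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

# Placeholder data: real training would use normalized keypoint vectors
# collected for each of the 9 head positions in FacePosition.movement.
X = np.random.rand(90, 12)       # 6 keypoints x (x, y) = 12 features
y = np.repeat(np.arange(9), 10)  # class indices 0..8

model = KNeighborsClassifier(n_neighbors=3).fit(X, y)
with open('cursor_movement_model.pkl', 'wb') as f:
    pickle.dump(model, f)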