Adding all app files with examples
- app.py +52 -0
- examples/v_CricketShot_g01_c01.mp4 +0 -0
- examples/v_PlayingCello_g11_c03.mp4 +0 -0
- examples/v_Punch_g09_c07.mp4 +0 -0
- examples/v_ShavingBeard_g09_c03.mp4 +0 -0
- examples/v_TennisSwing_g10_c04.mp4 +0 -0
- requirements.txt +4 -0
- utils/constants.py +4 -0
- utils/custom_layers.py +67 -0
- utils/predict.py +102 -0
app.py
ADDED
@@ -0,0 +1,52 @@
import gradio as gr
from utils.predict import predict_action
import os
import glob

## Create Dataset for loading examples
example_list = glob.glob("examples/*")
example_list = list(map(lambda el: [el], example_list))


def load_example(video):
    return video[0]

demo = gr.Blocks()



with demo:

    gr.Markdown("# **<p align='center'>Video Classification with Transformers</p>**")
    gr.Markdown("This space demonstrates the use of hybrid Transformer-based models for video classification that operate on CNN feature maps.")

    with gr.Tabs():

        with gr.TabItem("Upload & Predict"):
            with gr.Box():

                with gr.Row():
                    input_video = gr.Video(label="Input Video", show_label=True)
                    output_label = gr.Label(label="Model Output", show_label=True)
                    output_gif = gr.Image(label="Video Gif", show_label=True)

            gr.Markdown("**Predict**")

            with gr.Box():
                with gr.Row():
                    submit_button = gr.Button("Submit")

            gr.Markdown("**Examples:**")
            gr.Markdown("The model is trained to classify videos belonging to the following classes:")
            gr.Markdown("CricketShot, PlayingCello, Punch, ShavingBeard, TennisSwing")

            with gr.Column():
                # gr.Examples("examples", [input_video], [output_label, output_gif], predict_action, cache_examples=True)
                examples = gr.components.Dataset(components=[input_video], samples=example_list, type='values')
                examples.click(load_example, examples, input_video)

    submit_button.click(predict_action, inputs=input_video, outputs=[output_label, output_gif])

    gr.Markdown('\n Author: <a href="https://www.linkedin.com/in/shivalika-singh/">Shivalika Singh</a> <br> Based on this <a href="https://keras.io/examples/vision/video_transformers/">Keras example</a> by <a href="https://twitter.com/RisingSayak">Sayak Paul</a> <br> Demo Powered by this <a href="https://huggingface.co/shivi/video-transformers/"> Video Classification</a> model')

demo.launch()
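The commented-out gr.Examples call in app.py hints at a simpler wiring than the manual Dataset-plus-click workaround. A minimal sketch of that alternative, assuming the Gradio 3.x API this Space targets (illustrative only, not part of the commit), would replace the contents of the gr.Column() block:

# Hypothetical alternative to the Dataset + click wiring above:
gr.Examples(
    examples=example_list,
    inputs=input_video,
    outputs=[output_label, output_gif],
    fn=predict_action,
    cache_examples=False,  # set True to precompute example predictions at startup
)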
examples/v_CricketShot_g01_c01.mp4
ADDED
Binary file (138 kB).
examples/v_PlayingCello_g11_c03.mp4
ADDED
Binary file (401 kB).
examples/v_Punch_g09_c07.mp4
ADDED
Binary file (705 kB).
examples/v_ShavingBeard_g09_c03.mp4
ADDED
Binary file (206 kB).
examples/v_TennisSwing_g10_c04.mp4
ADDED
Binary file (75.6 kB).
requirements.txt
ADDED
@@ -0,0 +1,4 @@
tensorflow
gradio
opencv-python
imageio
utils/constants.py
ADDED
@@ -0,0 +1,4 @@
MAX_SEQ_LENGTH = 20
NUM_FEATURES = 1024
IMG_SIZE = 128
CLASS_VOCAB = ['CricketShot', 'PlayingCello', 'Punch', 'ShavingBeard', 'TennisSwing']
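A quick sanity check on these values (illustrative only, not part of the commit): NUM_FEATURES matches the 1024-dimensional pooled output of DenseNet121, which utils/predict.py uses as the per-frame feature extractor.

from tensorflow import keras
from utils.constants import IMG_SIZE, NUM_FEATURES

# DenseNet121 with pooling="avg" yields one 1024-d vector per frame;
# weights=None keeps this check offline.
feature_extractor = keras.applications.DenseNet121(
    weights=None, include_top=False, pooling="avg",
    input_shape=(IMG_SIZE, IMG_SIZE, 3),
)
assert feature_extractor.output_shape[-1] == NUM_FEATURES  # 1024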
utils/custom_layers.py
ADDED
@@ -0,0 +1,67 @@
import tensorflow as tf
from tensorflow import keras
from keras import layers


class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim
        )
        self.sequence_length = sequence_length
        self.output_dim = output_dim

    def call(self, inputs):
        # The inputs are of shape: `(batch_size, frames, num_features)`
        length = tf.shape(inputs)[1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_positions = self.position_embeddings(positions)
        return inputs + embedded_positions

    def compute_mask(self, inputs, mask=None):
        mask = tf.reduce_any(tf.cast(inputs, "bool"), axis=-1)
        return mask

    def get_config(self):
        config = super().get_config()
        config.update({
            "sequence_length": self.sequence_length,
            "output_dim": self.output_dim,
        })
        return config


class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.3
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation=tf.nn.gelu), layers.Dense(embed_dim)]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        if mask is not None:
            mask = mask[:, tf.newaxis, :]

        attention_output = self.attention(inputs, inputs, attention_mask=mask)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "dense_dim": self.dense_dim,
            "num_heads": self.num_heads,
        })
        return config
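For context, a minimal sketch of how these two layers are typically assembled into the classifier in the Keras video-transformers example this Space is based on; the head hyperparameters (dense_dim, num_heads, dropout) are assumptions, not values read from the uploaded weights:

from tensorflow import keras
from keras import layers
from utils.custom_layers import PositionalEmbedding, TransformerEncoder
from utils.constants import MAX_SEQ_LENGTH, NUM_FEATURES, CLASS_VOCAB

def build_classifier(dense_dim=4, num_heads=1):
    # Sequences of per-frame CNN features go in; class probabilities come out.
    inputs = keras.Input(shape=(None, NUM_FEATURES))
    x = PositionalEmbedding(MAX_SEQ_LENGTH, NUM_FEATURES, name="frame_position_embedding")(inputs)
    x = TransformerEncoder(NUM_FEATURES, dense_dim, num_heads, name="transformer_layer")(x)
    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(len(CLASS_VOCAB), activation="softmax")(x)
    return keras.Model(inputs, outputs)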
utils/predict.py
ADDED
@@ -0,0 +1,102 @@
from .custom_layers import TransformerEncoder, PositionalEmbedding
from .constants import MAX_SEQ_LENGTH, NUM_FEATURES, IMG_SIZE, CLASS_VOCAB
from huggingface_hub import from_pretrained_keras
from tensorflow import keras
from keras import layers
import numpy as np
import imageio
import cv2

model = from_pretrained_keras("shivi/video-classification", custom_objects={"PositionalEmbedding": PositionalEmbedding, "TransformerEncoder": TransformerEncoder})

# model = from_pretrained_keras("shivi/video-transformers")

def build_feature_extractor():
    feature_extractor = keras.applications.DenseNet121(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    preprocess_input = keras.applications.densenet.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)

    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")


feature_extractor = build_feature_extractor()


def crop_center(frame):
    center_crop_layer = layers.CenterCrop(IMG_SIZE, IMG_SIZE)
    cropped = center_crop_layer(frame[None, ...])
    cropped = cropped.numpy().squeeze()
    return cropped

def load_video(path, max_frames=0):
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center(frame)
            frame = frame[:, :, [2, 1, 0]]  # BGR -> RGB
            frames.append(frame)

            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)

def prepare_single_video(frames):
    frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    # Pad shorter videos.
    if len(frames) < MAX_SEQ_LENGTH:
        diff = MAX_SEQ_LENGTH - len(frames)
        padding = np.zeros((diff, IMG_SIZE, IMG_SIZE, 3))
        frames = np.concatenate([frames, padding])

    frames = frames[None, ...]

    # Extract features from the frames of the current video.
    for i, batch in enumerate(frames):
        video_length = batch.shape[0]
        length = min(MAX_SEQ_LENGTH, video_length)
        for j in range(length):
            if np.mean(batch[j, :]) > 0.0:
                frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :])
            else:
                frame_features[i, j, :] = 0.0

    return frame_features


def predict_action(path):
    frames = load_video(path)
    frame_features = prepare_single_video(frames)
    probabilities = model.predict(frame_features)[0]
    confidences = {}

    for i in np.argsort(probabilities)[::-1]:
        confidences[CLASS_VOCAB[i]] = float(probabilities[i])

    gif_out = to_gif(frames[:MAX_SEQ_LENGTH])
    # gif_out = gen_moviepy_gif(path, start_seconds, end_seconds)

    print(confidences)
    return confidences, gif_out


def to_gif(images):
    converted_images = images.astype(np.uint8)
    imageio.mimsave("animation.gif", converted_images, fps=10)
    return "animation.gif"
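For a quick local check of this pipeline outside Gradio (assuming the model download from the Hub succeeds; the path below is one of the example clips added in this commit):

from utils.predict import predict_action

# Runs the full pipeline: frame loading and center-cropping, DenseNet feature
# extraction, Transformer classification, and GIF rendering.
confidences, gif_path = predict_action("examples/v_TennisSwing_g10_c04.mp4")
print(confidences)  # class -> probability, highest first
print(gif_path)     # "animation.gif"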