Adding all app files with examples
- app.py +52 -0
- examples/v_CricketShot_g01_c01.mp4 +0 -0
- examples/v_PlayingCello_g11_c03.mp4 +0 -0
- examples/v_Punch_g09_c07.mp4 +0 -0
- examples/v_ShavingBeard_g09_c03.mp4 +0 -0
- examples/v_TennisSwing_g10_c04.mp4 +0 -0
- requirements.txt +4 -0
- utils/constants.py +4 -0
- utils/custom_layers.py +67 -0
- utils/predict.py +102 -0
app.py
ADDED
@@ -0,0 +1,52 @@
import gradio as gr
from utils.predict import predict_action
import os
import glob

## Create Dataset for loading examples
example_list = glob.glob("examples/*")
example_list = list(map(lambda el: [el], example_list))


def load_example(video):
    return video[0]

demo = gr.Blocks()



with demo:

    gr.Markdown("# **<p align='center'>Video Classification with Transformers</p>**")
    gr.Markdown("This space demonstrates the use of hybrid Transformer-based models for video classification that operate on CNN feature maps.")

    with gr.Tabs():

        with gr.TabItem("Upload & Predict"):
            with gr.Box():

                with gr.Row():
                    input_video = gr.Video(label="Input Video", show_label=True)
                    output_label = gr.Label(label="Model Output", show_label=True)
                    output_gif = gr.Image(label="Video Gif", show_label=True)

            gr.Markdown("**Predict**")

            with gr.Box():
                with gr.Row():
                    submit_button = gr.Button("Submit")

            gr.Markdown("**Examples:**")
            gr.Markdown("The model is trained to classify videos belonging to the following classes:")
            gr.Markdown("CricketShot, PlayingCello, Punch, ShavingBeard, TennisSwing")

            with gr.Column():
                # gr.Examples("examples", [input_video], [output_label, output_gif], predict_action, cache_examples=True)
                examples = gr.components.Dataset(components=[input_video], samples=example_list, type='values')
                examples.click(load_example, examples, input_video)

    submit_button.click(predict_action, inputs=input_video, outputs=[output_label, output_gif])

    gr.Markdown('\n Author: <a href="https://www.linkedin.com/in/shivalika-singh/">Shivalika Singh</a> <br> Based on this <a href="https://keras.io/examples/vision/video_transformers/">Keras example</a> by <a href="https://twitter.com/RisingSayak">Sayak Paul</a> <br> Demo Powered by this <a href="https://huggingface.co/shivi/video-transformers/"> Video Classification</a> model')

demo.launch()
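The commented-out gr.Examples call in app.py hints at a simpler wiring than the manual Dataset-plus-click workaround. A minimal sketch of that alternative, assuming the Gradio 3.x API this Space targets (illustrative only, not part of the commit), would replace the contents of the gr.Column() block:

# Hypothetical alternative to the Dataset + click wiring above:
gr.Examples(
    examples=example_list,
    inputs=input_video,
    outputs=[output_label, output_gif],
    fn=predict_action,
    cache_examples=False,  # set True to precompute example predictions at startup
)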
examples/v_CricketShot_g01_c01.mp4
ADDED
Binary file (138 kB).
examples/v_PlayingCello_g11_c03.mp4
ADDED
Binary file (401 kB).
examples/v_Punch_g09_c07.mp4
ADDED
Binary file (705 kB).
examples/v_ShavingBeard_g09_c03.mp4
ADDED
Binary file (206 kB).
examples/v_TennisSwing_g10_c04.mp4
ADDED
Binary file (75.6 kB).
requirements.txt
ADDED
@@ -0,0 +1,4 @@
tensorflow
gradio
opencv-python
imageio
utils/constants.py
ADDED
@@ -0,0 +1,4 @@
MAX_SEQ_LENGTH = 20
NUM_FEATURES = 1024
IMG_SIZE = 128
CLASS_VOCAB = ['CricketShot', 'PlayingCello', 'Punch', 'ShavingBeard', 'TennisSwing']
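A quick sanity check on these values (illustrative only, not part of the commit): NUM_FEATURES matches the 1024-dimensional pooled output of DenseNet121, which utils/predict.py uses as the per-frame feature extractor.

from tensorflow import keras
from utils.constants import IMG_SIZE, NUM_FEATURES

# DenseNet121 with pooling="avg" yields one 1024-d vector per frame;
# weights=None keeps this check offline.
feature_extractor = keras.applications.DenseNet121(
    weights=None, include_top=False, pooling="avg",
    input_shape=(IMG_SIZE, IMG_SIZE, 3),
)
assert feature_extractor.output_shape[-1] == NUM_FEATURES  # 1024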
utils/custom_layers.py
ADDED
@@ -0,0 +1,67 @@
import tensorflow as tf
from tensorflow import keras
from keras import layers


class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=output_dim
        )
        self.sequence_length = sequence_length
        self.output_dim = output_dim

    def call(self, inputs):
        # The inputs are of shape: `(batch_size, frames, num_features)`
        length = tf.shape(inputs)[1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_positions = self.position_embeddings(positions)
        return inputs + embedded_positions

    def compute_mask(self, inputs, mask=None):
        mask = tf.reduce_any(tf.cast(inputs, "bool"), axis=-1)
        return mask

    def get_config(self):
        config = super().get_config()
        config.update({
            "sequence_length": self.sequence_length,
            "output_dim": self.output_dim,
        })
        return config


class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim, dropout=0.3
        )
        self.dense_proj = keras.Sequential(
            [layers.Dense(dense_dim, activation=tf.nn.gelu), layers.Dense(embed_dim)]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()

    def call(self, inputs, mask=None):
        if mask is not None:
            mask = mask[:, tf.newaxis, :]

        attention_output = self.attention(inputs, inputs, attention_mask=mask)
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "dense_dim": self.dense_dim,
            "num_heads": self.num_heads,
        })
        return config
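For context, a minimal sketch of how these two layers are typically assembled into the classifier in the Keras video-transformers example this Space is based on; the head hyperparameters (dense_dim, num_heads, dropout) are assumptions, not values read from the uploaded weights:

from tensorflow import keras
from keras import layers
from utils.custom_layers import PositionalEmbedding, TransformerEncoder
from utils.constants import MAX_SEQ_LENGTH, NUM_FEATURES, CLASS_VOCAB

def build_classifier(dense_dim=4, num_heads=1):
    # Sequences of per-frame CNN features go in; class probabilities come out.
    inputs = keras.Input(shape=(None, NUM_FEATURES))
    x = PositionalEmbedding(MAX_SEQ_LENGTH, NUM_FEATURES, name="frame_position_embedding")(inputs)
    x = TransformerEncoder(NUM_FEATURES, dense_dim, num_heads, name="transformer_layer")(x)
    x = layers.GlobalMaxPooling1D()(x)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(len(CLASS_VOCAB), activation="softmax")(x)
    return keras.Model(inputs, outputs)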
utils/predict.py
ADDED
@@ -0,0 +1,102 @@
from .custom_layers import TransformerEncoder, PositionalEmbedding
from .constants import MAX_SEQ_LENGTH, NUM_FEATURES, IMG_SIZE, CLASS_VOCAB
from huggingface_hub import from_pretrained_keras
from tensorflow import keras
from keras import layers
import numpy as np
import imageio
import cv2

model = from_pretrained_keras("shivi/video-classification", custom_objects={"PositionalEmbedding": PositionalEmbedding, "TransformerEncoder": TransformerEncoder})

# model = from_pretrained_keras("shivi/video-transformers")

def build_feature_extractor():
    feature_extractor = keras.applications.DenseNet121(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    preprocess_input = keras.applications.densenet.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)

    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")


feature_extractor = build_feature_extractor()


def crop_center(frame):
    center_crop_layer = layers.CenterCrop(IMG_SIZE, IMG_SIZE)
    cropped = center_crop_layer(frame[None, ...])
    cropped = cropped.numpy().squeeze()
    return cropped

def load_video(path, max_frames=0):
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center(frame)
            frame = frame[:, :, [2, 1, 0]]  # BGR -> RGB
            frames.append(frame)

            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)

def prepare_single_video(frames):
    frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    # Pad shorter videos.
    if len(frames) < MAX_SEQ_LENGTH:
        diff = MAX_SEQ_LENGTH - len(frames)
        padding = np.zeros((diff, IMG_SIZE, IMG_SIZE, 3))
        frames = np.concatenate([frames, padding])

    frames = frames[None, ...]

    # Extract features from the frames of the current video.
    for i, batch in enumerate(frames):
        video_length = batch.shape[0]
        length = min(MAX_SEQ_LENGTH, video_length)
        for j in range(length):
            if np.mean(batch[j, :]) > 0.0:
                frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :])
            else:
                frame_features[i, j, :] = 0.0

    return frame_features


def predict_action(path):
    frames = load_video(path)
    frame_features = prepare_single_video(frames)
    probabilities = model.predict(frame_features)[0]
    confidences = {}

    for i in np.argsort(probabilities)[::-1]:
        confidences[CLASS_VOCAB[i]] = float(probabilities[i])

    gif_out = to_gif(frames[:MAX_SEQ_LENGTH])
    # gif_out = gen_moviepy_gif(path, start_seconds, end_seconds)

    print(confidences)
    return confidences, gif_out


def to_gif(images):
    converted_images = images.astype(np.uint8)
    imageio.mimsave("animation.gif", converted_images, fps=10)
    return "animation.gif"
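For a quick local check of this pipeline outside Gradio (assuming the model download from the Hub succeeds; the path below is one of the example clips added in this commit):

from utils.predict import predict_action

# Runs the full pipeline: frame loading and center-cropping, DenseNet feature
# extraction, Transformer classification, and GIF rendering.
confidences, gif_path = predict_action("examples/v_TennisSwing_g10_c04.mp4")
print(confidences)  # class -> probability, highest first
print(gif_path)     # "animation.gif"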