#from .custom_layers import TransformerEncoder, PositionalEmbedding
from .constants import MAX_SEQ_LENGTH, NUM_FEATURES, IMG_SIZE, CLASS_VOCAB
from huggingface_hub import from_pretrained_keras
from tensorflow import keras
from keras import layers
import numpy as np
import imageio
import cv2

#model = from_pretrained_keras("shivi/video-classification", custom_objects={"PositionalEmbedding": PositionalEmbedding, "TransformerEncoder": TransformerEncoder})
model = from_pretrained_keras("keras-io/video-transformers")
""" | |
Below code is taken from the Video-Transformers example on keras-io by Sayak Paul | |
""" | |
def build_feature_extractor():
    # DenseNet121 pretrained on ImageNet, with global average pooling, serves as
    # the per-frame feature extractor.
    feature_extractor = keras.applications.DenseNet121(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    preprocess_input = keras.applications.densenet.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)
    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")


feature_extractor = build_feature_extractor()
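
# Quick sanity-check sketch (not part of the original app): the extractor maps a
# single frame to a NUM_FEATURES-dimensional vector (1024 for DenseNet121 with
# average pooling), which is the per-frame representation fed to the transformer.
#
#   dummy_frame = np.zeros((1, IMG_SIZE, IMG_SIZE, 3), dtype="float32")
#   assert feature_extractor.predict(dummy_frame).shape == (1, NUM_FEATURES)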
def crop_center(frame):
    # Center-crop a single frame to IMG_SIZE x IMG_SIZE.
    center_crop_layer = layers.CenterCrop(IMG_SIZE, IMG_SIZE)
    cropped = center_crop_layer(frame[None, ...])
    cropped = cropped.numpy().squeeze()
    return cropped
def load_video(path, max_frames=0):
    # Read a video with OpenCV, center-crop each frame, and convert BGR -> RGB.
    # Returns an array of shape (num_frames, IMG_SIZE, IMG_SIZE, 3).
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center(frame)
            frame = frame[:, :, [2, 1, 0]]  # BGR -> RGB
            frames.append(frame)

            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)
def prepare_single_video(frames):
    frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    # Pad shorter videos with all-zero frames.
    if len(frames) < MAX_SEQ_LENGTH:
        diff = MAX_SEQ_LENGTH - len(frames)
        padding = np.zeros((diff, IMG_SIZE, IMG_SIZE, 3))
        frames = np.concatenate((frames, padding))

    frames = frames[None, ...]

    # Extract features from the frames of the current video.
    for i, batch in enumerate(frames):
        video_length = batch.shape[0]
        length = min(MAX_SEQ_LENGTH, video_length)
        for j in range(length):
            if np.mean(batch[j, :]) > 0.0:
                frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :])
            else:
                frame_features[i, j, :] = 0.0  # Zero-padded frames get zero features.
    return frame_features
def predict_action(path):
    frames = load_video(path)
    frame_features = prepare_single_video(frames)
    probabilities = model.predict(frame_features)[0]

    # Map class labels to confidences, sorted from most to least likely.
    confidences = {}
    for i in np.argsort(probabilities)[::-1]:
        confidences[CLASS_VOCAB[i]] = float(probabilities[i])

    gif_out = to_gif(frames[:MAX_SEQ_LENGTH])
    print(confidences)
    return confidences, gif_out
def to_gif(images):
    # Save the (RGB) frames as an animated GIF and return its path.
    converted_images = images.astype(np.uint8)
    imageio.mimsave("animation.gif", converted_images, fps=10)
    return "animation.gif"