#from .custom_layers import TransformerEncoder, PositionalEmbedding
from .constants import MAX_SEQ_LENGTH, NUM_FEATURES, IMG_SIZE, CLASS_VOCAB
from huggingface_hub import from_pretrained_keras
from tensorflow import keras
from keras import layers
import numpy as np
import imageio
import cv2

#model = from_pretrained_keras("shivi/video-classification", custom_objects={"PositionalEmbedding": PositionalEmbedding, "TransformerEncoder": TransformerEncoder})
model = from_pretrained_keras("keras-io/video-transformers")
""" | |
Below code is taken from the Video-Transformers example on keras-io by Sayak Paul | |
""" | |
def build_feature_extractor():
    # DenseNet121 pretrained on ImageNet, with global average pooling, serves as
    # the per-frame feature extractor.
    feature_extractor = keras.applications.DenseNet121(
        weights="imagenet",
        include_top=False,
        pooling="avg",
        input_shape=(IMG_SIZE, IMG_SIZE, 3),
    )
    preprocess_input = keras.applications.densenet.preprocess_input

    inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3))
    preprocessed = preprocess_input(inputs)
    outputs = feature_extractor(preprocessed)
    return keras.Model(inputs, outputs, name="feature_extractor")


feature_extractor = build_feature_extractor()
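
# Quick sanity-check sketch (not part of the original app): the extractor maps a
# single frame to a NUM_FEATURES-dimensional vector (1024 for DenseNet121 with
# average pooling), which is the per-frame representation fed to the transformer.
#
#   dummy_frame = np.zeros((1, IMG_SIZE, IMG_SIZE, 3), dtype="float32")
#   assert feature_extractor.predict(dummy_frame).shape == (1, NUM_FEATURES)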
def crop_center(frame):
    # Center-crop a single frame to IMG_SIZE x IMG_SIZE.
    center_crop_layer = layers.CenterCrop(IMG_SIZE, IMG_SIZE)
    cropped = center_crop_layer(frame[None, ...])
    cropped = cropped.numpy().squeeze()
    return cropped
def load_video(path, max_frames=0):
    # Read a video with OpenCV, center-crop each frame, and convert BGR -> RGB.
    # Returns an array of shape (num_frames, IMG_SIZE, IMG_SIZE, 3).
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = crop_center(frame)
            frame = frame[:, :, [2, 1, 0]]  # BGR -> RGB
            frames.append(frame)

            if len(frames) == max_frames:
                break
    finally:
        cap.release()
    return np.array(frames)
def prepare_single_video(frames):
    frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32")

    # Pad shorter videos with all-zero frames.
    if len(frames) < MAX_SEQ_LENGTH:
        diff = MAX_SEQ_LENGTH - len(frames)
        padding = np.zeros((diff, IMG_SIZE, IMG_SIZE, 3))
        frames = np.concatenate((frames, padding))

    frames = frames[None, ...]

    # Extract features from the frames of the current video.
    for i, batch in enumerate(frames):
        video_length = batch.shape[0]
        length = min(MAX_SEQ_LENGTH, video_length)
        for j in range(length):
            if np.mean(batch[j, :]) > 0.0:
                frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :])
            else:
                frame_features[i, j, :] = 0.0  # Zero-padded frames get zero features.
    return frame_features
def predict_action(path):
    frames = load_video(path)
    frame_features = prepare_single_video(frames)
    probabilities = model.predict(frame_features)[0]

    # Map class labels to confidences, sorted from most to least likely.
    confidences = {}
    for i in np.argsort(probabilities)[::-1]:
        confidences[CLASS_VOCAB[i]] = float(probabilities[i])

    gif_out = to_gif(frames[:MAX_SEQ_LENGTH])
    print(confidences)
    return confidences, gif_out
def to_gif(images):
    # Save the (RGB) frames as an animated GIF and return its path.
    converted_images = images.astype(np.uint8)
    imageio.mimsave("animation.gif", converted_images, fps=10)
    return "animation.gif"