import tensorflow as tf
import cv2
import gradio as gr
import random
import numpy as np
import pandas as pd

tflite_filename = 'model-400.tflite'
interpreter = tf.lite.Interpreter(model_path=tflite_filename)
runner = interpreter.get_signature_runner()

# Initialize the recurrent states of the streaming model with zeros.
# The 'image' entry is the frame input, not a state, so drop it.
init_states = {
    name: tf.zeros(x['shape'], dtype=x['dtype'])
    for name, x in runner.get_input_details().items()
}
del init_states['image']

# Load the class labels (no header row; the label text is in column 1).
data = pd.read_csv('labels.csv', header=None)
CLASSES = data[1].values


def format_frames(frame, output_size):
    """
    Pad and resize an image from a video.

    Args:
      frame: Image that needs to be resized and padded.
      output_size: Pixel size of the output frame image.

    Return:
      Formatted frame with padding of specified output size.
    """
    frame = tf.image.convert_image_dtype(frame, tf.float32)
    frame = tf.image.resize_with_pad(frame, *output_size)
    return frame


def frames_from_video_file(video_path, n_frames, output_size=(224, 224)):
    """
    Creates evenly spaced frames from a video file.

    Args:
      video_path: File path to the video.
      n_frames: Number of frames to be created per video file.
      output_size: Pixel size of the output frame image.

    Return:
      A NumPy array of frames in the shape of
      (1, n_frames, height, width, channels).
    """
    # Read each video frame by frame
    result = []
    src = cv2.VideoCapture(str(video_path))

    # Cast to int: OpenCV returns the frame count as a float, which would
    # break random.randint below.
    video_length = int(src.get(cv2.CAP_PROP_FRAME_COUNT))

    frame_step = max(1, video_length // n_frames)
    need_length = 1 + (n_frames - 1) * frame_step

    if need_length > video_length:
        start = 0
    else:
        # randint is inclusive on both ends, so the upper bound is max_start.
        max_start = video_length - need_length
        start = random.randint(0, max_start)

    src.set(cv2.CAP_PROP_POS_FRAMES, start)
    # ret is a boolean indicating whether read was successful,
    # frame is the image itself
    ret, frame = src.read()
    result.append(format_frames(frame, output_size))

    for _ in range(n_frames - 1):
        for _ in range(frame_step):
            ret, frame = src.read()
        if ret:
            frame = format_frames(frame, output_size)
            result.append(frame)
        else:
            # Pad with black frames if the video runs out early.
            result.append(np.zeros_like(result[0]))
    src.release()

    # OpenCV decodes frames as BGR; reorder the last axis to RGB and add a
    # leading batch dimension.
    result = np.array(result)[..., [2, 1, 0]].reshape((1, n_frames, *output_size, 3))

    return result


def get_top_k(probs, k=5, label_map=CLASSES):
    """Outputs the top k model labels and probabilities on the given video."""
    top_predictions = tf.argsort(probs, axis=-1, direction='DESCENDING')[:k]
    top_labels = tf.gather(label_map, top_predictions, axis=-1)
    top_labels = [label.decode('utf8') for label in top_labels.numpy()]
    top_probs = tf.gather(probs, top_predictions, axis=-1).numpy()
    return tuple(zip(top_labels, top_probs))


def inference(video):
    # Sample 13 frames from the uploaded video and split them into
    # single-frame clips for the streaming model.
    video = frames_from_video_file(video, 13)
    clips = tf.split(video, video.shape[1], axis=1)

    # Feed the clips through the model one at a time, carrying the recurrent
    # states forward between calls; the logits from the last clip summarize
    # the whole sequence.
    states = init_states
    for clip in clips:
        outputs = runner(**states, image=clip)
        logits = outputs.pop('logits')[0]
        states = outputs

    probs = tf.nn.softmax(logits)
    top_k = get_top_k(probs)
    result_str = '\n'.join(f'{label}: {prob:.4f}' for label, prob in top_k)
    return result_str


demo = gr.Interface(fn=inference, inputs='video', outputs='text')
demo.launch()
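
# Hedged sketch (not part of the original app): a quick sanity check of the
# streaming signature on a dummy clip. The (1, 1, 224, 224, 3) shape mirrors
# the per-clip input that inference() builds above; adjust it if this model's
# 'image' input differs. Left commented out because demo.launch() blocks.
#
# dummy_clip = tf.zeros((1, 1, 224, 224, 3), dtype=tf.float32)
# outputs = runner(**init_states, image=dummy_clip)
# print('logits shape:', outputs['logits'].shape)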