import random

import cv2
import gradio as gr
import numpy as np
import pandas as pd
import tensorflow as tf

tflite_filename = 'model-400.tflite'

interpreter = tf.lite.Interpreter(model_path=tflite_filename)
runner = interpreter.get_signature_runner()

# Build the all-zero initial states expected by the streaming signature;
# 'image' is removed because it is fed separately for every frame.
init_states = {
    name: tf.zeros(x['shape'], dtype=x['dtype'])
    for name, x in runner.get_input_details().items()
}
del init_states['image']

# labels.csv has no header row; the second column holds the class names.
data = pd.read_csv('labels.csv', header=None)
CLASSES = data[1].values

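# For a quick sanity check, the signature's inputs (the state tensors plus
# 'image') can be listed as below; the exact state names and shapes depend on
# the exported MoViNet variant, so treat this as an optional debugging aid:
#
#   for name, detail in runner.get_input_details().items():
#       print(name, detail['shape'], detail['dtype'])
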
def format_frames(frame, output_size):
    """
    Pad and resize an image from a video.

    Args:
      frame: Image that needs to be resized and padded.
      output_size: Pixel size of the output frame image.

    Returns:
      Formatted frame with padding of the specified output size.
    """
    frame = tf.image.convert_image_dtype(frame, tf.float32)
    frame = tf.image.resize_with_pad(frame, *output_size)
    return frame

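# Example with hypothetical values: a 480x640 uint8 frame from OpenCV becomes
# a float32 tensor letterboxed to the target size:
#
#   dummy = np.zeros((480, 640, 3), dtype=np.uint8)
#   out = format_frames(dummy, (224, 224))  # shape (224, 224, 3), float32 in [0, 1]
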
def frames_from_video_file(video_path, n_frames, output_size=(224, 224)):
    """
    Extracts evenly spaced frames from a video file.

    Args:
      video_path: File path to the video.
      n_frames: Number of frames to be created per video file.
      output_size: Pixel size of the output frame image.

    Returns:
      A NumPy array of frames in the shape of (1, n_frames, height, width, channels).
    """
    result = []
    src = cv2.VideoCapture(str(video_path))

    video_length = int(src.get(cv2.CAP_PROP_FRAME_COUNT))

    # Step between sampled frames so the clip spans most of the video.
    frame_step = max(1, video_length // n_frames)
    need_length = 1 + (n_frames - 1) * frame_step

    if need_length > video_length:
        start = 0
    else:
        # Choose a random window of the required length within the video
        # (randint's upper bound is inclusive, so max_start is the last
        # valid starting frame).
        max_start = video_length - need_length
        start = random.randint(0, max_start)

    src.set(cv2.CAP_PROP_POS_FRAMES, start)

    ret, frame = src.read()
    result.append(format_frames(frame, output_size))

    for _ in range(n_frames - 1):
        for _ in range(frame_step):
            ret, frame = src.read()
        if ret:
            frame = format_frames(frame, output_size)
            result.append(frame)
        else:
            # Pad with black frames if the video ends early.
            result.append(np.zeros_like(result[0]))
    src.release()

    # OpenCV decodes BGR; reorder channels to RGB and add a batch dimension.
    result = np.array(result)[..., [2, 1, 0]].reshape((1, n_frames, *output_size, 3))

    return result

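# Example usage (assuming a local 'sample.mp4' exists): sample 13 frames and
# check the shape the TFLite runner expects below:
#
#   clip = frames_from_video_file('sample.mp4', 13)
#   print(clip.shape)  # (1, 13, 224, 224, 3)
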
def get_top_k(probs, k=5, label_map=CLASSES):
    """Outputs the top k model labels and probabilities for the given video."""
    top_predictions = tf.argsort(probs, axis=-1, direction='DESCENDING')[:k]
    top_labels = tf.gather(label_map, top_predictions, axis=-1)
    # tf.gather returns the labels as bytes; decode them back to str.
    top_labels = [label.decode('utf8') for label in top_labels.numpy()]
    top_probs = tf.gather(probs, top_predictions, axis=-1).numpy()
    return tuple(zip(top_labels, top_probs))

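# Toy example with a 3-class distribution (the returned labels depend on
# labels.csv, which is assumed here to define at least three classes):
#
#   probs = tf.constant([0.1, 0.7, 0.2])
#   get_top_k(probs, k=2)  # e.g. (('class_b', 0.7), ('class_c', 0.2))
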
def inference(video):
    video = frames_from_video_file(video, 13)
    # Split the clip into per-frame tensors of shape (1, 1, H, W, 3) and run
    # them through the streaming model one at a time, carrying the states
    # forward; only the logits from the final frame are kept.
    clips = tf.split(video, video.shape[1], axis=1)
    states = init_states
    for clip in clips:
        outputs = runner(**states, image=clip)
        logits = outputs.pop('logits')[0]
        states = outputs
    probs = tf.nn.softmax(logits)
    top_k = get_top_k(probs)
    result_str = '\n'.join([f'{label}: {prob:.4f}' for label, prob in top_k])
    return result_str

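# Because the model is streaming, a prediction is available after every frame,
# not just at the end. A sketch of per-frame top-1 tracking, reusing clips and
# states exactly as in inference() (not used by the demo itself):
#
#   states = init_states
#   for clip in clips:
#       outputs = runner(**states, image=clip)
#       logits = outputs.pop('logits')[0]
#       states = outputs
#       print(CLASSES[int(tf.argmax(logits))])
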
demo = gr.Interface(fn=inference, inputs='video', outputs='text')
demo.launch()
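
# When running locally, demo.launch(share=True) also exposes a temporary
# public URL; the plain launch() above serves the interface on localhost.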