import tensorflow as tf
import cv2
import gradio as gr
import random
import numpy as np
import pandas as pd

tflite_filename = 'model-400.tflite'
interpreter = tf.lite.Interpreter(model_path=tflite_filename)
runner = interpreter.get_signature_runner()

# Initialize the recurrent states of the streaming model with zeros.
# The 'image' entry is the frame input, not a state, so drop it.
init_states = {
    name: tf.zeros(x['shape'], dtype=x['dtype'])
    for name, x in runner.get_input_details().items()
}
del init_states['image']

# Load the class labels (no header row; the label text is in column 1).
data = pd.read_csv('labels.csv', header=None)
CLASSES = data[1].values


def format_frames(frame, output_size):
    """
    Pad and resize an image from a video.

    Args:
      frame: Image that needs to be resized and padded.
      output_size: Pixel size of the output frame image.

    Return:
      Formatted frame with padding of specified output size.
    """
    frame = tf.image.convert_image_dtype(frame, tf.float32)
    frame = tf.image.resize_with_pad(frame, *output_size)
    return frame


def frames_from_video_file(video_path, n_frames, output_size=(224, 224)):
    """
    Creates evenly spaced frames from a video file.

    Args:
      video_path: File path to the video.
      n_frames: Number of frames to be created per video file.
      output_size: Pixel size of the output frame image.

    Return:
      A NumPy array of frames in the shape of
      (1, n_frames, height, width, channels).
    """
    # Read each video frame by frame
    result = []
    src = cv2.VideoCapture(str(video_path))

    # Cast to int: OpenCV returns the frame count as a float, which would
    # break random.randint below.
    video_length = int(src.get(cv2.CAP_PROP_FRAME_COUNT))

    frame_step = max(1, video_length // n_frames)
    need_length = 1 + (n_frames - 1) * frame_step

    if need_length > video_length:
        start = 0
    else:
        # randint is inclusive on both ends, so the upper bound is max_start.
        max_start = video_length - need_length
        start = random.randint(0, max_start)

    src.set(cv2.CAP_PROP_POS_FRAMES, start)
    # ret is a boolean indicating whether read was successful,
    # frame is the image itself
    ret, frame = src.read()
    result.append(format_frames(frame, output_size))

    for _ in range(n_frames - 1):
        for _ in range(frame_step):
            ret, frame = src.read()
        if ret:
            frame = format_frames(frame, output_size)
            result.append(frame)
        else:
            # Pad with black frames if the video runs out early.
            result.append(np.zeros_like(result[0]))
    src.release()

    # OpenCV decodes frames as BGR; reorder the last axis to RGB and add a
    # leading batch dimension.
    result = np.array(result)[..., [2, 1, 0]].reshape((1, n_frames, *output_size, 3))

    return result


def get_top_k(probs, k=5, label_map=CLASSES):
    """Outputs the top k model labels and probabilities on the given video."""
    top_predictions = tf.argsort(probs, axis=-1, direction='DESCENDING')[:k]
    top_labels = tf.gather(label_map, top_predictions, axis=-1)
    top_labels = [label.decode('utf8') for label in top_labels.numpy()]
    top_probs = tf.gather(probs, top_predictions, axis=-1).numpy()
    return tuple(zip(top_labels, top_probs))


def inference(video):
    # Sample 13 frames from the uploaded video and split them into
    # single-frame clips for the streaming model.
    video = frames_from_video_file(video, 13)
    clips = tf.split(video, video.shape[1], axis=1)

    # Feed the clips through the model one at a time, carrying the recurrent
    # states forward between calls; the logits from the last clip summarize
    # the whole sequence.
    states = init_states
    for clip in clips:
        outputs = runner(**states, image=clip)
        logits = outputs.pop('logits')[0]
        states = outputs

    probs = tf.nn.softmax(logits)
    top_k = get_top_k(probs)
    result_str = '\n'.join(f'{label}: {prob:.4f}' for label, prob in top_k)
    return result_str


demo = gr.Interface(fn=inference, inputs='video', outputs='text')
demo.launch()
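
# Hedged sketch (not part of the original app): a quick sanity check of the
# streaming signature on a dummy clip. The (1, 1, 224, 224, 3) shape mirrors
# the per-clip input that inference() builds above; adjust it if this model's
# 'image' input differs. Left commented out because demo.launch() blocks.
#
# dummy_clip = tf.zeros((1, 1, 224, 224, 3), dtype=tf.float32)
# outputs = runner(**init_states, image=dummy_clip)
# print('logits shape:', outputs['logits'].shape)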