# NOTE: this file was recovered from a scraped Hugging Face Spaces page;
# the page-status residue ("Spaces: Sleeping") is kept here only as a comment
# so the module parses.
import time

import gradio as gr
import onnxruntime as ort
import pandas as pd
import torch

from mediapipe.python.solutions import holistic

from utils.data import preprocess
from utils.model import get_predictions
# Markdown fragments for the Gradio page (blank placeholders for now).
title = '''
'''
cite_markdown = '''
'''
description = '''
'''

# Example videos listed under the demo's input widget.
examples = []

# ONNX Runtime session for the SAM-SLR-v2 joint-stream recognizer.
ort_session = ort.InferenceSession('VSL_SAM_SLR_V2_joint.onnx')

# Class-id -> gloss lookup built from the label CSV (no header row;
# the row index is the class id).
id2gloss = pd.read_csv('gloss.csv', names=['id', 'gloss'])['gloss'].to_dict()
def inference(
    video: str,
    progress: gr.Progress = gr.Progress(),
) -> str:
    '''
    Video-based inference for Vietnamese Sign Language recognition.

    Runs MediaPipe Holistic keypoint extraction on the video, feeds the
    preprocessed keypoints to the ONNX model, and formats the top-3
    predictions together with timing information.

    Parameters
    ----------
    video : str
        The path to the video.
    progress : gr.Progress, optional
        The progress bar, by default gr.Progress()

    Returns
    -------
    str
        The inference message.
    '''
    keypoints_detector = holistic.Holistic(
        static_image_mode=False,
        model_complexity=2,
        enable_segmentation=True,
        refine_face_landmarks=True,
    )
    try:
        progress(0, desc='Preprocessing video')
        start_time = time.time()
        inputs = preprocess(
            source=video,
            keypoints_detector=keypoints_detector,
        )
        data_time = time.time() - start_time

        progress(1/2, desc='Getting predictions')
        start_time = time.time()
        predictions = get_predictions(
            inputs=inputs, ort_session=ort_session, id2gloss=id2gloss, k=3
        )
        model_time = time.time() - start_time
    finally:
        # FIX: release the detector's native (C++) resources; the original
        # leaked a Holistic graph per call.
        keypoints_detector.close()

    if not predictions:
        output_message = 'No sign language detected in the video. Please try again.'
    else:
        output_message = 'The top-3 predictions are:\n'
        for i, prediction in enumerate(predictions):
            # FIX: format spec was ':2f' (min-width 2, full precision);
            # ':.2f' rounds the confidence score to two decimals as intended.
            output_message += f'\t{i+1}. {prediction["label"]} ({prediction["score"]:.2f})\n'
        output_message += f'Data processing time: {data_time:.2f} seconds\n'
        output_message += f'Model inference time: {model_time:.2f} seconds\n'
        output_message += f'Total time: {data_time + model_time:.2f} seconds'
        output_message += f'\nInput shape: {inputs.shape}'

    # FIX: the final update previously reported 1/2 ('Completed' at 50%);
    # mark the progress bar as finished.
    progress(1, desc='Completed')
    return output_message
# Wire the inference function into a video-in / text-out Gradio demo.
interface_kwargs = dict(
    fn=inference,
    inputs='video',
    outputs='text',
    examples=examples,
    title=title,
    description=description,
)
iface = gr.Interface(**interface_kwargs)
iface.launch()
# Local smoke test (bypasses the Gradio UI):
# print(inference('000_con_cho.mp4'))