import gradio as gr
import torch
import numpy as np
from transformers import AutoProcessor, AutoModel
from PIL import Image
from decord import VideoReader, cpu

def sample_uniform_frame_indices(clip_len, seg_len):
    """
    Sample `clip_len` uniformly spaced frame indices from a video with `seg_len` frames.
    If the video has fewer than `clip_len` frames, the available indices are repeated
    until enough are collected.
    """
    if seg_len < clip_len:
        # Not enough frames: tile the available indices and truncate to clip_len.
        repeat_factor = int(np.ceil(clip_len / seg_len))
        indices = (np.arange(seg_len).tolist() * repeat_factor)[:clip_len]
    else:
        # Enough frames: take every `spacing`-th frame starting from index 0.
        spacing = seg_len // clip_len
        indices = [i * spacing for i in range(clip_len)]
    return np.array(indices).astype(np.int64)
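
# Illustrative examples: with clip_len=8 and seg_len=100 the spacing is 12, giving
# indices [0, 12, 24, ..., 84]; with clip_len=8 and seg_len=5 the five available
# frames are repeated, giving [0, 1, 2, 3, 4, 0, 1, 2].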

def read_video_decord(file_path, indices):
    """Decode the frames at `indices` with decord; returns an array of shape (num_frames, height, width, 3)."""
    vr = VideoReader(file_path, num_threads=1, ctx=cpu(0))
    video = vr.get_batch(indices).asnumpy()
    return video

def concatenate_frames(frames, clip_len):
    """Tile the sampled frames into a single grid image so they can all be displayed at once."""
    assert len(frames) == clip_len, f"The function expects {clip_len} frames as input."
    # Grid layout (rows, cols) for each supported clip length.
    layout = {
        32: (4, 8),
        16: (4, 4),
        8: (2, 4)
    }
    rows, cols = layout[clip_len]
    frame_height, frame_width = frames[0].shape[0], frames[0].shape[1]
    combined_image = Image.new('RGB', (frame_width * cols, frame_height * rows))
    frame_iter = iter(frames)
    y_offset = 0
    for _ in range(rows):
        x_offset = 0
        for _ in range(cols):
            img = Image.fromarray(next(frame_iter))
            combined_image.paste(img, (x_offset, y_offset))
            x_offset += frame_width
        y_offset += frame_height
    return combined_image
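
# For example, a 16-frame clip of 224x224 frames is tiled into a 4x4 grid,
# yielding an 896x896 contact-sheet image for display.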

def model_interface(uploaded_video, model_choice, activities):
    # Each checkpoint expects a specific number of frames per clip (default to 32 if unknown).
    clip_len = {
        "microsoft/xclip-base-patch16-zero-shot": 32,
        "microsoft/xclip-base-patch32-16-frames": 16,
        "microsoft/xclip-base-patch32": 8
    }.get(model_choice, 32)

    indices = sample_uniform_frame_indices(clip_len, seg_len=len(VideoReader(uploaded_video)))
    video = read_video_decord(uploaded_video, indices)
    concatenated_image = concatenate_frames(video, clip_len)

    processor = AutoProcessor.from_pretrained(model_choice)
    model = AutoModel.from_pretrained(model_choice)

    # Split the comma-separated labels and drop surrounding whitespace.
    activities_list = [activity.strip() for activity in activities.split(",") if activity.strip()]
    inputs = processor(
        text=activities_list,
        videos=list(video),
        return_tensors="pt",
        padding=True,
    )

    with torch.no_grad():
        outputs = model(**inputs)

    # One row per video; softmax over the candidate activity labels.
    logits_per_video = outputs.logits_per_video
    probs = logits_per_video.softmax(dim=1)

    results_probs = []
    results_logits = []
    for i, activity in enumerate(activities_list):
        prob = float(probs[0][i])
        logit = float(logits_per_video[0][i])
        results_probs.append((activity, f"Probability: {prob * 100:.2f}%"))
        results_logits.append((activity, f"Raw Score: {logit:.2f}"))

    # Retrieve the most likely predicted label and its probability.
    max_prob_idx = probs[0].argmax().item()
    most_likely_activity = activities_list[max_prob_idx]
    most_likely_prob = float(probs[0][max_prob_idx])

    return concatenated_image, results_probs, results_logits, (most_likely_activity, f"Probability: {most_likely_prob * 100:.2f}%")
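
# Gradio front end: a video upload, a checkpoint selector, and a comma-separated list of
# candidate activities; the outputs are the sampled-frame grid, per-label probabilities,
# raw similarity scores, and the top prediction.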
iface = gr.Interface(
    fn=model_interface,
    inputs=[
        gr.components.Video(label="Upload a video file"),
        gr.components.Dropdown(choices=[
            "microsoft/xclip-base-patch16-zero-shot",
            "microsoft/xclip-base-patch32-16-frames",
            "microsoft/xclip-base-patch32"
        ], label="Model Choice"),
        gr.components.Textbox(lines=4, label="Enter activities (comma-separated)"),
    ],
    outputs=[
        gr.components.Image(type="pil", label="Sampled Frames"),
        gr.components.Textbox(type="text", label="Probabilities"),
        gr.components.Textbox(type="text", label="Raw Scores"),
        gr.components.Textbox(type="text", label="Most Likely Prediction")
    ],
    live=False
)

iface.launch()
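
# Running `python app.py` starts the demo; by default Gradio serves it locally on port 7860.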