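"""Gradio demo for zero-shot video activity recognition with Microsoft's X-CLIP models.

The app samples a fixed number of frames from an uploaded video, displays them as a
grid image, and scores the user-provided, comma-separated activity labels against the clip.
"""
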
import gradio as gr
import torch
import numpy as np
from transformers import AutoProcessor, AutoModel
from PIL import Image
from decord import VideoReader, cpu

def sample_uniform_frame_indices(clip_len, seg_len):
    """
    Samples `clip_len` uniformly spaced frame indices from a video of length `seg_len`.
    Handles the edge case where `seg_len` is smaller than `clip_len` by repeating frames.
    """
    if seg_len < clip_len:
        # Not enough frames: repeat the available indices until `clip_len` is reached.
        repeat_factor = np.ceil(clip_len / seg_len).astype(int)
        indices = np.arange(seg_len).tolist() * repeat_factor
        indices = indices[:clip_len]
    else:
        spacing = seg_len // clip_len
        indices = [i * spacing for i in range(clip_len)]
    return np.array(indices).astype(np.int64)


def read_video_decord(file_path, indices):
    """Reads the frames at `indices` from the video file using decord."""
    vr = VideoReader(file_path, num_threads=1, ctx=cpu(0))
    video = vr.get_batch(indices).asnumpy()
    return video


def concatenate_frames(frames, clip_len):
    """Tiles the sampled frames into a single grid image for display."""
    assert len(frames) == clip_len, f"The function expects {clip_len} frames as input."
    # Grid layout (rows, cols) for each supported clip length.
    layout = {
        32: (4, 8),
        16: (4, 4),
        8: (2, 4),
    }
    rows, cols = layout[clip_len]
    combined_image = Image.new('RGB', (frames[0].shape[1] * cols, frames[0].shape[0] * rows))
    frame_iter = iter(frames)
    y_offset = 0
    for i in range(rows):
        x_offset = 0
        for j in range(cols):
            img = Image.fromarray(next(frame_iter))
            combined_image.paste(img, (x_offset, y_offset))
            x_offset += frames[0].shape[1]
        y_offset += frames[0].shape[0]
    return combined_image


def model_interface(uploaded_video, model_choice, activities):
    # Each checkpoint expects a specific number of frames per clip.
    clip_len = {
        "microsoft/xclip-base-patch16-zero-shot": 32,
        "microsoft/xclip-base-patch32-16-frames": 16,
        "microsoft/xclip-base-patch32": 8
    }.get(model_choice, 32)

    indices = sample_uniform_frame_indices(clip_len, seg_len=len(VideoReader(uploaded_video)))
    video = read_video_decord(uploaded_video, indices)
    concatenated_image = concatenate_frames(video, clip_len)

    processor = AutoProcessor.from_pretrained(model_choice)
    model = AutoModel.from_pretrained(model_choice)

    activities_list = [activity.strip() for activity in activities.split(",")]
    inputs = processor(
        text=activities_list,
        videos=list(video),
        return_tensors="pt",
        padding=True,
    )

    with torch.no_grad():
        outputs = model(**inputs)

    logits_per_video = outputs.logits_per_video
    probs = logits_per_video.softmax(dim=1)

    results_probs = []
    results_logits = []
    for i, activity in enumerate(activities_list):
        prob = float(probs[0][i])
        logit = float(logits_per_video[0][i])
        results_probs.append((activity, f"Probability: {prob * 100:.2f}%"))
        results_logits.append((activity, f"Raw Score: {logit:.2f}"))

    # Retrieve the most likely predicted label and its probability.
    max_prob_idx = probs[0].argmax().item()
    most_likely_activity = activities_list[max_prob_idx]
    most_likely_prob = float(probs[0][max_prob_idx])

    return concatenated_image, results_probs, results_logits, (most_likely_activity, f"Probability: {most_likely_prob * 100:.2f}%")


iface = gr.Interface(
    fn=model_interface,
    inputs=[
        gr.components.Video(label="Upload a video file"),
        gr.components.Dropdown(choices=[
            "microsoft/xclip-base-patch16-zero-shot",
            "microsoft/xclip-base-patch32-16-frames",
            "microsoft/xclip-base-patch32"
        ], label="Model Choice"),
        gr.components.Textbox(lines=4, label="Enter activities (comma-separated)"),
    ],
    outputs=[
        gr.components.Image(type="pil", label="Sampled Frames"),
        gr.components.Textbox(type="text", label="Probabilities"),
        gr.components.Textbox(type="text", label="Raw Scores"),
        gr.components.Textbox(type="text", label="Most Likely Prediction")
    ],
    live=False
)

iface.launch()