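"""Gradio demo: zero-shot video classification with Microsoft's X-CLIP models.

The app samples a fixed number of frames from an uploaded video, scores them
against a comma-separated list of candidate activities, and returns the
sampled-frame montage together with per-activity probabilities and raw logits.
"""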
import gradio as gr
import torch
import numpy as np
from transformers import AutoProcessor, AutoModel
from PIL import Image
from decord import VideoReader, cpu

def sample_uniform_frame_indices(clip_len, seg_len):
    """
    Samples `clip_len` uniformly spaced frame indices from a video of length `seg_len`.
    Handles edge cases where `seg_len` might be less than `clip_len`.
    """
    if seg_len < clip_len:
        repeat_factor = np.ceil(clip_len / seg_len).astype(int)
        indices = np.arange(seg_len).tolist() * repeat_factor
        indices = indices[:clip_len]
    else:
        spacing = seg_len // clip_len
        indices = [i * spacing for i in range(clip_len)]
    
    return np.array(indices).astype(np.int64)

def read_video_decord(file_path, indices):
    vr = VideoReader(file_path, num_threads=1, ctx=cpu(0))
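    # get_batch returns a decord NDArray of shape (num_frames, height, width, 3);
    # asnumpy() converts it to a uint8 numpy array.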
    video = vr.get_batch(indices).asnumpy()
    return video

def concatenate_frames(frames, clip_len):
    assert len(frames) == clip_len, f"The function expects {clip_len} frames as input."
    
    # Supported grid layouts, keyed by clip length: (rows, cols).
    layout = {
        32: (4, 8),
        16: (4, 4),
        8:  (2, 4),
    }
    rows, cols = layout[clip_len]
    
    combined_image = Image.new('RGB', (frames[0].shape[1]*cols, frames[0].shape[0]*rows))
    frame_iter = iter(frames)
    y_offset = 0
    for i in range(rows):
        x_offset = 0
        for j in range(cols):
            img = Image.fromarray(next(frame_iter))
            combined_image.paste(img, (x_offset, y_offset))
            x_offset += frames[0].shape[1]
        y_offset += frames[0].shape[0]
        
    return combined_image


def model_interface(uploaded_video, model_choice, activities):
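    # Each X-CLIP checkpoint was trained on a fixed number of frames per clip;
    # fall back to 32 if the selected checkpoint is not in the mapping.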
    clip_len = {
        "microsoft/xclip-base-patch16-zero-shot": 32,
        "microsoft/xclip-base-patch32-16-frames": 16,
        "microsoft/xclip-base-patch32": 8
    }.get(model_choice, 32)

    # Open the video once to get its frame count, then sample and decode.
    seg_len = len(VideoReader(uploaded_video, num_threads=1, ctx=cpu(0)))
    indices = sample_uniform_frame_indices(clip_len, seg_len=seg_len)
    video = read_video_decord(uploaded_video, indices)
    concatenated_image = concatenate_frames(video, clip_len)

    # NOTE: the processor and model are reloaded on every request; caching them
    # per checkpoint would make repeated calls much faster.
    processor = AutoProcessor.from_pretrained(model_choice)
    model = AutoModel.from_pretrained(model_choice)
    
    # Strip stray whitespace so "a, b" and "a,b" yield identical labels,
    # and drop empty entries left by trailing commas.
    activities_list = [activity.strip() for activity in activities.split(",") if activity.strip()]
    inputs = processor(
        text=activities_list,
        videos=list(video),
        return_tensors="pt",
        padding=True,
    )

    # Inference only, so gradient tracking is unnecessary.
    with torch.no_grad():
        outputs = model(**inputs)

    # Softmax over dim=1 turns the per-video logits into a probability
    # distribution across the candidate activities.
    logits_per_video = outputs.logits_per_video
    probs = logits_per_video.softmax(dim=1)

    results_probs = []
    results_logits = []
    for i, activity in enumerate(activities_list):
        prob = float(probs[0][i])
        logit = float(logits_per_video[0][i])
        results_probs.append((activity, f"Probability: {prob * 100:.2f}%"))
        results_logits.append((activity, f"Raw Score: {logit:.2f}"))

    # Retrieve most likely predicted label and its probability
    max_prob_idx = probs[0].argmax().item()
    most_likely_activity = activities_list[max_prob_idx]
    most_likely_prob = float(probs[0][max_prob_idx])

    return concatenated_image, results_probs, results_logits, (most_likely_activity, f"Probability: {most_likely_prob * 100:.2f}%")

iface = gr.Interface(
    fn=model_interface,
    inputs=[
        gr.components.Video(label="Upload a video file"),
        gr.components.Dropdown(choices=[
            "microsoft/xclip-base-patch16-zero-shot",
            "microsoft/xclip-base-patch32-16-frames",
            "microsoft/xclip-base-patch32"
        ], label="Model Choice"),
        gr.components.Textbox(lines=4, label="Enter activities (comma-separated)"),
    ],
    outputs=[
        gr.components.Image(type="pil", label="Sampled Frames"),
        gr.components.Textbox(type="text", label="Probabilities"),
        gr.components.Textbox(type="text", label="Raw Scores"),
        gr.components.Textbox(type="text", label="Most Likely Prediction")
    ],
    live=False
)

# `share=True` could be passed to launch() to expose a temporary public URL.
iface.launch()