import shutil
import gradio as gr
import torch
from fastapi import FastAPI
import os
import tempfile
from Infer import Infer
# Markdown text shown by the gr.Markdown component of the demo.
title_markdown = """
Temporal-guided Mixture-of-Experts for Zero-Shot Video Question Answering
Under review.
"""

# Custom CSS passed to gr.Blocks; the rule applies to <button> elements
# nested under the element whose id is "buttons".
block_css = """
#buttons button {
min-width: min(120px,100%);
}
"""
def save_video_to_local(video_path):
    """Copy a video into the local ``temp`` directory under a fresh name.

    Args:
        video_path: Path of the existing file to copy.

    Returns:
        The relative path ``temp/<random>.mp4`` of the newly written copy.

    Raises:
        OSError: If the source cannot be read or the copy cannot be written.
    """
    # Function-scope import keeps this block self-contained.
    import uuid

    # Make sure the target directory exists instead of depending on some
    # other code having created it first.
    os.makedirs('temp', exist_ok=True)
    # uuid4 is public API; tempfile._get_candidate_names() is a private
    # helper of the tempfile module and may change without notice.
    filename = os.path.join('temp', uuid.uuid4().hex + '.mp4')
    shutil.copyfile(video_path, filename)
    return filename
def generate(video, textbox_in, candbox_in):
    """Run the model on one question and return its answer.

    Args:
        video: Value of the video input; replaced by the string "none" when
            it is falsy (no video supplied).
        textbox_in: The question text.
        candbox_in: Text of a Python literal holding the answer options,
            e.g. "['slice the pizza', 'put cheese']".

    Returns:
        Whatever ``handler.generate`` returns for these inputs.

    Raises:
        ValueError: If ``candbox_in`` cannot be parsed as a Python literal.
    """
    # Function-scope import keeps this block self-contained.
    import ast

    # SECURITY: candbox_in is free text typed by the user. It used to be
    # passed to eval(), which executes arbitrary code. literal_eval only
    # accepts literals (lists, tuples, strings, numbers, ...).
    try:
        candidates = ast.literal_eval(candbox_in)
    except (ValueError, SyntaxError, TypeError) as err:
        raise ValueError(
            "Options must be a Python list literal, e.g. ['a', 'b', 'c']"
        ) from err

    video = video or "none"
    text_en_out = handler.generate(textbox_in, candidates, video)
    print(text_en_out)
    return text_en_out
# Device string handed to Infer when the handler is constructed.
device = "cpu"
handler = Infer(device)

# Scratch directory for copied videos. exist_ok=True replaces the previous
# exists()-then-makedirs() sequence, which could fail if the directory was
# created between the check and the call.
os.makedirs("temp", exist_ok=True)
# Text inputs are constructed up front; they are placed into the page later
# by calling .render() on them inside the layout.
question_box = gr.Textbox(
    placeholder="Enter question",
    show_label=False,
    container=False,
)
candidates_box = gr.Textbox(
    placeholder="Enter a list of options",
    show_label=False,
    container=False,
)
# Build the demo page: a video input with bundled examples on one side, the
# chatbot output plus question/options inputs and a Send button on the other.
with gr.Blocks(title='T-MoENet', theme=gr.themes.Default(), css=block_css) as demo:
    gr.Markdown(title_markdown)

    # State holders; none of them is wired to the click handler below.
    state = gr.State()
    state_ = gr.State()
    first_run = gr.State()
    images_tensor = gr.State()

    with gr.Row():
        with gr.Column(scale=3):
            video = gr.Video(label="Input Video")

            # Example clips live in a "videos" folder next to this file.
            # os.path.join avoids hand-built paths with hard-coded "/".
            cur_dir = os.path.dirname(os.path.abspath(__file__))
            print(cur_dir)
            videos_dir = os.path.join(cur_dir, "videos")

            # Each example row is [video path, question, options literal].
            gr.Examples(
                examples=[
                    [
                        os.path.join(videos_dir, "3249402410.mp4"),
                        "What did the lady in black on the left do after she finished spreading the sauce on her pizza?",
                        "['slice the pizza', 'cut the meat', 'point', 'put cheese', 'put on plate']",
                    ],
                    [
                        os.path.join(videos_dir, "4882821564.mp4"),
                        "Why did the boy clap his hands when he ran to the christmas tree?",
                        "['adjust the tree', 'get away the dust', 'dancing', 'pressed a button to activate', 'presents']",
                    ],
                    [
                        os.path.join(videos_dir, "6233408665.mp4"),
                        "What did the people on the sofa do after the lady in pink finished singing?",
                        "['sitting', 'give it to the girl', 'take music sheet', 'clap', 'walk in circles']",
                    ],
                ],
                inputs=[video, question_box, candidates_box],
            )

        with gr.Column(scale=3):
            chatbot = gr.Chatbot(label="T-MoENet", bubble_full_width=True)
            with gr.Row():
                with gr.Column(scale=4):
                    question_box.render()
                with gr.Column(scale=4):
                    candidates_box.render()
                with gr.Column(scale=1, min_width=50):
                    submit_btn = gr.Button(
                        value="Send", variant="primary", interactive=True
                    )

    # NOTE(review): the return value of generate is sent to a gr.Chatbot;
    # confirm it has the message format that component expects.
    submit_btn.click(generate, [video, question_box, candidates_box], [chatbot])

# NOTE(review): launched at import time with a public share link; consider
# an `if __name__ == "__main__":` guard if this module is ever imported.
demo.launch(share=True)