# Temporal-guided Mixture-of-Experts for Zero-Shot Video Question Answering

import shutil
import gradio as gr
import torch
from fastapi import FastAPI
import os
import tempfile
from Infer import Infer

# HTML/Markdown header rendered at the top of the demo page via gr.Markdown:
# the paper title, its review status, and a badge linking to the GitHub repo.
title_markdown = ("""
<div style="display: flex; justify-content: center; align-items: center; text-align: center;">
  <div>
    <h1 >Temporal-guided Mixture-of-Experts for Zero-Shot Video Question Answering</h1>
    <h5 style="margin: 0;">Under review.</h5>
  </div>
</div>
                  
<div align="center">
    <div style="display:flex; gap: 0.25rem;" align="center">
        <a href='https://github.com/qyx1121/T-MoENet'><img src='https://img.shields.io/badge/Github-Code-blue'></a>
    </div>
</div>
""")

# Custom stylesheet handed to gr.Blocks(css=...): buttons inside the element
# with id "buttons" get a minimum width of 120px, capped at the container width.
block_css = """
#buttons button {
    min-width: min(120px,100%);
}
"""

def save_video_to_local(video_path):
    """Copy a video into the local ``temp`` directory under a fresh unique name.

    Args:
        video_path: Path of the existing video file to copy.

    Returns:
        The relative path ``temp/<random hex>.mp4`` of the new copy.

    Raises:
        OSError: If the source cannot be read or the copy cannot be written.
    """
    # Function-scope import keeps this block self-contained.
    import uuid

    # Do not rely on module start-up having created the directory already.
    os.makedirs('temp', exist_ok=True)
    # uuid4 is a public API; the previous tempfile._get_candidate_names() is a
    # private helper that may change or disappear between Python releases.
    filename = os.path.join('temp', uuid.uuid4().hex + '.mp4')
    shutil.copyfile(video_path, filename)
    return filename


def generate(video, textbox_in, candbox_in):
    """Answer a multiple-choice question about a video using the global ``handler``.

    Args:
        video: Video value supplied by the UI; any falsy value (no video
            given) is replaced by the string ``"none"``.
        textbox_in: The question text.
        candbox_in: Text of a Python literal holding the answer options,
            e.g. ``"['clap', 'sitting', 'point']"``.

    Returns:
        Whatever ``handler.generate`` returns for the parsed inputs.

    Raises:
        ValueError: If ``candbox_in`` is not a valid Python literal.
    """
    # Function-scope import keeps this block self-contained.
    import ast

    video = video if video else "none"

    # SECURITY: candbox_in is free text typed into a web UI. It used to be
    # passed to eval(), which would execute arbitrary Python supplied by any
    # visitor. literal_eval only accepts literals (lists, strings, numbers...),
    # which covers the documented "list of options" input format.
    try:
        candidates = ast.literal_eval(candbox_in)
    except (ValueError, SyntaxError, TypeError) as err:
        raise ValueError(
            "Options must be a Python list literal, e.g. ['option a', 'option b']"
        ) from err

    text_en_out = handler.generate(textbox_in, candidates, video)
    print(text_en_out)
    #torch.cuda.empty_cache()
    return text_en_out


# Inference runs on the CPU. The Infer wrapper is built once at import time
# and is the object generate() calls for every request.
device = "cpu"
handler = Infer(device)
# handler.model.to(dtype=dtype)
# Working directory for copied videos. exist_ok=True replaces the earlier
# exists()-then-makedirs() pair, which could race with another process.
os.makedirs("temp", exist_ok=True)

#print(torch.cuda.memory_allocated())
#print(torch.cuda.max_memory_allocated())

# Both text inputs are created up front (outside any layout context) and only
# placed on the page later through their .render() calls.
_textbox_options = {"show_label": False, "container": False}

# Free-text question about the video.
question_box = gr.Textbox(placeholder="Enter question", **_textbox_options)

# Answer options, typed as a list.
candidates_box = gr.Textbox(
    placeholder="Enter a list of options", **_textbox_options
)

# Example rows shown under the video input: (path relative to this file's
# directory, question, answer options written as a list literal).
_example_specs = (
    (
        "/videos/3249402410.mp4",
        "What did the lady in black on the left do after she finished spreading the sauce on her pizza?",
        "['slice the pizza', 'cut the meat', 'point', 'put cheese', 'put on plate']",
    ),
    (
        "/videos/4882821564.mp4",
        "Why did the boy clap his hands when he ran to the christmas tree?",
        "['adjust the tree', 'get away the dust', 'dancing', 'pressed a button to activate', 'presents']",
    ),
    (
        "/videos/6233408665.mp4",
        "What did the people on the sofa do after the lady in pink finished singing?",
        "['sitting', 'give it to the girl', 'take music sheet', 'clap', 'walk in circles']",
    ),
)

# Page layout: header on top, then one row with the video/examples column on
# the left and the answer panel plus input controls on the right.
with gr.Blocks(
    title='T-MoENet',
    theme=gr.themes.Default(),
    css=block_css,
) as demo:
    gr.Markdown(title_markdown)
    # State holders; nothing in this layout wires them to an event.
    state, state_, first_run, images_tensor = (gr.State() for _ in range(4))

    with gr.Row():
        # Left column: video input and clickable examples.
        with gr.Column(scale=3):
            video = gr.Video(label="Input Video")
            cur_dir = os.path.dirname(os.path.abspath(__file__))
            print(cur_dir)
            gr.Examples(
                examples=[
                    [cur_dir + rel_path, question, options]
                    for rel_path, question, options in _example_specs
                ],
                inputs=[video, question_box, candidates_box],
            )

        # Right column: model output above the question/options/send controls.
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(label="T-MoENet", bubble_full_width=True)
            with gr.Row():
                # The two text boxes were created earlier; render() places them here.
                with gr.Column(scale=4):
                    question_box.render()
                with gr.Column(scale=4):
                    candidates_box.render()
                with gr.Column(scale=1, min_width=50):
                    submit_btn = gr.Button(
                        value="Send",
                        variant="primary",
                        interactive=True,
                    )

    # NOTE(review): generate()'s return value is sent straight to the Chatbot;
    # confirm handler.generate produces a value in the format gr.Chatbot accepts.
    submit_btn.click(
        fn=generate,
        inputs=[video, question_box, candidates_box],
        outputs=[chatbot],
    )

# Start the web server and request a public share link.
demo.launch(share=True)