Files changed (1)
  1. app.py +95 -43
app.py CHANGED
@@ -2,13 +2,12 @@ import spaces
 
 import os
 import re
-import traceback
 
 import torch
 import gradio as gr
 
 import sys
-sys.path.append('./VideoLLaMA2')
+sys.path.append('./')
 from videollama2 import model_init, mm_infer
 from videollama2.utils import disable_torch_init
 
@@ -98,7 +97,7 @@ class Chat:
 
 
 @spaces.GPU(duration=120)
-def generate(image, video, message, chatbot, textbox_in, temperature, top_p, max_output_tokens, dtype=torch.float16):
+def generate(image, video, audio, message, chatbot, va_tag, textbox_in, temperature, top_p, max_output_tokens, dtype=torch.float16):
     data = []
 
     processor = handler.processor
@@ -106,7 +105,15 @@ def generate(image, video, message, chatbot, textbox_in, temperature, top_p, max
     if image is not None:
         data.append((processor['image'](image).to(handler.model.device, dtype=dtype), '<image>'))
     elif video is not None:
-        data.append((processor['video'](video).to(handler.model.device, dtype=dtype), '<video>'))
+        video_audio = processor['video'](video, va=va_tag == "Audio Vision")
+        if va_tag == "Audio Vision":
+            for k, v in video_audio.items():
+                video_audio[k] = v.to(handler.model.device, dtype=dtype)
+        else:
+            video_audio = video_audio.to(handler.model.device, dtype=dtype)
+        data.append((video_audio, '<video>'))
+    elif audio is not None:
+        data.append((processor['audio'](audio).to(handler.model.device, dtype=dtype), '<audio>'))
     elif image is None and video is None:
         data.append((None, '<text>'))
     else:
@@ -122,6 +129,8 @@ def generate(image, video, message, chatbot, textbox_in, temperature, top_p, max
         show_images += f'<img src="./file={image}" style="display: inline-block;width: 250px;max-height: 400px;">'
     if video is not None:
         show_images += f'<video controls playsinline width="500" style="display: inline-block;" src="./file={video}"></video>'
+    if audio is not None:
+        show_images += f'<audio controls style="display: inline-block;" src="./file={audio}"></audio>'
 
     one_turn_chat = [textbox_in, None]
 
@@ -130,35 +139,50 @@ def generate(image, video, message, chatbot, textbox_in, temperature, top_p, max
         one_turn_chat[0] += "\n" + show_images
     # 2. not first run case
     else:
-        # scanning the last image or video
-        length = len(chatbot)
-        for i in range(length - 1, -1, -1):
-            previous_image = re.findall(r'<img src="./file=(.+?)"', chatbot[i][0])
-            previous_video = re.findall(r'<video controls playsinline width="500" style="display: inline-block;" src="./file=(.+?)"', chatbot[i][0])
-
-            if len(previous_image) > 0:
-                previous_image = previous_image[-1]
-                # 2.1 new image append or pure text input will start a new conversation
-                if (video is not None) or (image is not None and os.path.basename(previous_image) != os.path.basename(image)):
-                    message.clear()
-                    one_turn_chat[0] += "\n" + show_images
-                break
-            elif len(previous_video) > 0:
-                previous_video = previous_video[-1]
-                # 2.2 new video append or pure text input will start a new conversation
-                if image is not None or (video is not None and os.path.basename(previous_video) != os.path.basename(video)):
-                    message.clear()
-                    one_turn_chat[0] += "\n" + show_images
-                break
+        previous_image = re.findall(r'<img src="./file=(.+?)"', chatbot[0][0])
+        previous_video = re.findall(r'<video controls playsinline width="500" style="display: inline-block;" src="./file=(.+?)"', chatbot[0][0])
+        previous_audio = re.findall(r'<audio controls style="display: inline-block;" src="./file=(.+?)"', chatbot[0][0])
+        if len(previous_image) > 0:
+            previous_image = previous_image[0]
+            # 2.1 a new image or pure text input starts a new conversation
+            if image is not None and os.path.basename(previous_image) != os.path.basename(image):
+                message.clear()
+                one_turn_chat[0] += "\n" + show_images
+        elif len(previous_video) > 0:
+            previous_video = previous_video[0]
+            # 2.2 a new video or pure text input starts a new conversation
+            if video is not None and os.path.basename(previous_video) != os.path.basename(video):
+                message.clear()
+                one_turn_chat[0] += "\n" + show_images
+        elif len(previous_audio) > 0:
+            previous_audio = previous_audio[0]
+            # 2.3 a new audio clip or pure text input starts a new conversation
+            if audio is not None and os.path.basename(previous_audio) != os.path.basename(audio):
+                message.clear()
+                one_turn_chat[0] += "\n" + show_images
 
     message.append({'role': 'user', 'content': textbox_in})
+
+    if va_tag == "Vision Only":
+        audio_tower = handler.model.model.audio_tower
+        handler.model.model.audio_tower = None
+    elif va_tag == "Audio Only":
+        vision_tower = handler.model.model.vision_tower
+        handler.model.model.vision_tower = None
+
     text_en_out = handler.generate(data, message, temperature=temperature, top_p=top_p, max_output_tokens=max_output_tokens)
+
+    if va_tag == "Vision Only":
+        handler.model.model.audio_tower = audio_tower
+    elif va_tag == "Audio Only":
+        handler.model.model.vision_tower = vision_tower
+
     message.append({'role': 'assistant', 'content': text_en_out})
 
     one_turn_chat[1] = text_en_out
     chatbot.append(one_turn_chat)
 
-    return gr.update(value=image, interactive=True), gr.update(value=video, interactive=True), message, chatbot
+    return gr.update(value=image, interactive=True), gr.update(value=video, interactive=True), gr.update(value=audio, interactive=True), message, chatbot
 
 
 def regenerate(message, chatbot):
@@ -170,6 +194,7 @@ def regenerate(message, chatbot):
 def clear_history(message, chatbot):
     message.clear(), chatbot.clear()
     return (gr.update(value=None, interactive=True),
+            gr.update(value=None, interactive=True),
             gr.update(value=None, interactive=True),
             message, chatbot,
             gr.update(value=None, interactive=True))
@@ -180,9 +205,9 @@ def clear_history(message, chatbot):
 # 2. Operations or tensors that require CUDA must stay inside functions wrapped with spaces.GPU
 # 3. The function can't return tensors or other CUDA objects.
 
-model_path = 'DAMO-NLP-SG/VideoLLaMA2.1-7B-16F'
+model_path = 'DAMO-NLP-SG/VideoLLaMA2.1-7B-AV'
 
-handler = Chat(model_path, load_8bit=False, load_4bit=True)
+handler = Chat(model_path, load_8bit=False, load_4bit=False)
 
 textbox = gr.Textbox(show_label=False, placeholder="Enter text and press ENTER", container=False)
 
@@ -194,6 +219,7 @@ theme.set(block_label_text_color="#9C276A")
 theme.set(button_primary_text_color="#9C276A")
 # theme.set(button_secondary_text_color="*neutral_800")
 
+
 with gr.Blocks(title='VideoLLaMA 2 🔥🚀🔥', theme=theme, css=block_css) as demo:
     gr.Markdown(title_markdown)
     message = gr.State([])
@@ -202,6 +228,7 @@ with gr.Blocks(title='VideoLLaMA 2 🔥🚀🔥', theme=theme, css=block_css) as
     with gr.Column(scale=3):
         image = gr.Image(label="Input Image", type="filepath")
         video = gr.Video(label="Input Video")
+        audio = gr.Audio(label="Input Audio", type="filepath")
 
         with gr.Accordion("Parameters", open=True) as parameter_row:
             # num_beams = gr.Slider(
@@ -213,6 +240,8 @@ with gr.Blocks(title='VideoLLaMA 2 🔥🚀🔥', theme=theme, css=block_css) as
             #     label="beam search numbers",
             # )
 
+            va_tag = gr.Radio(choices=["Audio Vision", "Vision Only", "Audio Only"], value="Audio Vision", label="Select one")
+
             temperature = gr.Slider(
                 minimum=0.1,
                 maximum=1.0,
@@ -256,8 +285,9 @@ with gr.Blocks(title='VideoLLaMA 2 🔥🚀🔥', theme=theme, css=block_css) as
         clear_btn = gr.Button(value="🗑️ Clear history", interactive=True)
 
     with gr.Row():
+        cur_dir = os.path.dirname(os.path.abspath(__file__))
+
         with gr.Column():
-            cur_dir = os.path.dirname(os.path.abspath(__file__))
             gr.Examples(
                 examples=[
                     [
@@ -268,51 +298,73 @@ with gr.Blocks(title='VideoLLaMA 2 🔥🚀🔥', theme=theme, css=block_css) as
                         f"{cur_dir}/examples/waterview.jpg",
                         "What are the things I should be cautious about when I visit here?",
                     ],
-                    [
-                        f"{cur_dir}/examples/desert.jpg",
-                        "If there are factual errors in the questions, point it out; if not, proceed answering the question. What's happening in the desert?",
-                    ],
                 ],
                 inputs=[image, textbox],
             )
+
         with gr.Column():
             gr.Examples(
                 examples=[
                     [
-                        f"{cur_dir}/examples/rap.mp4",
-                        "What happens in this video?",
+                        f"{cur_dir}/examples/WBS4I.mp4",
+                        "Please describe the video:",
+                    ],
+                    [
+                        f"{cur_dir}/examples/sample_demo_1.mp4",
+                        "Please describe the video:",
                     ],
+                ],
+                inputs=[video, textbox],
+            )
+        with gr.Column():
+            gr.Examples(
+                examples=[
                     [
-                        f"{cur_dir}/examples/demo2.mp4",
-                        "Do you think it's morning or night in this video? Why?",
+                        f"{cur_dir}/examples/00000368.mp4",
+                        "Where is the loudest instrument?",
                     ],
                     [
-                        f"{cur_dir}/examples/demo3.mp4",
-                        "At the intersection, in which direction does the red car turn?",
+                        f"{cur_dir}/examples/00003491.mp4",
+                        "Is the instrument on the left louder than the instrument on the right?",
                     ],
                 ],
                 inputs=[video, textbox],
             )
+        with gr.Column():
+            # audio
+            gr.Examples(
+                examples=[
+                    [
+                        f"{cur_dir}/examples/Y--ZHUMfueO0.flac",
+                        "Please describe the audio:",
+                    ],
+                    [
+                        f"{cur_dir}/examples/Traffic and pedestrians.wav",
+                        "Please describe the audio:",
+                    ],
+                ],
+                inputs=[audio, textbox],
+            )
 
     gr.Markdown(tos_markdown)
     gr.Markdown(learn_more_markdown)
 
     submit_btn.click(
         generate,
-        [image, video, message, chatbot, textbox, temperature, top_p, max_output_tokens],
-        [image, video, message, chatbot])
+        [image, video, audio, message, chatbot, va_tag, textbox, temperature, top_p, max_output_tokens],
+        [image, video, audio, message, chatbot])
 
     regenerate_btn.click(
         regenerate,
         [message, chatbot],
         [message, chatbot]).then(
         generate,
-        [image, video, message, chatbot, textbox, temperature, top_p, max_output_tokens],
-        [image, video, message, chatbot])
+        [image, video, audio, message, chatbot, va_tag, textbox, temperature, top_p, max_output_tokens],
+        [image, video, audio, message, chatbot])
 
     clear_btn.click(
         clear_history,
         [message, chatbot],
-        [image, video, message, chatbot, textbox])
+        [image, video, audio, message, chatbot, textbox])
 
 demo.launch()
 
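Note on the new video branch: processor['video'] is assumed to return a dict of tensors when audio is decoded alongside the frames (va=True, the "Audio Vision" mode) and a plain tensor otherwise, which is why the branch moves the two shapes to the GPU differently. A minimal sketch of that dispatch under the same assumption; move_to_device is a hypothetical helper, not part of this PR:

    import torch

    def move_to_device(sample, device, dtype=torch.float16):
        # Dict of tensors (video frames plus audio features) in "Audio Vision"
        # mode, a single tensor otherwise -- mirrors the branch in generate().
        if isinstance(sample, dict):
            return {k: v.to(device, dtype=dtype) for k, v in sample.items()}
        return sample.to(device, dtype=dtype)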
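The "Vision Only" / "Audio Only" paths disable a tower by assigning None to handler.model.model.audio_tower or vision_tower and put it back after generation; if handler.generate raises, the tower stays detached for later requests. A context-manager sketch that restores the attribute either way, using only the attribute names visible in the diff (tower_disabled is hypothetical):

    from contextlib import contextmanager

    @contextmanager
    def tower_disabled(model, name):
        # Temporarily null out 'audio_tower' or 'vision_tower', restoring
        # the module even when inference raises.
        saved = getattr(model, name)
        setattr(model, name, None)
        try:
            yield
        finally:
            setattr(model, name, saved)

Usage would be: with tower_disabled(handler.model.model, 'audio_tower'): run generation inside the block.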
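The rewritten new-conversation check recovers the file shown in the first turn by matching the exact HTML snippets that generate() itself writes into the chatbot, then compares basenames. A condensed sketch of that check, reusing the three patterns from the diff (media_changed is a hypothetical name):

    import os
    import re

    PATTERNS = {
        'image': r'<img src="./file=(.+?)"',
        'video': r'<video controls playsinline width="500" style="display: inline-block;" src="./file=(.+?)"',
        'audio': r'<audio controls style="display: inline-block;" src="./file=(.+?)"',
    }

    def media_changed(first_turn_html, kind, new_path):
        # True when the first turn displayed a `kind` file whose basename
        # differs from the newly supplied path, i.e. the history should reset.
        found = re.findall(PATTERNS[kind], first_turn_html)
        if not found or new_path is None:
            return False
        return os.path.basename(found[0]) != os.path.basename(new_path)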
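One wiring detail worth keeping in mind: each callback's return arity must match its outputs list. With audio added, generate now returns five updates for [image, video, audio, message, chatbot], and clear_history returns six values for [image, video, audio, message, chatbot, textbox], which is why one more gr.update(value=None, interactive=True) is added to its return tuple.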