Aliayub1995 committed on
Commit
87ce8f2
·
verified ·
1 Parent(s): 122bb7d

Upload 52 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +2 -0
  2. handler.py +82 -0
  3. videollama2/__init__.py +117 -0
  4. videollama2/constants.py +32 -0
  5. videollama2/conversation.py +507 -0
  6. videollama2/eval/eval_video_cap_msvc_correctness.py +259 -0
  7. videollama2/eval/eval_video_cap_msvc_detailedness.py +257 -0
  8. videollama2/eval/eval_video_mcqa_mvbench.py +64 -0
  9. videollama2/eval/eval_video_mcqa_videomme.py +277 -0
  10. videollama2/eval/eval_video_oqa_activitynet.py +210 -0
  11. videollama2/eval/eval_video_oqa_vcgpt_1_correctness.py +210 -0
  12. videollama2/eval/eval_video_oqa_vcgpt_2_detailed_orientation.py +210 -0
  13. videollama2/eval/eval_video_oqa_vcgpt_3_context.py +212 -0
  14. videollama2/eval/eval_video_oqa_vcgpt_4_temporal.py +206 -0
  15. videollama2/eval/eval_video_oqa_vcgpt_5_consistency.py +218 -0
  16. videollama2/eval/inference_video_cap_msvc.py +120 -0
  17. videollama2/eval/inference_video_mcqa_egoschema.py +153 -0
  18. videollama2/eval/inference_video_mcqa_mvbench.py +203 -0
  19. videollama2/eval/inference_video_mcqa_perception_test_mcqa.py +169 -0
  20. videollama2/eval/inference_video_mcqa_videomme.py +304 -0
  21. videollama2/eval/inference_video_oqa_activitynet.py +146 -0
  22. videollama2/eval/inference_video_oqa_vcgpt_consistency.py +150 -0
  23. videollama2/eval/inference_video_oqa_vcgpt_general.py +130 -0
  24. videollama2/mm_utils.py +357 -0
  25. videollama2/model/__init__.py +224 -0
  26. videollama2/model/encoder.py +188 -0
  27. videollama2/model/projector.py +250 -0
  28. videollama2/model/videollama2_arch.py +264 -0
  29. videollama2/model/videollama2_gemma2.py +176 -0
  30. videollama2/model/videollama2_llama.py +157 -0
  31. videollama2/model/videollama2_mistral.py +159 -0
  32. videollama2/model/videollama2_mixtral.py +154 -0
  33. videollama2/model/videollama2_phi3.py +159 -0
  34. videollama2/model/videollama2_qwen2.py +153 -0
  35. videollama2/serve/cli.py +139 -0
  36. videollama2/serve/controller.py +298 -0
  37. videollama2/serve/examples/1034346401.mp4 +3 -0
  38. videollama2/serve/examples/desert.jpg +0 -0
  39. videollama2/serve/examples/extreme_ironing.jpg +0 -0
  40. videollama2/serve/examples/sample_demo_1.mp4 +3 -0
  41. videollama2/serve/examples/sample_demo_3.mp4 +0 -0
  42. videollama2/serve/examples/sample_demo_9.mp4 +0 -0
  43. videollama2/serve/examples/waterview.jpg +0 -0
  44. videollama2/serve/gradio_web_server.py +499 -0
  45. videollama2/serve/gradio_web_server_adhoc.py +312 -0
  46. videollama2/serve/model_worker.py +397 -0
  47. videollama2/serve/register_worker.py +26 -0
  48. videollama2/serve/sglang_worker.py +244 -0
  49. videollama2/serve/test_message.py +62 -0
  50. videollama2/train.py +585 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ videollama2/serve/examples/1034346401.mp4 filter=lfs diff=lfs merge=lfs -text
37
+ videollama2/serve/examples/sample_demo_1.mp4 filter=lfs diff=lfs merge=lfs -text
handler.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List, Any
2
+ import sys
3
+ sys.path.append('./')
4
+ from videollama2 import model_init, mm_infer
5
+ from videollama2.utils import disable_torch_init
6
+
7
class EndpointHandler:
    """Inference-endpoint entry point that answers questions about an image or video.

    The model, tokenizer and per-modality preprocessors are loaded once in
    ``__init__`` and reused for every request handled by ``__call__``.
    """

    def __init__(self, path: str = ""):
        """
        Initialize the handler by loading the model and any other necessary components.

        Args:
            path (str): The path to the model or other necessary files.
                NOTE(review): currently unused; the checkpoint id below is
                always loaded instead. Confirm this is intentional.
        """
        disable_torch_init()
        self.model_path = 'DAMO-NLP-SG/VideoLLaMA2-7B'
        self.model, self.processor, self.tokenizer = model_init(self.model_path)

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Handle inference requests.

        Args:
            data (Dict[str, Any]): The input data for inference. The fields may be
                given at the top level or nested under an ``"inputs"`` dict.
                Expected keys:
                - 'modal' (str): 'video' or 'image' (defaults to 'video')
                - 'modal_path' (str): Path to the video or image file
                - 'instruct' (str): The instruction/query to process

        Returns:
            List[Dict[str, Any]]: A one-element list ``[{"output": <model answer>}]``.

        Raises:
            ValueError: If 'modal_path' or 'instruct' is missing/empty, or if
                'modal' has no registered preprocessor.
        """
        # Accept both the flat payload and the {"inputs": {...}} envelope.
        payload = data.get("inputs", data)
        if not isinstance(payload, dict):
            payload = data

        modal = payload.get("modal", "video")
        modal_path = payload.get("modal_path", "")
        instruct = payload.get("instruct", "")

        if not modal_path or not instruct:
            raise ValueError("Both 'modal_path' and 'instruct' must be provided in the input data.")

        # Fail with a readable message instead of a bare KeyError from the dict lookup.
        if modal not in self.processor:
            supported = ", ".join(sorted(self.processor))
            raise ValueError(f"Unsupported modal: {modal!r}. Supported values: {supported}")

        # Perform inference
        output = mm_infer(
            self.processor[modal](modal_path),
            instruct,
            model=self.model,
            tokenizer=self.tokenizer,
            do_sample=False,
            modal=modal
        )

        return [{"output": output}]
50
+
51
+
52
+ # from transformers import pipeline
53
+
54
+ # class EndpointHandler:
55
+ # def __init__(self, path: str = ""):
56
+ # """
57
+ # Initialize the handler by setting up the environment and loading the model.
58
+ # """
59
+ # # Use a pipeline as a high-level helper to download and load the model
60
+ # self.pipe = pipeline("visual-question-answering", model="DAMO-NLP-SG/VideoLLaMA2-8x7B")
61
+ # print("Model downloaded and pipeline created successfully.")
62
+
63
+ # def __call__(self, data):
64
+ # """
65
+ # Handle inference requests.
66
+
67
+ # Args:
68
+ # data (dict): Input data containing 'image' and 'question'.
69
+
70
+ # Returns:
71
+ # dict: The output from the model.
72
+ # """
73
+ # image = data.get("image")
74
+ # question = data.get("question")
75
+
76
+ # if not image or not question:
77
+ # raise ValueError("Both 'image' and 'question' must be provided in the input data.")
78
+
79
+ # # Use the pipeline to perform visual question answering
80
+ # output = self.pipe(image=image, question=question)
81
+
82
+ # return output
videollama2/__init__.py ADDED
@@ -0,0 +1,117 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import copy
3
+ import warnings
4
+ import shutil
5
+ from functools import partial
6
+
7
+ import torch
8
+ import logging
9
+ from .model import load_pretrained_model
10
+ from .mm_utils import process_image, process_video, tokenizer_multimodal_token, get_model_name_from_path, KeywordsStoppingCriteria
11
+ from .constants import NUM_FRAMES, DEFAULT_IMAGE_TOKEN, DEFAULT_VIDEO_TOKEN, MODAL_INDEX_MAP
12
+
13
+
14
+ def model_init(model_path=None, **kwargs):
15
+ logging.info(f"Loading Model from {model_path}")
16
+ model_path = "DAMO-NLP-SG/VideoLLaMA2-7B" if model_path is None else model_path
17
+ logging.info(f"Model Path: {model_path}")
18
+ model_name = get_model_name_from_path(model_path)
19
+ logging.info(f"Model Name: {model_name}")
20
+ tokenizer, model, processor, context_len = load_pretrained_model(model_path, None, model_name, **kwargs)
21
+ logging.info(f"Model Loaded Successfully")
22
+ if tokenizer.pad_token is None and tokenizer.unk_token is not None:
23
+ tokenizer.pad_token = tokenizer.unk_token
24
+
25
+ num_frames = model.config.num_frames if hasattr(model.config, "num_frames") else NUM_FRAMES
26
+
27
+ processor = {
28
+ 'image': partial(process_image, processor=processor, aspect_ratio=None),
29
+ 'video': partial(process_video, processor=processor, aspect_ratio=None, num_frames=num_frames),
30
+ }
31
+
32
+ return model, processor, tokenizer
33
+
34
+
35
def mm_infer(image_or_video, instruct, model, tokenizer, modal='video', **kwargs):
    """Inference API of VideoLLaMA2 for image, video or text-only understanding.

    Args:
        image_or_video (torch.Tensor): image tensor (1, C, H, W) / video tensor (T, C, H, W).
            Ignored when ``modal == 'text'``.
        instruct (str | list): text instruction, or a list of chat messages
            (dicts with 'role' and 'content'); the modality token is prepended
            to the first message.
        model: VideoLLaMA2 model.
        tokenizer: tokenizer.
        modal (str): inference modality: 'image', 'video' or 'text'.
        **kwargs: optional generation settings:
            do_sample (bool, default False), temperature (default 0.2 when
            sampling, else 0.0), top_p (default 0.9), max_new_tokens (default 2048).

    Returns:
        str: response of the model.

    Raises:
        ValueError: if ``modal`` is unsupported or ``instruct`` is neither str nor list.
    """

    # 1. pick the placeholder token for the requested modality.
    if modal == 'image':
        modal_token = DEFAULT_IMAGE_TOKEN
    elif modal == 'video':
        modal_token = DEFAULT_VIDEO_TOKEN
    elif modal == 'text':
        modal_token = ''
    else:
        raise ValueError(f"Unsupported modal: {modal}")

    # 2. vision preprocess (move the already-transformed tensor to GPU in half precision).
    if modal == 'text':
        tensor = None
    else:
        tensor = image_or_video.half().cuda()
        tensor = [(tensor, modal)]

    # 3. text preprocess (tag process & generate prompt).
    if isinstance(instruct, str):
        message = [{'role': 'user', 'content': modal_token + '\n' + instruct}]
    elif isinstance(instruct, list):
        # Work on a copy so the caller's message list is not modified.
        message = copy.deepcopy(instruct)
        message[0]['content'] = modal_token + '\n' + message[0]['content']
    else:
        raise ValueError(f"Unsupported type of instruct: {type(instruct)}")

    if model.config.model_type in ['videollama2', 'videollama2_mistral', 'videollama2_mixtral']:
        system_message = [
            {'role': 'system', 'content': (
            """<<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature."""
            """\n"""
            """If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.\n<</SYS>>""")
            }
        ]
    else:
        system_message = []

    message = system_message + message
    prompt = tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=True)

    input_ids = tokenizer_multimodal_token(prompt, tokenizer, modal_token, return_tensors='pt').unsqueeze(0).long().cuda()

    # A tokenizer without a pad token would make `ne(None)` fail; the single
    # prompt is never padded here, so every position is attended in that case.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        attention_masks = torch.ones_like(input_ids)
    else:
        attention_masks = input_ids.ne(pad_token_id).long().cuda()

    # 4. generate response according to visual signals and prompts.
    keywords = [tokenizer.eos_token]
    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)

    do_sample = kwargs.get('do_sample', False)
    temperature = kwargs.get('temperature', 0.2 if do_sample else 0.0)
    top_p = kwargs.get('top_p', 0.9)
    max_new_tokens = kwargs.get('max_new_tokens', 2048)

    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            attention_mask=attention_masks,
            images=tensor,
            do_sample=do_sample,
            temperature=temperature,
            max_new_tokens=max_new_tokens,
            top_p=top_p,
            use_cache=True,
            stopping_criteria=[stopping_criteria],
            pad_token_id=tokenizer.eos_token_id,
        )

    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

    return outputs
videollama2/constants.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Serving constants: heartbeat expiration / interval values used by the
# controller and workers. NOTE(review): presumably seconds; confirm in the serve code.
CONTROLLER_HEART_BEAT_EXPIRATION = 30
WORKER_HEART_BEAT_INTERVAL = 15

# Directory used for log output.
LOGDIR = "."

# Model Constants
IGNORE_INDEX = -100

# Image arguments
IMAGE_TOKEN_INDEX = -200
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
IMAGE_PLACEHOLDER = "<image-placeholder>"

# Video arguments
VIDEO_TOKEN_INDEX = -201
DEFAULT_VIDEO_TOKEN = "<video>"
NUM_FRAMES = 8
MAX_FRAMES = 32
NUM_FRAMES_PER_SECOND = 1

# Audio arguments
AUDIO_TOKEN_INDEX = -202
DEFAULT_AUDIO_TOKEN = "<audio>"

# Placeholder token -> sentinel index. Built from the constants above so the
# mapping cannot drift from the individual *_TOKEN / *_INDEX definitions.
MODAL_INDEX_MAP = {
    DEFAULT_IMAGE_TOKEN: IMAGE_TOKEN_INDEX,
    DEFAULT_VIDEO_TOKEN: VIDEO_TOKEN_INDEX,
    DEFAULT_AUDIO_TOKEN: AUDIO_TOKEN_INDEX,
}
videollama2/conversation.py ADDED
@@ -0,0 +1,507 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import dataclasses
3
+ from io import BytesIO
4
+ from enum import auto, Enum
5
+ from typing import List, Tuple
6
+
7
+ from PIL import Image
8
+ from .constants import LOGDIR, NUM_FRAMES
9
+
10
+
11
class SeparatorStyle(Enum):
    """Different separator style."""
    SINGLE = auto()
    TWO = auto()
    PLAIN = auto()
    LLAMA2 = auto()
    QWEN = auto()


@dataclasses.dataclass
class Conversation:
    """A class that keeps all conversation history.

    ``messages`` holds ``[role, message]`` pairs. A message is either a string,
    ``None`` (an empty slot to be generated), or a 3-tuple
    ``(text, image_or_video, image_process_mode)``. The first ``offset``
    messages are skipped by the media/chatbot helpers.
    """
    system: str
    roles: List[str]
    messages: List[List[str]]
    offset: int
    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
    sep: str = "###"
    sep2: str = None
    version: str = "Unknown"

    skip_next: bool = False
    modality: str = "image"

    @staticmethod
    def _text_of(message):
        """Return the text part of a message that may be a (text, media, mode) tuple."""
        if isinstance(message, tuple):
            message, _, _ = message
        return message

    def _has_media(self):
        """Return True if any even-indexed message after ``offset`` carries media."""
        return any(
            i % 2 == 0 and isinstance(msg, tuple)
            for i, (_, msg) in enumerate(self.messages[self.offset:])
        )

    def get_prompt(self):
        """Flatten the conversation into one prompt string according to ``sep_style``.

        Raises:
            ValueError: if ``sep_style`` is not a known ``SeparatorStyle``.
        """
        messages = self.messages
        modality_token = f"<{self.modality}>"
        if len(messages) > 0 and isinstance(messages[0][1], tuple):
            # Work on a shallow copy so the stored history keeps its media tuple.
            messages = list(self.messages)
            init_role, init_msg = messages[0]
            init_msg = init_msg[0].replace(modality_token, "").strip()
            if 'mmtag' in self.version:
                messages[0] = (init_role, init_msg)
                messages.insert(0, (self.roles[0], "<Image><image></Image>"))
                messages.insert(1, (self.roles[1], "Received."))
            else:
                # Normalise so the modality token always leads the first message.
                messages[0] = (init_role, f"{modality_token}\n" + init_msg)

        if self.sep_style == SeparatorStyle.SINGLE:
            ret = self.system + self.sep
            for role, message in messages:
                if message:
                    ret += role + ": " + self._text_of(message) + self.sep
                else:
                    ret += role + ":"
        elif self.sep_style == SeparatorStyle.TWO:
            seps = [self.sep, self.sep2]
            ret = self.system + seps[0]
            for i, (role, message) in enumerate(messages):
                if message:
                    ret += role + ": " + self._text_of(message) + seps[i % 2]
                else:
                    ret += role + ":"
        elif self.sep_style == SeparatorStyle.LLAMA2:
            wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n"
            wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
            ret = ""

            for i, (role, message) in enumerate(messages):
                if i == 0:
                    assert message, "first message should not be none"
                    assert role == self.roles[0], "first message should come from user"
                if message:
                    message = self._text_of(message)
                    if i == 0:
                        message = wrap_sys(self.system) + message
                    if i % 2 == 0:
                        message = wrap_inst(message)
                        ret += self.sep + message
                    else:
                        ret += " " + message + " " + self.sep2
                else:
                    ret += ""
            # Drop the single leading separator. str.lstrip() would treat `sep`
            # as a set of characters rather than a prefix, so slice explicitly.
            if self.sep and ret.startswith(self.sep):
                ret = ret[len(self.sep):]
        elif self.sep_style == SeparatorStyle.QWEN:
            ret = ""
            # 1. Add system prompt
            ret += self.system + self.sep + "\n"
            # 2. Iterate message
            for i, (role, message) in enumerate(messages):
                if i == 0:
                    assert message, "first message should not be none"
                    assert role == self.roles[0], "first message should come from user"
                if message:
                    # 2.1 Add role and message
                    ret += role + self._text_of(message) + self.sep + "\n"
                else:
                    # 2.2 Add generation prompt
                    ret += role
        elif self.sep_style == SeparatorStyle.PLAIN:
            seps = [self.sep, self.sep2]
            ret = self.system
            for i, (role, message) in enumerate(messages):
                if message:
                    ret += role + self._text_of(message) + seps[i % 2]
                else:
                    ret += role
        else:
            raise ValueError(f"Invalid style: {self.sep_style}")

        return ret

    def append_message(self, role, message):
        """Append a ``[role, message]`` pair to the history."""
        self.messages.append([role, message])

    def process_image(self, image, image_process_mode, return_pil=False, image_format='PNG', max_len=800, min_len=400):
        """Apply ``image_process_mode`` to a PIL image and optionally downscale it.

        Args:
            image: PIL image to process.
            image_process_mode: "Pad" (pad to a square), "Default"/"Crop"
                (leave unchanged) or "Resize" (force 336x336).
            return_pil: return the PIL image instead of a base64 string.
            image_format: encoding used for the base64 output.
            max_len: images whose longest edge exceeds this are downscaled.
            min_len: upper bound for the shortest edge after downscaling.

        Returns:
            The processed PIL image, or its base64-encoded bytes as ``str``.

        Raises:
            ValueError: if ``image_process_mode`` is not one of the modes above.
        """
        if image_process_mode == "Pad":
            def expand2square(pil_img, background_color=(122, 116, 104)):
                # Centre the image on a square canvas filled with the background colour.
                width, height = pil_img.size
                if width == height:
                    return pil_img
                elif width > height:
                    result = Image.new(pil_img.mode, (width, width), background_color)
                    result.paste(pil_img, (0, (width - height) // 2))
                    return result
                else:
                    result = Image.new(pil_img.mode, (height, height), background_color)
                    result.paste(pil_img, ((height - width) // 2, 0))
                    return result
            image = expand2square(image)
        elif image_process_mode in ["Default", "Crop"]:
            pass
        elif image_process_mode == "Resize":
            image = image.resize((336, 336))
        else:
            raise ValueError(f"Invalid image_process_mode: {image_process_mode}")

        if max(image.size) > max_len:
            # Shrink while keeping the aspect ratio.
            max_hw, min_hw = max(image.size), min(image.size)
            aspect_ratio = max_hw / min_hw
            shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
            longest_edge = int(shortest_edge * aspect_ratio)
            W, H = image.size
            if H > W:
                H, W = longest_edge, shortest_edge
            else:
                H, W = shortest_edge, longest_edge
            image = image.resize((W, H))

        if return_pil:
            return image

        buffered = BytesIO()
        image.save(buffered, format=image_format)
        img_b64_str = base64.b64encode(buffered.getvalue()).decode()
        return img_b64_str

    def get_videos(self, return_pil=False):
        """Collect the videos attached to even-indexed messages after ``offset``.

        Returns:
            list: the stored video file paths when ``return_pil`` is False;
            otherwise ``NUM_FRAMES`` evenly spaced frames per video, each run
            through ``process_image``.
        """
        video_frames = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 != 0 or not isinstance(msg, tuple):
                continue
            # here video is the file path of input video
            _, video, image_process_mode = msg
            if not return_pil:
                # return filepath
                video_frames.append(video)
                continue

            # Imported lazily so that listing file paths does not require decord/numpy.
            from decord import VideoReader, cpu
            import numpy as np

            # read video using decord.VideoReader
            decord_vr = VideoReader(uri=video, ctx=cpu(0))
            duration = len(decord_vr)
            frame_id_list = np.linspace(0, duration-1, NUM_FRAMES, dtype=int)
            # convert the extracted image frames into PIL objects
            all_images = [Image.fromarray(f) for f in decord_vr.get_batch(frame_id_list).asnumpy()]
            video_frames.extend([self.process_image(image, image_process_mode, return_pil=return_pil) for image in all_images])
        return video_frames

    def get_images(self, return_pil=False):
        """Collect the images attached to even-indexed messages after ``offset``.

        Each image is run through ``process_image`` with the mode stored in its message.
        """
        images = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0 and isinstance(msg, tuple):
                _, image, image_process_mode = msg
                images.append(self.process_image(image, image_process_mode, return_pil=return_pil))
        return images

    def to_gradio_chatbot(self):
        """Convert the history after ``offset`` into ``[user, assistant]`` pairs.

        Media attached to an even-indexed message is rendered as an inline
        ``<img>`` (PIL image) or ``<video>`` (anything else, treated as a file
        path) HTML tag placed before the text.
        """
        ret = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                if isinstance(msg, tuple):
                    # display image/video in the textbox
                    msg, image_or_video, image_process_mode = msg
                    if isinstance(image_or_video, Image.Image):
                        # image is PIL object
                        img_b64_str = self.process_image(image_or_video, "Default", return_pil=False, image_format='JPEG')
                        img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
                        msg = img_str + msg.replace('<image>', '').strip()
                    else:
                        # video is file path
                        vid_str = f'<video controls playsinline width="500" style="display: inline-block;" src="./file={image_or_video}"></video><br>'
                        msg = vid_str + msg.replace('<video>', '').strip()
                ret.append([msg, None])
            else:
                # Odd-indexed messages fill the reply slot of the preceding pair.
                ret[-1][-1] = msg
        return ret

    def copy(self):
        """Return a copy with a fresh message list (message objects are shared).

        ``skip_next`` is not carried over and resets to its default.
        """
        return Conversation(
            system=self.system,
            roles=self.roles,
            messages=[[x, y] for x, y in self.messages],
            offset=self.offset,
            sep_style=self.sep_style,
            sep=self.sep,
            sep2=self.sep2,
            version=self.version,
            # Previously dropped, which silently turned video conversations into image ones.
            modality=self.modality)

    def dict(self):
        """Return a plain-dict snapshot of the conversation.

        When the conversation carries media for its image/video modality, tuple
        messages are reduced to their text part and ``modality`` is included.
        """
        # Checking for media directly avoids encoding every image (or importing
        # the video reader) just to learn whether any attachment exists.
        if self.modality in ("image", "video") and self._has_media():
            return {
                "system": self.system,
                "roles": self.roles,
                "messages": [[x, y[0] if isinstance(y, tuple) else y] for x, y in self.messages],
                "offset": self.offset,
                "sep": self.sep,
                "sep2": self.sep2,
                "modality": self.modality
            }
        return {
            "system": self.system,
            "roles": self.roles,
            "messages": self.messages,
            "offset": self.offset,
            "sep": self.sep,
            "sep2": self.sep2,
        }
316
+
317
+
318
# Vicuna v0 template, seeded with a two-message example exchange.
# offset=2 matches the number of seed messages below.
conv_vicuna_v0 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("Human", "Assistant"),
    messages=(
        ("Human", "What are the key differences between renewable and non-renewable energy sources?"),
        ("Assistant",
            "Renewable energy sources are those that can be replenished naturally in a relatively "
            "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
            "Non-renewable energy sources, on the other hand, are finite and will eventually be "
            "depleted, such as coal, oil, and natural gas. Here are some key differences between "
            "renewable and non-renewable energy sources:\n"
            "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
            "energy sources are finite and will eventually run out.\n"
            "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
            "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
            "and other negative effects.\n"
            "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
            "have lower operational costs than non-renewable sources.\n"
            "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
            "locations than non-renewable sources.\n"
            "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
            "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
            "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
            "non-renewable sources are not, and their depletion can lead to economic and social instability.\n")
    ),
    offset=2,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

# Plain template: empty system prompt and empty role names.
conv_llava_plain = Conversation(
    system="",
    roles=("", ""),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.PLAIN,
    sep="",
    sep2="\n"
)

# v0 template whose version contains "mmtag".
# NOTE(review): the system string has no space between "natural language." and
# "The visual content"; left untouched because it is part of the emitted prompt.
conv_llava_v0_mmtag = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
           "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
           "The visual content will be provided with the following format: <Image>visual content</Image>.",
    roles=("Human", "Assistant"),
    messages=(
    ),
    offset=0,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
    version="v0_mmtag",
)

# Same system prompt and separators as conv_vicuna_v0, but without seed messages.
conv_llava_v0 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("Human", "Assistant"),
    messages=(
    ),
    offset=0,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

# Vicuna v1 template: two separators (" " and "</s>") and USER/ASSISTANT roles.
conv_vicuna_v1 = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the user's questions.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)

# v1 template whose version contains "mmtag" (same missing-space caveat as the v0 variant).
conv_llava_v1_mmtag = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
           "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
           "The visual content will be provided with the following format: <Image>visual content</Image>.",
    roles=("USER", "ASSISTANT"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
    version="v1_mmtag",
)

# Like conv_vicuna_v1, but the system prompt says "human" instead of "user".
conv_llava_v1 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)

# LLAMA2-style template with a vision-assistant system prompt.
conv_llava_llama2 = Conversation(
    system="You are a helpful language and vision assistant. "
           "You are able to understand the visual content that the user provides, "
           "and assist the user with a variety of tasks using natural language.",
    roles=("USER", "ASSISTANT"),
    version="llama2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA2,
    sep="<s>",
    sep2="</s>",
)

# LLAMA2-style template with a safety-oriented system prompt.
conv_llama2 = Conversation(
    system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
    roles=("USER", "ASSISTANT"),
    version="llama2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA2,
    sep="<s>",
    sep2="</s>",
)

# Mistral template: LLAMA2 separator style with an empty leading separator.
conv_mistral = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the user's questions.",
    roles=("USER", "ASSISTANT"),
    version="llama2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA2,
    sep="",
    sep2="</s>",
)

# Qwen template: the role strings already contain the <|im_start|> markers.
conv_qwen = Conversation(
    system="<|im_start|>system\nYou are a helpful assistant.",
    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.QWEN,
    sep="<|im_end|>",
    version="qwen",
)

# Qwen roles with the PLAIN separator style and no system prompt.
conv_qwen_plain = Conversation(
    system="",
    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.PLAIN,
    sep="<|im_end|>",
    sep2="<|im_end|>",
    version="qwen_plain",
)

# NOTE(review): `default_conversation` is the mistral template, while the
# "default" key in `conv_templates` maps to conv_vicuna_v0. Confirm the
# difference is intentional.
default_conversation = conv_mistral
# Registry of templates by name.
conv_templates = {
    "default": conv_vicuna_v0,
    # pretrain template
    "plain": conv_llava_plain,
    # llava v0
    "v0": conv_vicuna_v0,
    "v0_plain": conv_llava_plain,
    "v0_mmtag": conv_llava_v0_mmtag,
    "llava_v0": conv_llava_v0,
    # llava v1
    "v1": conv_vicuna_v1,
    "v1_mmtag": conv_llava_v1_mmtag,
    "llava_v1": conv_llava_v1,
    "vicuna_v1": conv_vicuna_v1,
    # llava v1.5
    "llava_llama2": conv_llava_llama2,
    # llama2
    "llama2": conv_llama2,
    # mistral
    "mistral": conv_mistral,
    # qwen
    "qwen": conv_qwen,
    "qwen_plain": conv_qwen_plain,
}


# Running the module directly prints the prompt produced by the default template.
if __name__ == "__main__":
    print(default_conversation.get_prompt())
videollama2/eval/eval_video_cap_msvc_correctness.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import os
3
+ import ast
4
+ import time
5
+ import json
6
+ import argparse
7
+ from tqdm import tqdm
8
+ from multiprocessing.pool import Pool
9
+
10
+ import openai
11
+ from openai import AzureOpenAI
12
+
13
+
14
def init():
    """Build the Azure OpenAI client used by this script.

    The endpoint and the API key are read from the ``AZURE_OPENAI_ENDPOINT``
    and ``AZURE_OPENAI_KEY`` environment variables; the API version is fixed.

    Returns:
        The constructed ``AzureOpenAI`` client.
    """
    endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    key = os.getenv("AZURE_OPENAI_KEY")
    return AzureOpenAI(
        azure_endpoint=endpoint,
        api_key=key,
        api_version="2024-02-15-preview",
    )
22
+
23
+
24
def interaction(client, message_text):
    """Send one chat-completion request and return the raw completion.

    Args:
        client: Object exposing ``chat.completions.create``.
        message_text: Chat messages forwarded as the ``messages`` argument.

    Returns:
        Whatever ``client.chat.completions.create`` returns.
    """
    # The deployment name comes from the AZURE_OPENAI_DEPLOYNAME environment
    # variable; the sampling parameters are fixed.
    request = {
        "model": os.getenv("AZURE_OPENAI_DEPLOYNAME"),
        "messages": message_text,
        "temperature": 0.7,
        "max_tokens": 800,
        "top_p": 0.95,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "stop": None,
    }
    return client.chat.completions.create(**request)
37
+
38
+
39
def annotate(prediction_set, caption_files, output_dir):
    """Score each prediction for factual accuracy and store one JSON per sample.

    For every entry of ``caption_files`` the matching question / answer /
    prediction triple is looked up in ``prediction_set``, sent to the chat
    model through the module-level ``client``, and the parsed reply is written
    to ``{output_dir}/{key}.json`` as ``[response_dict, qa_set]``.

    Args:
        prediction_set: Mapping from sample key to a dict with the keys
            ``'q'``, ``'a'`` and ``'pred'``.
        caption_files: File names of the form ``"<key>.json"``.
        output_dir: Directory receiving the per-sample result files.

    Returns:
        None. Failures while querying, parsing or writing are printed and the
        sample is skipped (no file is produced for it).
    """
    for file_name in tqdm(caption_files):
        key = file_name[:-5]  # drop the ".json" suffix
        qa_set = prediction_set[key]
        question = qa_set['q']
        answer = str(qa_set['a'])
        pred = qa_set['pred']
        try:
            system_prompt = (
                "You are an intelligent chatbot designed for evaluating the factual accuracy of generative outputs for video-based question-answer pairs. "
                "Your task is to compare the predicted answer with these correct answers and determine if they are factually consistent. Here's how you can accomplish the task:"
                "------"
                "##INSTRUCTIONS: "
                "- Focus on the factual consistency between the predicted answer and the correct answer. The predicted answer should not contain any misinterpretations or misinformation.\n"
                "- The predicted answer must be factually accurate and align with the video content.\n"
                "- Consider synonyms or paraphrases as valid matches.\n"
                "- Evaluate the factual accuracy of the prediction compared to the answer."
            )
            # NOTE(review): the example at the end of this prompt shows a float
            # and a doubled quote although an integer is requested; it is kept
            # verbatim so that scores stay comparable with earlier runs.
            user_prompt = (
                "Please evaluate the following video-based question-answer pair:\n\n"
                f"Question: {question}\n"
                f"Correct Answers: {answer}\n"
                f"Predicted Answer: {pred}\n\n"
                "Provide your evaluation only as a factual accuracy score where the factual accuracy score is an integer value between 0 and 5, with 5 indicating the highest level of factual consistency. "
                "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the factual accuracy score in INTEGER, not STRING."
                "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                "For example, your response should look like this: {''score': 4.8}."
            )
            message = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ]
            completion = interaction(client, message)

            # The reply is expected to be a Python dict literal.
            reply_text = completion.choices[0].message.content
            response_dict = ast.literal_eval(reply_text)

            # Persist the judgement together with the evaluated sample.
            with open(f"{output_dir}/{key}.json", "w") as out_file:
                json.dump([response_dict, qa_set], out_file)

        except Exception as e:
            print(f"Error processing file '{key}': {e}")

        # Pause between consecutive requests.
        time.sleep(1)
91
+
92
+
93
def longest_repeating_substring(s):
    """Return the longest substring of ``s`` that occurs twice without overlap.

    Dynamic programming over pairs of end positions ``(i, j)`` with ``i < j``:
    ``dp[i][j]`` is the length of the longest common suffix of ``s[:i]`` and
    ``s[:j]`` whose two occurrences do not overlap.

    The previous condition joined the two tests with ``or``, so a run was
    extended even when the characters differed (e.g. ``"abac"`` produced
    ``"ab"``). A run may only grow when the characters match AND the two
    occurrences stay disjoint.

    Args:
        s: The string to inspect.

    Returns:
        The first longest non-overlapping repeated substring, or ``""`` when
        no character repeats (including for an empty ``s``).
    """
    n = len(s)
    dp = [[0] * (n + 1) for _ in range(n + 1)]
    best_len = 0
    best_end = 0  # exclusive end index of the first occurrence

    for i in range(1, n + 1):
        for j in range(i + 1, n + 1):
            # dp[i-1][j-1] < j - i keeps the two occurrences from overlapping.
            if s[i - 1] == s[j - 1] and dp[i - 1][j - 1] < (j - i):
                dp[i][j] = dp[i - 1][j - 1] + 1
                if dp[i][j] > best_len:
                    best_len = dp[i][j]
                    best_end = i

    return s[best_end - best_len:best_end]
115
+
116
+
117
def main(args):
    """Score all predictions for correctness and write the combined report.

    Steps:
      1. Load the JSON-lines prediction file(s).
      2. Give every sample a unique key ``<video stem>_<occurrence index>``.
      3. Call ``annotate`` until every sample has a result file in
         ``args.output_dir``.
      4. Merge the result files, compute the average score and dump
         everything to ``args.output_json``.

    Args:
        args: Namespace providing ``pred_path``, ``num_chunks``,
            ``output_dir``, ``output_json`` and ``num_tasks``.
    """
    # --- 1. Load predictions (one JSON object per line) --------------------
    if args.num_chunks > 1:
        pred_files = [
            os.path.join(args.pred_path, f"{args.num_chunks}_{_idx}.json")
            for _idx in range(args.num_chunks)
        ]
    else:
        pred_files = [args.pred_path]

    pred_contents = []
    for pred_file in pred_files:
        with open(pred_file) as f:  # closed deterministically
            pred_contents.extend(json.loads(line) for line in f)

    # --- 2. Build unique sample keys ---------------------------------------
    video_id_counts = {}  # occurrences seen so far per original video name
    new_pred_contents = []
    for sample in pred_contents:
        video_id = sample["video_name"]
        video_id_counts[video_id] = video_id_counts.get(video_id, -1) + 1

        # The sample dict is renamed in place (no copy is made).
        stem = video_id.split('/')[-1].split('.')[0]
        sample["video_name"] = f"{stem}_{video_id_counts[video_id]}"
        new_pred_contents.append(sample)

    caption_files = [f"{x['video_name']}.json" for x in new_pred_contents]

    output_dir = args.output_dir
    os.makedirs(output_dir, exist_ok=True)

    prediction_set = {
        sample["video_name"]: {
            "q": sample["question"],
            "a": sample["answer"],
            "pred": sample["pred"],
        }
        for sample in new_pred_contents
    }

    # --- 3. Annotate until every sample has a result file ------------------
    # A non-positive task count would make the split below fail on every
    # iteration and spin forever, so clamp it.
    num_tasks = max(1, args.num_tasks)

    # NOTE(review): a sample whose annotation keeps failing is retried
    # indefinitely; this loop only ends once every result file exists.
    while True:
        try:
            completed_files = set(os.listdir(output_dir))
            print(f"completed_files: {len(completed_files)}")

            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            if not incomplete_files:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split the remaining work into parts; only the first part is
            # processed per iteration, the rest is picked up by later rounds.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [
                incomplete_files[i: i + part_len]
                for i in range(0, len(incomplete_files), part_len)
            ]
            print("Generate", len(all_parts), "subprocess.")
            annotate(prediction_set, all_parts[0], args.output_dir)

        except Exception as e:
            print(f"Error: {e}")

    # --- 4. Merge results and compute the average score --------------------
    combined_contents = {}
    json_path = args.output_json

    for file_name in os.listdir(output_dir):
        if not file_name.endswith(".json"):
            continue
        file_path = os.path.join(output_dir, file_name)
        with open(file_path, "r") as json_file:
            try:
                combined_contents[file_name[:-5]] = json.load(json_file)
            except Exception as e:
                print(f"Error: {e}")

    score_sum = 0
    count = 0
    for key, result in combined_contents.items():
        # Every merged sample counts towards the average, even when its
        # score cannot be parsed (it then contributes 0).
        count += 1
        try:
            # The score is the value of the first key of the response dict.
            for value in result[0].values():
                score_sum += int(value)
                break
        except Exception as e:
            # Report and continue; a debugger breakpoint here would hang
            # unattended runs.
            print(f"Error processing file '{key}': {e}")

    average_score = score_sum / count if count else 0.0
    combined_contents["average_score"] = average_score
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file, indent=4)
    print("Average score for correctness:", average_score)
238
+
239
+
240
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    # (flag, add_argument keyword options), registered in this order.
    cli_options = (
        ("--pred-path", dict(required=True, help="The path to file containing prediction.")),
        ("--output-dir", dict(required=True, help="The path to save annotation json files.")),
        ("--output-json", dict(required=True, help="The path to save annotation final combined json file.")),
        ("--num-tasks", dict(required=True, type=int, help="Number of splits.")),
        ("--num_chunks", dict(default=1, type=int, help="Result splits")),
        ("--api-key", dict(required=True, type=str, help="Azure Openai API key.")),
        ("--api-endpoint", dict(required=True, type=str, help="Azure Openai API endpoint.")),
        ("--api-deployname", dict(required=True, type=str, help="Azure Openai API deployname.")),
    )
    for flag, options in cli_options:
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    # Export the Azure OpenAI settings; init() and interaction() read them
    # back from the environment.
    os.environ.update({
        "AZURE_OPENAI_KEY": args.api_key,
        "AZURE_OPENAI_ENDPOINT": args.api_endpoint,
        "AZURE_OPENAI_DEPLOYNAME": args.api_deployname,
    })

    # Module-level client; annotate() picks it up as a global.
    client = init()

    main(args)
videollama2/eval/eval_video_cap_msvc_detailedness.py ADDED
@@ -0,0 +1,257 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import os
3
+ import ast
4
+ import time
5
+ import json
6
+ import argparse
7
+ from tqdm import tqdm
8
+ from multiprocessing.pool import Pool
9
+
10
+ import openai
11
+ from openai import AzureOpenAI
12
+
13
+
14
def init():
    """Build the Azure OpenAI client used by this script.

    The endpoint and the API key are read from the ``AZURE_OPENAI_ENDPOINT``
    and ``AZURE_OPENAI_KEY`` environment variables; the API version is fixed.

    Returns:
        The constructed ``AzureOpenAI`` client.
    """
    endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    key = os.getenv("AZURE_OPENAI_KEY")
    return AzureOpenAI(
        azure_endpoint=endpoint,
        api_key=key,
        api_version="2024-02-15-preview",
    )
22
+
23
+
24
def interaction(client, message_text):
    """Send one chat-completion request and return the raw completion.

    Args:
        client: Object exposing ``chat.completions.create``.
        message_text: Chat messages forwarded as the ``messages`` argument.

    Returns:
        Whatever ``client.chat.completions.create`` returns.
    """
    # The deployment name comes from the AZURE_OPENAI_DEPLOYNAME environment
    # variable; the sampling parameters are fixed.
    request = {
        "model": os.getenv("AZURE_OPENAI_DEPLOYNAME"),
        "messages": message_text,
        "temperature": 0.7,
        "max_tokens": 800,
        "top_p": 0.95,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "stop": None,
    }
    return client.chat.completions.create(**request)
37
+
38
+
39
def annotate(prediction_set, caption_files, output_dir):
    """Score each prediction for detail orientation and store one JSON per sample.

    For every entry of ``caption_files`` the matching question / answer /
    prediction triple is looked up in ``prediction_set``, sent to the chat
    model through the module-level ``client``, and the parsed reply is written
    to ``{output_dir}/{key}.json`` as ``[response_dict, qa_set]``.

    Args:
        prediction_set: Mapping from sample key to a dict with the keys
            ``'q'``, ``'a'`` and ``'pred'``.
        caption_files: File names of the form ``"<key>.json"``.
        output_dir: Directory receiving the per-sample result files.

    Returns:
        None. Failures while querying, parsing or writing are printed and the
        sample is skipped (no file is produced for it).
    """
    for file_name in tqdm(caption_files):
        key = file_name[:-5]  # drop the ".json" suffix
        qa_set = prediction_set[key]
        question = qa_set['q']
        answer = str(qa_set['a'])
        pred = qa_set['pred']
        try:
            system_prompt = (
                "You are an intelligent chatbot designed for evaluating the detail orientation of generative outputs for video-based question-answer pairs. "
                "Your task is to compare the predicted answer with these correct answers and determine its level of detail, considering both completeness and specificity. Here's how you can accomplish the task:"
                "------"
                "##INSTRUCTIONS: "
                "- Check if the predicted answer covers all major points from the video. The response should not leave out any key aspects.\n"
                "- Evaluate whether the predicted answer includes specific details rather than just generic points. It should provide comprehensive information that is tied to specific elements of the video.\n"
                "- Consider synonyms or paraphrases as valid matches.\n"
                "- Provide a single evaluation score that reflects the level of detail orientation of the prediction, considering both completeness and specificity."
            )
            # NOTE(review): the example at the end of this prompt shows a float
            # and a doubled quote although an integer is requested; it is kept
            # verbatim so that scores stay comparable with earlier runs.
            user_prompt = (
                "Please evaluate the following video-based question-answer pair:\n\n"
                f"Question: {question}\n"
                f"Correct Answers: {answer}\n"
                f"Predicted Answer: {pred}\n\n"
                "Provide your evaluation only as a detail orientation score where the detail orientation score is an integer value between 0 and 5, with 5 indicating the highest level of detail orientation. "
                "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the detail orientation score in INTEGER, not STRING."
                "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                "For example, your response should look like this: {''score': 4.8}."
            )
            message = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ]
            completion = interaction(client, message)

            # The reply is expected to be a Python dict literal.
            reply_text = completion.choices[0].message.content
            response_dict = ast.literal_eval(reply_text)

            # Persist the judgement together with the evaluated sample.
            with open(f"{output_dir}/{key}.json", "w") as out_file:
                json.dump([response_dict, qa_set], out_file)

        except Exception as e:
            print(f"Error processing file '{key}': {e}")

        # Pause between consecutive requests.
        time.sleep(1)
89
+
90
+
91
def longest_repeating_substring(s):
    """Return the longest substring of ``s`` that occurs twice without overlap.

    Dynamic programming over pairs of end positions ``(i, j)`` with ``i < j``:
    ``dp[i][j]`` is the length of the longest common suffix of ``s[:i]`` and
    ``s[:j]`` whose two occurrences do not overlap.

    The previous condition joined the two tests with ``or``, so a run was
    extended even when the characters differed (e.g. ``"abac"`` produced
    ``"ab"``). A run may only grow when the characters match AND the two
    occurrences stay disjoint.

    Args:
        s: The string to inspect.

    Returns:
        The first longest non-overlapping repeated substring, or ``""`` when
        no character repeats (including for an empty ``s``).
    """
    n = len(s)
    dp = [[0] * (n + 1) for _ in range(n + 1)]
    best_len = 0
    best_end = 0  # exclusive end index of the first occurrence

    for i in range(1, n + 1):
        for j in range(i + 1, n + 1):
            # dp[i-1][j-1] < j - i keeps the two occurrences from overlapping.
            if s[i - 1] == s[j - 1] and dp[i - 1][j - 1] < (j - i):
                dp[i][j] = dp[i - 1][j - 1] + 1
                if dp[i][j] > best_len:
                    best_len = dp[i][j]
                    best_end = i

    return s[best_end - best_len:best_end]
113
+
114
+
115
def main(args):
    """Score all predictions for detailedness and write the combined report.

    Steps:
      1. Load the JSON-lines prediction file(s).
      2. Give every sample a unique key ``<video stem>_<occurrence index>``.
      3. Call ``annotate`` until every sample has a result file in
         ``args.output_dir``.
      4. Merge the result files, compute the average score and dump
         everything to ``args.output_json``.

    Args:
        args: Namespace providing ``pred_path``, ``num_chunks``,
            ``output_dir``, ``output_json`` and ``num_tasks``.
    """
    # --- 1. Load predictions (one JSON object per line) --------------------
    if args.num_chunks > 1:
        pred_files = [
            os.path.join(args.pred_path, f"{args.num_chunks}_{_idx}.json")
            for _idx in range(args.num_chunks)
        ]
    else:
        pred_files = [args.pred_path]

    pred_contents = []
    for pred_file in pred_files:
        with open(pred_file) as f:  # closed deterministically
            pred_contents.extend(json.loads(line) for line in f)

    # --- 2. Build unique sample keys ---------------------------------------
    video_id_counts = {}  # occurrences seen so far per original video name
    new_pred_contents = []
    for sample in pred_contents:
        video_id = sample["video_name"]
        video_id_counts[video_id] = video_id_counts.get(video_id, -1) + 1

        # The sample dict is renamed in place (no copy is made).
        stem = video_id.split('/')[-1].split('.')[0]
        sample["video_name"] = f"{stem}_{video_id_counts[video_id]}"
        new_pred_contents.append(sample)

    caption_files = [f"{x['video_name']}.json" for x in new_pred_contents]

    output_dir = args.output_dir
    os.makedirs(output_dir, exist_ok=True)

    prediction_set = {
        sample["video_name"]: {
            "q": sample["question"],
            "a": sample["answer"],
            "pred": sample["pred"],
        }
        for sample in new_pred_contents
    }

    # --- 3. Annotate until every sample has a result file ------------------
    # A non-positive task count would make the split below fail on every
    # iteration and spin forever, so clamp it.
    num_tasks = max(1, args.num_tasks)

    # NOTE(review): a sample whose annotation keeps failing is retried
    # indefinitely; this loop only ends once every result file exists.
    while True:
        try:
            completed_files = set(os.listdir(output_dir))
            print(f"completed_files: {len(completed_files)}")

            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            if not incomplete_files:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split the remaining work into parts; only the first part is
            # processed per iteration, the rest is picked up by later rounds.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [
                incomplete_files[i: i + part_len]
                for i in range(0, len(incomplete_files), part_len)
            ]
            print("Generate", len(all_parts), "subprocess.")
            annotate(prediction_set, all_parts[0], args.output_dir)

        except Exception as e:
            print(f"Error: {e}")

    # --- 4. Merge results and compute the average score --------------------
    combined_contents = {}
    json_path = args.output_json

    for file_name in os.listdir(output_dir):
        if not file_name.endswith(".json"):
            continue
        file_path = os.path.join(output_dir, file_name)
        with open(file_path, "r") as json_file:
            try:
                combined_contents[file_name[:-5]] = json.load(json_file)
            except Exception as e:
                print(f"Error: {e}")

    score_sum = 0
    count = 0
    for key, result in combined_contents.items():
        # Every merged sample counts towards the average, even when its
        # score cannot be parsed (it then contributes 0).
        count += 1
        try:
            # The score is the value of the first key of the response dict.
            for value in result[0].values():
                score_sum += int(value)
                break
        except Exception as e:
            # Report and continue; a debugger breakpoint here would hang
            # unattended runs.
            print(f"Error processing file '{key}': {e}")

    average_score = score_sum / count if count else 0.0
    combined_contents["average_score"] = average_score
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file, indent=4)
    print("Average score for detailedness:", average_score)
236
+
237
+
238
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    # (flag, add_argument keyword options), registered in this order.
    cli_options = (
        ("--pred-path", dict(required=True, help="The path to file containing prediction.")),
        ("--output-dir", dict(required=True, help="The path to save annotation json files.")),
        ("--output-json", dict(required=True, help="The path to save annotation final combined json file.")),
        ("--num-tasks", dict(required=True, type=int, help="Number of splits.")),
        ("--num_chunks", dict(default=1, type=int, help="Result splits")),
        ("--api-key", dict(required=True, type=str, help="Azure Openai API key.")),
        ("--api-endpoint", dict(required=True, type=str, help="Azure Openai API endpoint.")),
        ("--api-deployname", dict(required=True, type=str, help="Azure Openai API deployname.")),
    )
    for flag, options in cli_options:
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    # Export the Azure OpenAI settings; init() and interaction() read them
    # back from the environment.
    os.environ.update({
        "AZURE_OPENAI_KEY": args.api_key,
        "AZURE_OPENAI_ENDPOINT": args.api_endpoint,
        "AZURE_OPENAI_DEPLOYNAME": args.api_deployname,
    })

    # Module-level client; annotate() picks it up as a global.
    client = init()

    main(args)
videollama2/eval/eval_video_mcqa_mvbench.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import argparse
3
+ from tabulate import tabulate
4
+
5
+
6
# MVBench task table, keyed by task name. Each value is a 4-tuple; judging
# from the entries it is presumably (annotation file, media directory,
# media kind, has start & end bounds) - confirm against the inference script.
# This evaluation script itself only uses the keys.
tasks = {
    "Action Sequence": ("action_sequence.json", "star/Charades_v1_480/", "video", True),  # bounded clip
    "Action Prediction": ("action_prediction.json", "star/Charades_v1_480/", "video", True),  # bounded clip
    "Action Antonym": ("action_antonym.json", "ssv2_video/", "video", False),
    "Fine-grained Action": ("fine_grained_action.json", "pMoments_in_Time_Raw/videos/", "video", False),
    "Unexpected Action": ("unexpected_action.json", "FunQA_test/test/", "video", False),
    "Object Existence": ("object_existence.json", "clevrer/video_validation/", "video", False),
    "Object Interaction": ("object_interaction.json", "star/Charades_v1_480/", "video", True),  # bounded clip
    "Object Shuffle": ("object_shuffle.json", "perception/videos/", "video", False),
    "Moving Direction": ("moving_direction.json", "clevrer/video_validation/", "video", False),
    "Action Localization": ("action_localization.json", "sta/sta_video/", "video", True),  # bounded clip
    "Scene Transition": ("scene_transition.json", "scene_qa/video/", "video", False),
    "Action Count": ("action_count.json", "perception/videos/", "video", False),
    "Moving Count": ("moving_count.json", "clevrer/video_validation/", "video", False),
    "Moving Attribute": ("moving_attribute.json", "clevrer/video_validation/", "video", False),
    "State Change": ("state_change.json", "perception/videos/", "video", False),
    "Fine-grained Pose": ("fine_grained_pose.json", "nturgbd/", "video", False),
    "Character Order": ("character_order.json", "perception/videos/", "video", False),
    "Egocentric Navigation": ("egocentric_navigation.json", "vlnqa/", "video", False),
    "Episodic Reasoning": ("episodic_reasoning.json", "tvqa/frames_fps3_hq/", "frame", True),  # bounded clip, read as frames
    "Counterfactual Inference": ("counterfactual_inference.json", "clevrer/video_validation/", "video", False),
}
28
+
29
+
30
def main():
    """Print overall and per-task MVBench accuracy for a prediction file.

    The file named by ``--pred_path`` holds one record per line with the keys
    ``'pred'``, ``'gt'`` and ``'task_type'``. A record counts as correct when
    ``pred == gt``. The per-task accuracies are printed as a table with four
    tasks per row.

    Raises:
        KeyError: If a record names a task type missing from ``tasks``.
    """
    import ast  # local import: only needed for the literal fallback below

    def _parse_record(text):
        # Records are parsed as data, never executed. JSON is tried first;
        # Python-literal lines (e.g. single-quoted dicts) are still accepted.
        try:
            return json.loads(text)
        except ValueError:
            return ast.literal_eval(text)

    def _accuracy(values):
        # Percentage of correct answers; 0.0 for an empty group so that
        # partial result files do not crash the report.
        return sum(values) * 100 / len(values) if values else 0.0

    args = parse_args()
    with open(args.pred_path, 'r') as f:
        res = [_parse_record(line.strip()) for line in f if line.strip()]

    task_acc = {x: [] for x in tasks}
    acc = []
    for x in res:
        value = 1 if x['pred'] == x['gt'] else 0
        acc.append(value)
        task_acc[x['task_type']].append(value)

    acc = _accuracy(acc)
    task_acc = {x: _accuracy(task_acc[x]) for x in task_acc}
    print(f"{args.pred_path}:", acc)

    # Two table rows per group of four tasks: the names, then the accuracies.
    task_names = list(tasks.keys())
    table_data = []
    for i in range(len(task_names) // 4):
        row_task_names = task_names[i * 4: (i + 1) * 4]
        table_data.append(row_task_names)
        table_data.append([task_acc[x] for x in row_task_names])
    print(tabulate(table_data, floatfmt=".1f"), '\n')
54
+
55
+
56
def parse_args():
    """Parse the command-line options of this script.

    Returns:
        The parsed namespace; ``pred_path`` defaults to an empty string.
    """
    cli = argparse.ArgumentParser(description="Evaluate video captioning.")
    cli.add_argument("--pred_path", default=r'', help="The path to file containing prediction.")
    return cli.parse_args()


if __name__ == '__main__':
    main()
videollama2/eval/eval_video_mcqa_videomme.py ADDED
@@ -0,0 +1,277 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import json
4
+ import argparse
5
+ from typing import List, Dict, Optional, Union
6
+
7
# Video domains; an item's "domain" field must be one of these.
CATEGORIES = [
    "Knowledge", "Film & Television", "Sports Competition",
    "Artistic Performance", "Life Record", "Multilingual",
]

# Video sub-categories; an item's "sub_category" field must be one of these.
SUB_CATEGORIES = [
    "Humanity & History", "Literature & Art", "Biology & Medicine",
    "Finance & Commerce", "Astronomy", "Geography", "Law", "Life Tip",
    "Technology", "Animation", "Movie & TV Show", "Documentary",
    "News Report", "Esports", "Basketball", "Football", "Athletics",
    "Other Sports", "Stage Play", "Magic Show", "Variety Show",
    "Acrobatics", "Handicraft", "Food", "Fashion", "Daily Life", "Travel",
    "Pet & Animal", "Exercise", "Multilingual",
]

# Question task types; a question's "task_type" field must be one of these.
TASK_CATEGORIES = [
    "Temporal Perception", "Spatial Perception", "Attribute Perception",
    "Action Recognition", "Object Recognition", "OCR Problems",
    "Counting Problem", "Temporal Reasoning", "Spatial Reasoning",
    "Action Reasoning", "Object Reasoning", "Information Synopsis",
]
63
+
64
+
65
def extract_characters_regex(s):
    """Extract the chosen option letter (A-D) from a model response.

    Known answer prefixes are removed first so that capital letters inside
    them (for example the "B" of "Best answer:") are not mistaken for the
    chosen option. The first remaining ``A``/``B``/``C``/``D`` is returned.

    Args:
        s: Raw response text.

    Returns:
        The option letter, or ``""`` when the response contains none.
    """
    s = s.strip()
    # Every prefix needs its own trailing comma: adjacent string literals are
    # silently concatenated, which previously fused two pairs of entries into
    # prefixes that never matched.
    answer_prefixes = [
        "The best answer is",
        "The correct answer is",
        "The answer is",
        "The answer",
        "The best option is",
        "The correct option is",
        "Best answer:",
        "Best option:",
    ]
    for answer_prefix in answer_prefixes:
        s = s.replace(answer_prefix, "")

    matches = re.search(r'[ABCD]', s)
    return matches[0] if matches else ""
86
+
87
+
88
def _percent(correct, answered):
    """Return ``correct / answered`` as a percentage (0 when nothing was answered)."""
    return 100 * correct / answered if answered > 0 else 0


def _print_section(title, rows):
    """Print a titled block with one ``name: accuracy%`` line per (name, correct, answered) row."""
    print("-------------------------------------")
    print(title)
    print("-------------------------------------")
    for name, correct, answered in rows:
        print(f"{name}: {_percent(correct, answered): .1f}%")


def _print_banner(text):
    """Print a top-level heading framed by ``=`` rules."""
    print("=====================================")
    print(text)
    print("=====================================")


def eval_your_results(
    your_results_path: str,
    video_types: Optional[Union[List[str], str]] = None,
    skip_missing: Optional[bool] = True,
    return_categories_accuracy: Optional[bool] = True,
    return_sub_categories_accuracy: Optional[bool] = False,
    return_task_types_accuracy: Optional[bool] = False,
    gt_answer_key: Optional[str] = "answer",
    your_answer_key: Optional[str] = "response"

):
    """
    Evaluate your results against the ground truth and print accuracy tables.

    Only questions for which an option letter can be extracted from the
    response are counted as answered; accuracy is correct / answered.

    Args:
    - your_results_path (str): Path to your results file (JSON list of video items).
    - video_types (Optional[List[str], str]): Video duration types to evaluate, as a list or a
      comma-separated string. When None, every duration present in the results file is evaluated,
      in order of first appearance.
    - skip_missing (Optional[bool]): If True, items flagged as missing are skipped. If False, an
      AssertionError is raised unless each video type has exactly 300 items.
    - return_categories_accuracy (Optional[bool]): If True, the accuracy for each video category is printed.
    - return_sub_categories_accuracy (Optional[bool]): If True, the accuracy for each video sub category is printed.
    - return_task_types_accuracy (Optional[bool]): If True, the accuracy for each task category is printed.
    - gt_answer_key (Optional[str]): Key to access the ground truth answer in the results file.
    - your_answer_key (Optional[str]): Key to access your answer in the results file.
    """

    # Load your results
    with open(your_results_path, 'r') as f:
        your_results = json.load(f)

    if video_types is None:
        # The default used to crash on iteration; fall back to every
        # duration found in the file (dict.fromkeys keeps first-seen order).
        video_types = list(dict.fromkeys(item["duration"] for item in your_results))
    elif isinstance(video_types, str):
        video_types = video_types.split(",")

    # Per video type: {name: {"correct": int, "answered": int}}
    q_type_dict = {}
    v_type_dict = {}
    v_sub_type_dict = {}

    def _fresh(names):
        return {name: {"correct": 0, "answered": 0} for name in names}

    for video_type in video_types:

        # Filter your results based on video types
        your_results_video_type = [item for item in your_results if item["duration"] == video_type]

        q_type_dict[video_type] = _fresh(TASK_CATEGORIES)
        v_type_dict[video_type] = _fresh(CATEGORIES)
        v_sub_type_dict[video_type] = _fresh(SUB_CATEGORIES)

        if not skip_missing:
            # Check if the number of files in your results and ground truth are the same
            assert len(your_results_video_type) == 300, f"Number of files in {video_type} is not 300. Check if there are missing files."

        for item in your_results_video_type:

            # Items without a "missing" flag are treated as present.
            if skip_missing and item.get("missing", False):
                continue

            domain_stats = v_type_dict[video_type][item["domain"]]
            sub_stats = v_sub_type_dict[video_type][item["sub_category"]]

            for question in item["questions"]:
                task_stats = q_type_dict[video_type][question["task_type"]]

                # Get the ground truth and the option extracted from the response
                gt_answer = question[gt_answer_key]
                extraction = extract_characters_regex(question[your_answer_key])

                if extraction != "":
                    is_correct = extraction == gt_answer  # bool adds as 0/1
                    for stats in (task_stats, domain_stats, sub_stats):
                        stats["answered"] += 1
                        stats["correct"] += is_correct

    def _rows(table, names, types):
        # One (name, correct, answered) row per name, summed over `types`.
        return [
            (
                name,
                sum(table[t][name]["correct"] for t in types),
                sum(table[t][name]["answered"] for t in types),
            )
            for name in names
        ]

    def _overall_row(types):
        return (
            "Overall",
            sum(q_type_dict[t][q]["correct"] for t in types for q in TASK_CATEGORIES),
            sum(q_type_dict[t][q]["answered"] for t in types for q in TASK_CATEGORIES),
        )

    # Print the results for each video type
    for video_type in video_types:
        _print_banner(f"Evaluation on video Type: {video_type}")
        if return_categories_accuracy:
            _print_section("Video Domains", _rows(v_type_dict, CATEGORIES, [video_type]))
        if return_sub_categories_accuracy:
            _print_section("Video Sub Categories", _rows(v_sub_type_dict, SUB_CATEGORIES, [video_type]))
        if return_task_types_accuracy:
            _print_section("Task Categories", _rows(q_type_dict, TASK_CATEGORIES, [video_type]))
        _print_section("Overall Performance", [_overall_row([video_type])])

        print("\n")

    # Print the results for the entire dataset
    _print_banner("Evaluation on the entire dataset")
    if return_categories_accuracy:
        _print_section("Video Categories", _rows(v_type_dict, CATEGORIES, video_types))
    if return_sub_categories_accuracy:
        _print_section("Video Sub Categories", _rows(v_sub_type_dict, SUB_CATEGORIES, video_types))
    if return_task_types_accuracy:
        _print_section("Task Categories", _rows(q_type_dict, TASK_CATEGORIES, video_types))
    _print_section("Overall Performance", [_overall_row(video_types)])
256
+
257
+
258
+
259
+ if __name__ == "__main__":
260
+ parser = argparse.ArgumentParser()
261
+ parser.add_argument("--results_file", type=str, required=True)
262
+ parser.add_argument("--video_duration_type", type=str, required=True)
263
+ parser.add_argument("--return_categories_accuracy", action="store_true")
264
+ parser.add_argument("--return_sub_categories_accuracy", action="store_true")
265
+ parser.add_argument("--return_task_types_accuracy", action="store_true")
266
+ parser.add_argument("--skip_missing", action="store_true")
267
+
268
+ args = parser.parse_args()
269
+
270
+ eval_your_results(
271
+ args.results_file,
272
+ video_types=args.video_duration_type,
273
+ skip_missing=args.skip_missing,
274
+ return_categories_accuracy=args.return_categories_accuracy,
275
+ return_sub_categories_accuracy=args.return_sub_categories_accuracy,
276
+ return_task_types_accuracy=args.return_task_types_accuracy,
277
+ )
videollama2/eval/eval_video_oqa_activitynet.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import ast
3
+ import json
4
+ import time
5
+ import argparse
6
+ import traceback
7
+ from tqdm import tqdm
8
+ from concurrent.futures import ThreadPoolExecutor, as_completed
9
+
10
+ from openai import AzureOpenAI
11
+
12
+
13
def init():
    """Build the Azure OpenAI client from environment configuration.

    The endpoint and key are read from ``AZURE_OPENAI_ENDPOINT`` and
    ``AZURE_OPENAI_KEY``; the API version is pinned to
    ``2024-02-15-preview``.

    Returns:
        The constructed ``AzureOpenAI`` client.
    """
    settings = {
        "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
        "api_key": os.getenv("AZURE_OPENAI_KEY"),
        "api_version": "2024-02-15-preview",
    }
    return AzureOpenAI(**settings)
21
+
22
+
23
def interaction(client, message_text):
    """Send one chat-completion request and return the raw completion.

    Args:
        client: Object exposing ``chat.completions.create``.
        message_text: Chat messages, forwarded unchanged as ``messages``.

    Returns:
        Whatever ``client.chat.completions.create`` returns.
    """
    # The deployment name is looked up in the environment on every call.
    request = dict(
        model=os.getenv("AZURE_OPENAI_DEPLOYNAME"),
        messages=message_text,
        temperature=0.7,
        max_tokens=800,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )
    return client.chat.completions.create(**request)
36
+
37
+
38
def prompt_gpt(question, answer, pred, key, qa_set, output_dir):
    """Ask the judge model to grade one prediction and save the verdict.

    Args:
        question: Question text inserted into the prompt.
        answer: Reference answer inserted into the prompt.
        pred: Predicted answer inserted into the prompt.
        key: Sample id; the verdict is written to ``<output_dir>/<key>.json``.
        qa_set: The sample's question/answer/prediction record, stored
            alongside the verdict.
        output_dir: Directory receiving the per-sample JSON file.

    Raises:
        Any exception raised by the API call, by ``ast.literal_eval`` when
        the reply is not a Python literal, or by the file write.
    """
    system_prompt = (
        "You are an intelligent chatbot designed for evaluating the correctness of generative outputs for question-answer pairs. "
        "Your task is to compare the predicted answer with the correct answer and determine if they match meaningfully. Here's how you can accomplish the task:"
        "------"
        "##INSTRUCTIONS: "
        "- Focus on the meaningful match between the predicted answer and the correct answer.\n"
        "- Consider synonyms or paraphrases as valid matches.\n"
        "- Evaluate the correctness of the prediction compared to the answer."
    )
    user_prompt = (
        "Please evaluate the following video-based question-answer pair:\n\n"
        f"Question: {question}\n"
        f"Correct Answer: {answer}\n"
        f"Predicted Answer: {pred}\n\n"
        "Provide your evaluation only as a yes/no and score where the score is an integer value between 0 and 5, with 5 indicating the highest meaningful match. "
        "Please generate the response in the form of a Python dictionary string with keys 'pred' and 'score', where value of 'pred' is a string of 'yes' or 'no' and value of 'score' is in INTEGER, not STRING."
        "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
        "For example, your response should look like this: {'pred': 'yes', 'score': 4.8}."
    )
    message = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    # ``client`` is the module-level Azure OpenAI client.
    completion = interaction(client, message)
    reply_text = completion.choices[0].message.content
    # The judge is instructed to answer with a Python dict literal.
    verdict = ast.literal_eval(reply_text)

    # Persist the verdict together with the graded sample.
    with open(f"{output_dir}/{key}.json", "w") as f:
        json.dump([verdict, qa_set], f)
72
+
73
+
74
def annotate(task_arg):
    """Grade one slice of predictions with the judge model.

    Args:
        task_arg: Tuple ``(prediction_set, caption_files, output_dir, args)``.
            ``prediction_set`` maps a sample id to its ``{"q", "a", "p"}``
            record, ``caption_files`` lists the ``"<id>.json"`` names to
            process, ``output_dir`` receives the verdict files, and ``args``
            is unpacked but not used here.

    Each sample is graded through ``prompt_gpt``. If that fails, the sample
    is retried once with the prediction truncated to 50 characters. A sample
    that fails both attempts is logged and skipped so the remaining files in
    the slice are still processed; the caller's retry loop will pick it up
    again because no output file was written.
    """
    prediction_set, caption_files, output_dir, args = task_arg

    for file in tqdm(caption_files):
        key = file[:-5]  # Strip the ".json" extension.
        qa_set = prediction_set[key]
        question = qa_set['q']
        answer = qa_set['a']
        pred = qa_set['p']
        try:
            prompt_gpt(question, answer, pred, key, qa_set, output_dir)
        except Exception:
            # Report the first failure before retrying, so it is visible
            # even when the retry fails as well.
            traceback.print_exc()
            try:
                prompt_gpt(question, answer, pred[:50], key, qa_set, output_dir)
            except Exception:
                # Do not let one bad sample abort the rest of this slice.
                traceback.print_exc()

    # NOTE(review): the original indentation was lost in this view; the
    # pause is assumed to run once per slice, after the loop - confirm.
    time.sleep(1)
94
+
95
+
96
def main(args):
    """Grade all predictions with the judge model and report the metrics.

    Args:
        args: Parsed command-line namespace; ``pred_path``, ``output_dir``,
            ``output_json`` and ``num_tasks`` are read here.

    Raises:
        ValueError: If ``args.num_tasks`` is smaller than 1.

    Side effects:
        Writes one ``<id>.json`` verdict file per sample into
        ``args.output_dir``, writes the merged verdicts to
        ``args.output_json`` and prints yes/no counts, accuracy and the
        average score.
    """
    if args.num_tasks < 1:
        # A non-positive split count can never make progress in the retry
        # loop below, so fail fast instead of spinning forever.
        raise ValueError("--num-tasks must be a positive integer")

    # One record per line. JSON is tried first; lines written as Python
    # literals fall back to ast.literal_eval. This replaces eval(), which
    # would execute arbitrary expressions found in the prediction file.
    new_pred_contents = []
    with open(args.pred_path) as pred_file:
        for line in pred_file:
            line = line.strip()
            if not line:
                continue
            try:
                new_pred_contents.append(json.loads(line))
            except json.JSONDecodeError:
                new_pred_contents.append(ast.literal_eval(line))

    # Generating list of id's and corresponding files
    caption_files = [f"{sample['id']}.json" for sample in new_pred_contents]

    output_dir = args.output_dir
    # Generate output directory if not exists.
    os.makedirs(output_dir, exist_ok=True)

    # Preparing dictionary of question-answer sets. Keys are stringified
    # because annotate() looks samples up by the file name minus ".json".
    prediction_set = {
        str(sample['id']): {
            "q": sample['question'],
            "a": sample['answer'],
            "p": sample['pred'],
        }
        for sample in new_pred_contents
    }

    num_tasks = args.num_tasks

    # Keep retrying until every sample has a verdict file on disk.
    while True:
        try:
            # Files that have already been processed (set: O(1) lookups).
            completed_files = set(os.listdir(output_dir))
            print(f"completed_files: {len(completed_files)}")

            # Files that have not been processed yet.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            # Break the loop when there are no incomplete files
            if len(incomplete_files) == 0:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split tasks into parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir, args) for part in all_parts]

            # Use a pool of workers to process the files in parallel.
            with ThreadPoolExecutor(max_workers=args.num_tasks) as executor:
                list(tqdm(executor.map(annotate, task_args), total=len(task_args)))

        except Exception as e:
            print(f"Error: {e}")

    # Load the per-sample verdict files in parallel.
    def combine_json(file_name):
        file_path = os.path.join(output_dir, file_name)
        with open(file_path, "r") as json_file:
            content = json.load(json_file)
        return (file_name[:-5], content)

    # Only verdict files are merged; the combined output file is excluded in
    # case it lives inside the output directory.
    combined_path = os.path.abspath(args.output_json)
    files = [
        name for name in os.listdir(output_dir)
        if name.endswith(".json")
        and os.path.abspath(os.path.join(output_dir, name)) != combined_path
    ]
    with ThreadPoolExecutor(max_workers=64) as executor:
        combined_contents = list(tqdm(executor.map(combine_json, files), total=len(files)))

    # Write combined content to a json file, as --output-json promises.
    with open(args.output_json, "w") as json_file:
        json.dump(dict(combined_contents), json_file)

    # Calculate average score and accuracy
    score_sum = 0
    count = 0
    yes_count = 0
    no_count = 0
    for key, result in tqdm(combined_contents):
        try:
            # Computing score. A malformed verdict still counts toward the
            # denominator, exactly as before.
            count += 1
            score_sum += int(result[0]['score'])

            # Computing accuracy
            pred = result[0]['pred']
            if "yes" in pred.lower():
                yes_count += 1
            elif "no" in pred.lower():
                no_count += 1
        except Exception:
            print(result)

    # Guard the divisions so an empty or fully malformed run still reports.
    average_score = score_sum / count if count else 0.0
    judged = yes_count + no_count
    accuracy = yes_count / judged if judged else 0.0
    print("Yes count:", yes_count)
    print("No count:", no_count)
    print("Accuracy:", accuracy)
    print("Average score:", average_score)
190
+
191
+
192
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    # Every option is mandatory; only --num-tasks is parsed as an integer.
    for flag, extra in (
        ("--pred-path", {"help": "The path to file containing prediction."}),
        ("--output-dir", {"help": "The path to save annotation json files."}),
        ("--output-json", {"help": "The path to save annotation final combined json file."}),
        ("--num-tasks", {"type": int, "help": "Number of splits."}),
        ("--api-key", {"type": str, "help": "Azure Openai API key."}),
        ("--api-endpoint", {"type": str, "help": "Azure Openai API endpoint."}),
        ("--api-deployname", {"type": str, "help": "Azure Openai API deployname."}),
    ):
        parser.add_argument(flag, required=True, **extra)
    args = parser.parse_args()

    # Publish the Azure OpenAI settings through the environment, which is
    # where init() and interaction() read them from.
    os.environ.update({
        "AZURE_OPENAI_KEY": args.api_key,
        "AZURE_OPENAI_ENDPOINT": args.api_endpoint,
        "AZURE_OPENAI_DEPLOYNAME": args.api_deployname,
    })

    # Module-level client shared by the grading functions.
    client = init()

    main(args)
videollama2/eval/eval_video_oqa_vcgpt_1_correctness.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import json
4
+ import ast
5
+ import traceback
6
+ from tqdm import tqdm
7
+ from multiprocessing.pool import Pool
8
+
9
+ from openai import AzureOpenAI
10
+
11
+
12
def init():
    """Build the Azure OpenAI client from environment configuration.

    The endpoint and key are read from ``AZURE_OPENAI_ENDPOINT`` and
    ``AZURE_OPENAI_KEY``; the API version is pinned to
    ``2024-02-15-preview``.

    Returns:
        The constructed ``AzureOpenAI`` client.
    """
    settings = {
        "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
        "api_key": os.getenv("AZURE_OPENAI_KEY"),
        "api_version": "2024-02-15-preview",
    }
    return AzureOpenAI(**settings)
20
+
21
+
22
def interaction(client, message_text):
    """Send one chat-completion request and return the raw completion.

    Args:
        client: Object exposing ``chat.completions.create``.
        message_text: Chat messages, forwarded unchanged as ``messages``.

    Returns:
        Whatever ``client.chat.completions.create`` returns.
    """
    # The deployment name is looked up in the environment on every call.
    request = dict(
        model=os.getenv("AZURE_OPENAI_DEPLOYNAME"),
        messages=message_text,
        temperature=0.7,
        max_tokens=800,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )
    return client.chat.completions.create(**request)
35
+
36
+
37
def annotate(prediction_set, caption_files, output_dir, args):
    """Score the factual correctness of one slice of predictions.

    Args:
        prediction_set: Maps a sample id to its ``{"q", "a", "p"}`` record.
        caption_files: ``"<id>.json"`` names of the samples to process.
        output_dir: Directory receiving one ``<id>.json`` verdict per sample.
        args: Accepted for signature compatibility; not used here.

    A failure on one sample is printed and the sample is skipped; no output
    file is written for it.
    """
    # The shared client is created under the ``__main__`` guard of the parent
    # process. Worker processes started with the "spawn" or "forkserver"
    # start methods re-import this module without running that guard, so the
    # global is missing there; build a client from the inherited environment
    # variables instead of failing every sample with a swallowed NameError.
    api_client = globals().get("client")
    if api_client is None:
        api_client = init()

    for file in tqdm(caption_files):
        key = file[:-5]  # Strip the ".json" extension.
        qa_set = prediction_set[key]
        question = qa_set['q']
        answer = qa_set['a']
        pred = qa_set['p']
        try:
            message = [
                {
                    "role": "system",
                    "content":
                        "You are an intelligent chatbot designed for evaluating the factual accuracy of generative outputs for video-based question-answer pairs. "
                        "Your task is to compare the predicted answer with the correct answer and determine if they are factually consistent. Here's how you can accomplish the task:"
                        "------"
                        "##INSTRUCTIONS: "
                        "- Focus on the factual consistency between the predicted answer and the correct answer. The predicted answer should not contain any misinterpretations or misinformation.\n"
                        "- The predicted answer must be factually accurate and align with the video content.\n"
                        "- Consider synonyms or paraphrases as valid matches.\n"
                        "- Evaluate the factual accuracy of the prediction compared to the answer."
                },
                {
                    "role": "user",
                    "content":
                        "Please evaluate the following video-based question-answer pair:\n\n"
                        f"Question: {question}\n"
                        f"Correct Answer: {answer}\n"
                        f"Predicted Answer: {pred}\n\n"
                        "Provide your evaluation only as a factual accuracy score where the factual accuracy score is an integer value between 0 and 5, with 5 indicating the highest level of factual consistency. "
                        "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the factual accuracy score in INTEGER, not STRING."
                        "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                        # NOTE(review): the example below has a doubled quote
                        # and a non-integer score; left untouched so the
                        # grading prompt stays unchanged.
                        "For example, your response should look like this: {''score': 4.8}."
                }
            ]
            completion = interaction(api_client, message)
            # Convert response to a Python dictionary.
            response_message = completion.choices[0].message.content
            response_dict = ast.literal_eval(response_message)
            result_qa_pair = [response_dict, qa_set]

            # Save the question-answer pairs to a json file.
            with open(f"{output_dir}/{key}.json", "w") as f:
                json.dump(result_qa_pair, f)

        except Exception as e:
            print(f"Error processing file '{key}': {e}")
88
+
89
+
90
def main(args):
    """Score all predictions for correctness and print the average.

    Args:
        args: Parsed command-line namespace; ``pred_path``, ``output_dir``,
            ``output_json`` and ``num_tasks`` are read here.

    Raises:
        ValueError: If ``args.num_tasks`` is smaller than 1.

    Side effects:
        Writes one verdict file per sample into ``args.output_dir`` and the
        merged verdicts into ``args.output_json``.
    """
    if args.num_tasks < 1:
        # A non-positive split count can never make progress in the retry
        # loop below, so fail fast instead of spinning forever.
        raise ValueError("--num-tasks must be a positive integer")

    # One record per line. JSON is tried first; lines written as Python
    # literals fall back to ast.literal_eval. This replaces eval(), which
    # would execute arbitrary expressions found in the prediction file.
    pred_contents = []
    with open(args.pred_path, 'r') as pred_file:
        for line in pred_file:
            line = line.strip()
            if not line:
                continue
            try:
                pred_contents.append(json.loads(line))
            except json.JSONDecodeError:
                pred_contents.append(ast.literal_eval(line))

    # Dictionary to store the count of occurrences for each video_id
    video_id_counts = {}
    new_pred_contents = []

    # A video may have several questions; suffix the name with a running
    # index so every sample gets a unique id.
    for sample in pred_contents:
        video_id = sample['video_name']
        if video_id in video_id_counts:
            video_id_counts[video_id] += 1
        else:
            video_id_counts[video_id] = 0

        # Copy so the parsed record is not renamed in place.
        new_sample = dict(sample)
        new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
        new_pred_contents.append(new_sample)

    # Generating list of id's and corresponding files
    caption_files = [f"{sample['video_name']}.json" for sample in new_pred_contents]

    output_dir = args.output_dir
    # Generate output directory if not exists.
    os.makedirs(output_dir, exist_ok=True)

    # Preparing dictionary of question-answer sets
    prediction_set = {
        sample['video_name']: {"q": sample['Q'], "a": sample['A'], "p": sample['P']}
        for sample in new_pred_contents
    }

    num_tasks = args.num_tasks

    # Keep retrying until every sample has a verdict file on disk.
    while True:
        try:
            # Files that have already been processed (set: O(1) lookups).
            completed_files = set(os.listdir(output_dir))
            print(f"completed_files: {len(completed_files)}")

            # Files that have not been processed yet.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            # Break the loop when there are no incomplete files
            if len(incomplete_files) == 0:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split tasks into parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir, args) for part in all_parts]

            # Use a pool of workers to process the files in parallel.
            with Pool() as pool:
                pool.starmap(annotate, task_args)

        except Exception as e:
            traceback.print_exc()

    # Combine all the processed files into one
    combined_contents = {}
    json_path = args.output_json

    # Iterate through json files
    for file_name in tqdm(os.listdir(output_dir)):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                content = json.load(json_file)
                combined_contents[file_name[:-5]] = content

    # Write combined content to a json file
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Calculate average score
    score_sum = 0
    count = 0
    for key, result in combined_contents.items():
        count += 1
        score_sum += int(result[0]['score'])
    # Guard the division so an empty run reports 0 instead of crashing.
    average_score = score_sum / count if count else 0.0

    print("Average score for correctness:", average_score)
190
+
191
+
192
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    # Every option is mandatory; only --num-tasks is parsed as an integer.
    for flag, extra in (
        ("--pred-path", {"help": "The path to file containing prediction."}),
        ("--output-dir", {"help": "The path to save annotation json files."}),
        ("--output-json", {"help": "The path to save annotation final combined json file."}),
        ("--num-tasks", {"type": int, "help": "Number of splits."}),
        ("--api-key", {"type": str, "help": "Azure Openai API key."}),
        ("--api-endpoint", {"type": str, "help": "Azure Openai API endpoint."}),
        ("--api-deployname", {"type": str, "help": "Azure Openai API deployname."}),
    ):
        parser.add_argument(flag, required=True, **extra)
    args = parser.parse_args()

    # Publish the Azure OpenAI settings through the environment, which is
    # where init() and interaction() read them from.
    os.environ.update({
        "AZURE_OPENAI_KEY": args.api_key,
        "AZURE_OPENAI_ENDPOINT": args.api_endpoint,
        "AZURE_OPENAI_DEPLOYNAME": args.api_deployname,
    })

    # Module-level client shared by the grading functions.
    client = init()

    main(args)
videollama2/eval/eval_video_oqa_vcgpt_2_detailed_orientation.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import json
4
+ import ast
5
+ from tqdm import tqdm
6
+ from multiprocessing.pool import Pool
7
+
8
+ from openai import AzureOpenAI
9
+
10
+
11
def init():
    """Build the Azure OpenAI client from environment configuration.

    The endpoint and key are read from ``AZURE_OPENAI_ENDPOINT`` and
    ``AZURE_OPENAI_KEY``; the API version is pinned to
    ``2024-02-15-preview``.

    Returns:
        The constructed ``AzureOpenAI`` client.
    """
    settings = {
        "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
        "api_key": os.getenv("AZURE_OPENAI_KEY"),
        "api_version": "2024-02-15-preview",
    }
    return AzureOpenAI(**settings)
19
+
20
+
21
def interaction(client, message_text):
    """Send one chat-completion request and return the raw completion.

    Args:
        client: Object exposing ``chat.completions.create``.
        message_text: Chat messages, forwarded unchanged as ``messages``.

    Returns:
        Whatever ``client.chat.completions.create`` returns.
    """
    # The deployment name is looked up in the environment on every call.
    request = dict(
        model=os.getenv("AZURE_OPENAI_DEPLOYNAME"),
        messages=message_text,
        temperature=0.7,
        max_tokens=800,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )
    return client.chat.completions.create(**request)
34
+
35
+
36
def annotate(prediction_set, caption_files, output_dir, args):
    """Score the detail orientation of one slice of predictions.

    Args:
        prediction_set: Maps a sample id to its ``{"q", "a", "p"}`` record.
        caption_files: ``"<id>.json"`` names of the samples to process.
        output_dir: Directory receiving one ``<id>.json`` verdict per sample.
        args: Accepted for signature compatibility; not used here.

    A failure on one sample is printed and the sample is skipped; no output
    file is written for it.
    """
    # The shared client is created under the ``__main__`` guard of the parent
    # process. Worker processes started with the "spawn" or "forkserver"
    # start methods re-import this module without running that guard, so the
    # global is missing there; build a client from the inherited environment
    # variables instead of failing every sample with a swallowed NameError.
    api_client = globals().get("client")
    if api_client is None:
        api_client = init()

    for file in tqdm(caption_files):
        key = file[:-5]  # Strip the ".json" extension.
        qa_set = prediction_set[key]
        question = qa_set['q']
        answer = qa_set['a']
        pred = qa_set['p']
        try:
            # Compute the detailed-orientation score
            message = [
                {
                    "role": "system",
                    "content":
                        "You are an intelligent chatbot designed for evaluating the detail orientation of generative outputs for video-based question-answer pairs. "
                        "Your task is to compare the predicted answer with the correct answer and determine its level of detail, considering both completeness and specificity. Here's how you can accomplish the task:"
                        "------"
                        "##INSTRUCTIONS: "
                        "- Check if the predicted answer covers all major points from the video. The response should not leave out any key aspects.\n"
                        "- Evaluate whether the predicted answer includes specific details rather than just generic points. It should provide comprehensive information that is tied to specific elements of the video.\n"
                        "- Consider synonyms or paraphrases as valid matches.\n"
                        "- Provide a single evaluation score that reflects the level of detail orientation of the prediction, considering both completeness and specificity."
                },
                {
                    "role": "user",
                    "content":
                        "Please evaluate the following video-based question-answer pair:\n\n"
                        f"Question: {question}\n"
                        f"Correct Answer: {answer}\n"
                        f"Predicted Answer: {pred}\n\n"
                        "Provide your evaluation only as a detail orientation score where the detail orientation score is an integer value between 0 and 5, with 5 indicating the highest level of detail orientation. "
                        "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the detail orientation score in INTEGER, not STRING."
                        "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                        # NOTE(review): the example below has a doubled quote
                        # and a non-integer score; left untouched so the
                        # grading prompt stays unchanged.
                        "For example, your response should look like this: {''score': 4.8}."
                }
            ]

            completion = interaction(api_client, message)
            # Convert response to a Python dictionary.
            response_message = completion.choices[0].message.content
            response_dict = ast.literal_eval(response_message)
            result_qa_pair = [response_dict, qa_set]

            # Save the question-answer pairs to a json file.
            with open(f"{output_dir}/{key}.json", "w") as f:
                json.dump(result_qa_pair, f)

        except Exception as e:
            print(f"Error processing file '{key}': {e}")
88
+
89
+
90
def main(args):
    """Score all predictions for detail orientation and print the average.

    Args:
        args: Parsed command-line namespace; ``pred_path``, ``output_dir``,
            ``output_json`` and ``num_tasks`` are read here.

    Raises:
        ValueError: If ``args.num_tasks`` is smaller than 1.

    Side effects:
        Writes one verdict file per sample into ``args.output_dir`` and the
        merged verdicts into ``args.output_json``.
    """
    if args.num_tasks < 1:
        # A non-positive split count can never make progress in the retry
        # loop below, so fail fast instead of spinning forever.
        raise ValueError("--num-tasks must be a positive integer")

    # One record per line. JSON is tried first; lines written as Python
    # literals fall back to ast.literal_eval. This replaces eval(), which
    # would execute arbitrary expressions found in the prediction file.
    pred_contents = []
    with open(args.pred_path, 'r') as pred_file:
        for line in pred_file:
            line = line.strip()
            if not line:
                continue
            try:
                pred_contents.append(json.loads(line))
            except json.JSONDecodeError:
                pred_contents.append(ast.literal_eval(line))

    # Dictionary to store the count of occurrences for each video_id
    video_id_counts = {}
    new_pred_contents = []

    # A video may have several questions; suffix the name with a running
    # index so every sample gets a unique id.
    for sample in pred_contents:
        video_id = sample['video_name']
        if video_id in video_id_counts:
            video_id_counts[video_id] += 1
        else:
            video_id_counts[video_id] = 0

        # Copy so the parsed record is not renamed in place.
        new_sample = dict(sample)
        new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
        new_pred_contents.append(new_sample)

    # Generating list of id's and corresponding files
    caption_files = [f"{sample['video_name']}.json" for sample in new_pred_contents]

    output_dir = args.output_dir
    # Generate output directory if not exists.
    os.makedirs(output_dir, exist_ok=True)

    # Preparing dictionary of question-answer sets
    prediction_set = {
        sample['video_name']: {"q": sample['Q'], "a": sample['A'], "p": sample['P']}
        for sample in new_pred_contents
    }

    num_tasks = args.num_tasks

    # Keep retrying until every sample has a verdict file on disk.
    while True:
        try:
            # Files that have already been processed (set: O(1) lookups).
            completed_files = set(os.listdir(output_dir))
            print(f"completed_files: {len(completed_files)}")

            # Files that have not been processed yet.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            # Break the loop when there are no incomplete files
            if len(incomplete_files) == 0:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split tasks into parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir, args) for part in all_parts]

            # Use a pool of workers to process the files in parallel.
            with Pool() as pool:
                pool.starmap(annotate, task_args)

        except Exception as e:
            print(f"Error: {e}")

    # Combine all the processed files into one
    combined_contents = {}
    json_path = args.output_json

    # Iterate through json files
    for file_name in tqdm(os.listdir(output_dir)):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                content = json.load(json_file)
                combined_contents[file_name[:-5]] = content

    # Write combined content to a json file
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Calculate average score
    score_sum = 0
    count = 0
    for key, result in combined_contents.items():
        count += 1
        score_sum += int(result[0]['score'])
    # Guard the division so an empty run reports 0 instead of crashing.
    average_score = score_sum / count if count else 0.0

    print("Average score for detailed orientation:", average_score)
190
+
191
+
192
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    # Every option is mandatory; only --num-tasks is parsed as an integer.
    for flag, extra in (
        ("--pred-path", {"help": "The path to file containing prediction."}),
        ("--output-dir", {"help": "The path to save annotation json files."}),
        ("--output-json", {"help": "The path to save annotation final combined json file."}),
        ("--num-tasks", {"type": int, "help": "Number of splits."}),
        ("--api-key", {"type": str, "help": "Azure Openai API key."}),
        ("--api-endpoint", {"type": str, "help": "Azure Openai API endpoint."}),
        ("--api-deployname", {"type": str, "help": "Azure Openai API deployname."}),
    ):
        parser.add_argument(flag, required=True, **extra)
    args = parser.parse_args()

    # Publish the Azure OpenAI settings through the environment, which is
    # where init() and interaction() read them from.
    os.environ.update({
        "AZURE_OPENAI_KEY": args.api_key,
        "AZURE_OPENAI_ENDPOINT": args.api_endpoint,
        "AZURE_OPENAI_DEPLOYNAME": args.api_deployname,
    })

    # Module-level client shared by the grading functions.
    client = init()

    main(args)
videollama2/eval/eval_video_oqa_vcgpt_3_context.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import json
4
+ import ast
5
+ import traceback
6
+ from tqdm import tqdm
7
+ from multiprocessing.pool import Pool
8
+
9
+ from openai import AzureOpenAI
10
+
11
+
12
def init():
    """Build the Azure OpenAI client from environment configuration.

    The endpoint and key are read from ``AZURE_OPENAI_ENDPOINT`` and
    ``AZURE_OPENAI_KEY``; the API version is pinned to
    ``2024-02-15-preview``.

    Returns:
        The constructed ``AzureOpenAI`` client.
    """
    settings = {
        "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
        "api_key": os.getenv("AZURE_OPENAI_KEY"),
        "api_version": "2024-02-15-preview",
    }
    return AzureOpenAI(**settings)
20
+
21
+
22
def interaction(client, message_text):
    """Send one chat-completion request and return the raw completion.

    Args:
        client: Object exposing ``chat.completions.create``.
        message_text: Chat messages, forwarded unchanged as ``messages``.

    Returns:
        Whatever ``client.chat.completions.create`` returns.
    """
    # The deployment name is looked up in the environment on every call.
    request = dict(
        model=os.getenv("AZURE_OPENAI_DEPLOYNAME"),
        messages=message_text,
        temperature=0.7,
        max_tokens=800,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
    )
    return client.chat.completions.create(**request)
35
+
36
+
37
def annotate(prediction_set, caption_files, output_dir, args):
    """Score the contextual understanding of one slice of predictions.

    Args:
        prediction_set: Maps a sample id to its ``{"q", "a", "p"}`` record.
        caption_files: ``"<id>.json"`` names of the samples to process.
        output_dir: Directory receiving one ``<id>.json`` verdict per sample.
        args: Accepted for signature compatibility; not used here.

    A failure on one sample is printed and the sample is skipped; no output
    file is written for it.
    """
    # The shared client is created under the ``__main__`` guard of the parent
    # process. Worker processes started with the "spawn" or "forkserver"
    # start methods re-import this module without running that guard, so the
    # global is missing there; build a client from the inherited environment
    # variables instead of failing every sample with a swallowed NameError.
    api_client = globals().get("client")
    if api_client is None:
        api_client = init()

    for file in tqdm(caption_files):
        key = file[:-5]  # Strip the ".json" extension.
        qa_set = prediction_set[key]
        question = qa_set['q']
        answer = qa_set['a']
        pred = qa_set['p']
        try:
            # Compute the contextual understanding score
            message = [
                {
                    "role": "system",
                    "content":
                        "You are an intelligent chatbot designed for evaluating the contextual understanding of generative outputs for video-based question-answer pairs. "
                        "Your task is to compare the predicted answer with the correct answer and determine if the generated response aligns with the overall context of the video content. Here's how you can accomplish the task:"
                        "------"
                        "##INSTRUCTIONS: "
                        "- Evaluate whether the predicted answer aligns with the overall context of the video content. It should not provide information that is out of context or misaligned.\n"
                        "- The predicted answer must capture the main themes and sentiments of the video.\n"
                        "- Consider synonyms or paraphrases as valid matches.\n"
                        "- Provide your evaluation of the contextual understanding of the prediction compared to the answer."
                },
                {
                    "role": "user",
                    "content":
                        "Please evaluate the following video-based question-answer pair:\n\n"
                        f"Question: {question}\n"
                        f"Correct Answer: {answer}\n"
                        f"Predicted Answer: {pred}\n\n"
                        "Provide your evaluation only as a contextual understanding score where the contextual understanding score is an integer value between 0 and 5, with 5 indicating the highest level of contextual understanding. "
                        "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is contextual understanding score in INTEGER, not STRING."
                        "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                        # NOTE(review): the example below has a doubled quote
                        # and a non-integer score; left untouched so the
                        # grading prompt stays unchanged.
                        "For example, your response should look like this: {''score': 4.8}."
                }
            ]

            completion = interaction(api_client, message)
            # Convert response to a Python dictionary.
            response_message = completion.choices[0].message.content
            response_dict = ast.literal_eval(response_message)
            result_qa_pair = [response_dict, qa_set]

            # Save the question-answer pairs to a json file.
            with open(f"{output_dir}/{key}.json", "w") as f:
                json.dump(result_qa_pair, f)

        except Exception as e:
            print(f"Error processing file '{key}': {e}")
90
+
91
+
92
def main(args):
    """Score all predictions for contextual understanding and print the average.

    Args:
        args: Parsed command-line namespace; ``pred_path``, ``output_dir``,
            ``output_json`` and ``num_tasks`` are read here.

    Raises:
        ValueError: If ``args.num_tasks`` is smaller than 1.

    Side effects:
        Writes one verdict file per sample into ``args.output_dir`` and the
        merged verdicts into ``args.output_json``.
    """
    if args.num_tasks < 1:
        # A non-positive split count can never make progress in the retry
        # loop below, so fail fast instead of spinning forever.
        raise ValueError("--num-tasks must be a positive integer")

    # One record per line. JSON is tried first; lines written as Python
    # literals fall back to ast.literal_eval. This replaces eval(), which
    # would execute arbitrary expressions found in the prediction file.
    pred_contents = []
    with open(args.pred_path, 'r') as pred_file:
        for line in pred_file:
            line = line.strip()
            if not line:
                continue
            try:
                pred_contents.append(json.loads(line))
            except json.JSONDecodeError:
                pred_contents.append(ast.literal_eval(line))

    # Dictionary to store the count of occurrences for each video_id
    video_id_counts = {}
    new_pred_contents = []

    # A video may have several questions; suffix the name with a running
    # index so every sample gets a unique id.
    for sample in pred_contents:
        video_id = sample['video_name']
        if video_id in video_id_counts:
            video_id_counts[video_id] += 1
        else:
            video_id_counts[video_id] = 0

        # Copy so the parsed record is not renamed in place.
        new_sample = dict(sample)
        new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
        new_pred_contents.append(new_sample)

    # Generating list of id's and corresponding files
    caption_files = [f"{sample['video_name']}.json" for sample in new_pred_contents]

    output_dir = args.output_dir
    # Generate output directory if not exists.
    os.makedirs(output_dir, exist_ok=True)

    # Preparing dictionary of question-answer sets
    prediction_set = {
        sample['video_name']: {"q": sample['Q'], "a": sample['A'], "p": sample['P']}
        for sample in new_pred_contents
    }

    num_tasks = args.num_tasks

    # Keep retrying until every sample has a verdict file on disk.
    while True:
        try:
            # Files that have already been processed (set: O(1) lookups).
            completed_files = set(os.listdir(output_dir))
            print(f"completed_files: {len(completed_files)}")

            # Files that have not been processed yet.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            # Break the loop when there are no incomplete files
            if len(incomplete_files) == 0:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split tasks into parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir, args) for part in all_parts]

            # Use a pool of workers to process the files in parallel.
            with Pool() as pool:
                pool.starmap(annotate, task_args)

        except Exception as e:
            print(f"Error: {e}")

    # Combine all the processed files into one
    combined_contents = {}
    json_path = args.output_json

    # Iterate through json files
    for file_name in tqdm(os.listdir(output_dir)):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                content = json.load(json_file)
                combined_contents[file_name[:-5]] = content

    # Write combined content to a json file
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Calculate average score
    score_sum = 0
    count = 0
    for key, result in combined_contents.items():
        count += 1
        score_sum += int(result[0]['score'])
    # Guard the division so an empty run reports 0 instead of crashing.
    average_score = score_sum / count if count else 0.0

    print("Average score for contextual understanding:", average_score)
192
+
193
+
194
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    # Every option is mandatory; only --num-tasks is parsed as an integer.
    for flag, extra in (
        ("--pred-path", {"help": "The path to file containing prediction."}),
        ("--output-dir", {"help": "The path to save annotation json files."}),
        ("--output-json", {"help": "The path to save annotation final combined json file."}),
        ("--num-tasks", {"type": int, "help": "Number of splits."}),
        ("--api-key", {"type": str, "help": "Azure Openai API key."}),
        ("--api-endpoint", {"type": str, "help": "Azure Openai API endpoint."}),
        ("--api-deployname", {"type": str, "help": "Azure Openai API deployname."}),
    ):
        parser.add_argument(flag, required=True, **extra)
    args = parser.parse_args()

    # Publish the Azure OpenAI settings through the environment, which is
    # where init() and interaction() read them from.
    os.environ.update({
        "AZURE_OPENAI_KEY": args.api_key,
        "AZURE_OPENAI_ENDPOINT": args.api_endpoint,
        "AZURE_OPENAI_DEPLOYNAME": args.api_deployname,
    })

    # Module-level client shared by the grading functions.
    client = init()

    main(args)
videollama2/eval/eval_video_oqa_vcgpt_4_temporal.py ADDED
@@ -0,0 +1,206 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import json
4
+ import ast
5
+ import traceback
6
+ from tqdm import tqdm
7
+ from multiprocessing.pool import Pool
8
+
9
+ from openai import AzureOpenAI
10
+
11
+
12
def init():
    """Create an Azure OpenAI client.

    The endpoint and key are read from the ``AZURE_OPENAI_ENDPOINT`` and
    ``AZURE_OPENAI_KEY`` environment variables; the API version is pinned to
    ``2024-02-15-preview``.
    """
    endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    key = os.getenv("AZURE_OPENAI_KEY")
    return AzureOpenAI(
        azure_endpoint=endpoint,
        api_key=key,
        api_version="2024-02-15-preview",
    )
20
+
21
+
22
def interaction(client, message_text):
    """Send ``message_text`` as a chat-completion request and return the reply.

    The deployment name is read from the ``AZURE_OPENAI_DEPLOYNAME``
    environment variable; the sampling parameters are fixed.
    """
    request = {
        "model": os.getenv("AZURE_OPENAI_DEPLOYNAME"),
        "messages": message_text,
        "temperature": 0.7,
        "max_tokens": 800,
        "top_p": 0.95,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "stop": None,
    }
    return client.chat.completions.create(**request)
35
+
36
+
37
def annotate(prediction_set, caption_files, output_dir, args):
    """Ask the chat model for a temporal-understanding score for each sample.

    Args:
        prediction_set: Mapping from sample key to a dict with the keys
            ``'q'`` (question), ``'a'`` (correct answer) and ``'p'``
            (predicted answer).
        caption_files: File names of the form ``"<key>.json"`` to process.
        output_dir: Directory in which ``<key>.json`` result files are written.
        args: Parsed command-line arguments (not used here).

    For every sample whose response can be parsed, ``[response_dict, qa_set]``
    is written to ``<output_dir>/<key>.json``. Failures are printed and the
    file is not written, so the caller's retry loop picks the sample up again.
    """
    # The module-level `client` only exists in processes that executed the
    # __main__ block. Worker processes started with the "spawn" method
    # re-import the module without it, so fall back to building a client from
    # the environment variables instead of failing with NameError on every file.
    api_client = globals().get("client")
    if api_client is None:
        api_client = init()

    for file in tqdm(caption_files):
        key = file[:-5]  # Strip file extension
        qa_set = prediction_set[key]
        question = qa_set['q']
        answer = qa_set['a']
        pred = qa_set['p']
        try:
            # NOTE(review): the example at the end of the user prompt
            # ({''score': 4.8}) is malformed and not an integer; it is kept
            # verbatim so scores stay comparable with earlier runs.
            message = [
                {
                    "role": "system",
                    "content":
                        "You are an intelligent chatbot designed for evaluating the temporal understanding of generative outputs for video-based question-answer pairs. "
                        "Your task is to compare the predicted answer with the correct answer and determine if they correctly reflect the temporal sequence of events in the video content. Here's how you can accomplish the task:"
                        "------"
                        "##INSTRUCTIONS: "
                        "- Focus on the temporal consistency between the predicted answer and the correct answer. The predicted answer should correctly reflect the sequence of events or details as they are presented in the video content.\n"
                        "- Consider synonyms or paraphrases as valid matches, but only if the temporal order is maintained.\n"
                        "- Evaluate the temporal accuracy of the prediction compared to the answer."
                },
                {
                    "role": "user",
                    "content":
                        "Please evaluate the following video-based question-answer pair:\n\n"
                        f"Question: {question}\n"
                        f"Correct Answer: {answer}\n"
                        f"Predicted Answer: {pred}\n\n"
                        "Provide your evaluation only as a temporal accuracy score where the temporal accuracy score is an integer value between 0 and 5, with 5 indicating the highest level of temporal consistency. "
                        "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the temporal accuracy score in INTEGER, not STRING."
                        "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                        "For example, your response should look like this: {''score': 4.8}."
                }
            ]

            completion = interaction(api_client, message)
            # Convert response to a Python dictionary.
            response_message = completion.choices[0].message.content
            response_dict = ast.literal_eval(response_message)

            # main() later computes int(result[0]['score']); reject responses
            # that would make that fail so they are retried instead of saved.
            if not isinstance(response_dict, dict) or 'score' not in response_dict:
                raise ValueError(f"response has no 'score' entry: {response_message!r}")
            int(response_dict['score'])

            result_qa_pair = [response_dict, qa_set]

            # Save the question-answer pairs to a json file.
            with open(f"{output_dir}/{key}.json", "w") as f:
                json.dump(result_qa_pair, f)

        except Exception as e:
            print(f"Error processing file '{key}': {e}")
84
+
85
+
86
def main(args):
    """Score all predictions for temporal understanding and print the average.

    Reads one prediction record per line from ``args.pred_path``, makes the
    ``video_name`` of each record unique by appending an occurrence index,
    runs ``annotate`` in a process pool until every sample has a result file
    in ``args.output_dir``, merges those files into ``args.output_json`` and
    prints the mean score.

    Args:
        args: Namespace with ``pred_path``, ``output_dir``, ``output_json``
            and ``num_tasks``.
    """
    # SECURITY NOTE: eval() executes arbitrary Python found in the prediction
    # file. Only run this script on prediction files you produced yourself.
    with open(args.pred_path, 'r') as pred_file:
        # Blank lines (e.g. a trailing newline) are skipped rather than eval'd.
        pred_contents = [eval(line) for line in pred_file if line.strip()]

    # Dictionary to store the count of occurrences for each video_id
    video_id_counts = {}
    new_pred_contents = []

    # Iterate through each sample in pred_contents
    for sample in pred_contents:
        video_id = sample['video_name']
        if video_id in video_id_counts:
            video_id_counts[video_id] += 1
        else:
            video_id_counts[video_id] = 0

        # The record is renamed in place (no copy is made).
        new_sample = sample
        new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
        new_pred_contents.append(new_sample)

    # Generating list of id's and corresponding files
    id_list = [x['video_name'] for x in new_pred_contents]
    caption_files = [f"{sample_id}.json" for sample_id in id_list]

    output_dir = args.output_dir
    # Generate output directory if not exists.
    os.makedirs(output_dir, exist_ok=True)

    # Preparing dictionary of question-answer sets
    prediction_set = {}
    for sample in new_pred_contents:
        prediction_set[sample['video_name']] = {
            "q": sample['Q'],
            "a": sample['A'],
            "p": sample['P'],
        }

    # A value below 1 would make the split below divide by zero on every pass.
    num_tasks = max(1, args.num_tasks)

    # Loop until every sample has a result file. Samples whose request or
    # parsing fails are simply retried on the next pass, so a sample that
    # always fails keeps this loop running.
    while True:
        try:
            # Files that have already been processed (set for O(1) lookups).
            completed_files = set(os.listdir(output_dir))
            print(f"completed_files: {len(completed_files)}")

            # Files that have not been processed yet.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            # Break the loop when there are no incomplete files
            if len(incomplete_files) == 0:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split tasks into parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir, args) for part in all_parts]

            # Use a pool of workers to process the files in parallel.
            with Pool() as pool:
                pool.starmap(annotate, task_args)

        except Exception as e:
            print(f"Error: {e}")

    # Combine all the processed files into one
    combined_contents = {}
    json_path = args.output_json

    # Iterate through json files
    for file_name in os.listdir(output_dir):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                content = json.load(json_file)
                combined_contents[file_name[:-5]] = content

    # Write combined content to a json file
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Without any result there is nothing to average.
    if not combined_contents:
        print("No evaluation results found; average score is undefined.")
        return

    # Calculate average score
    score_sum = sum(int(result[0]['score']) for result in combined_contents.values())
    average_score = score_sum / len(combined_contents)

    print("Average score temporal understanding:", average_score)
186
+
187
+
188
if __name__ == "__main__":
    # Command-line entry point: every option is mandatory.
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    cli_options = (
        ("--pred-path", {}, "The path to file containing prediction."),
        ("--output-dir", {}, "The path to save annotation json files."),
        ("--output-json", {}, "The path to save annotation final combined json file."),
        ("--num-tasks", {"type": int}, "Number of splits."),
        ("--api-key", {"type": str}, "Azure Openai API key."),
        ("--api-endpoint", {"type": str}, "Azure Openai API endpoint."),
        ("--api-deployname", {"type": str}, "Azure Openai API deployname."),
    )
    for flag, extra, help_text in cli_options:
        parser.add_argument(flag, required=True, help=help_text, **extra)
    args = parser.parse_args()

    # Publish the Azure OpenAI credentials through the environment, which is
    # where init() and interaction() read them from.
    os.environ.update({
        "AZURE_OPENAI_KEY": args.api_key,
        "AZURE_OPENAI_ENDPOINT": args.api_endpoint,
        "AZURE_OPENAI_DEPLOYNAME": args.api_deployname,
    })

    # Module-level client used by annotate().
    client = init()

    main(args)
videollama2/eval/eval_video_oqa_vcgpt_5_consistency.py ADDED
@@ -0,0 +1,218 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import json
4
+ import ast
5
+ import traceback
6
+ from tqdm import tqdm
7
+ from multiprocessing.pool import Pool
8
+
9
+ from openai import AzureOpenAI
10
+
11
+
12
def init():
    """Create an Azure OpenAI client.

    The endpoint and key are read from the ``AZURE_OPENAI_ENDPOINT`` and
    ``AZURE_OPENAI_KEY`` environment variables; the API version is pinned to
    ``2024-02-15-preview``.
    """
    endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
    key = os.getenv("AZURE_OPENAI_KEY")
    return AzureOpenAI(
        azure_endpoint=endpoint,
        api_key=key,
        api_version="2024-02-15-preview",
    )
20
+
21
+
22
def interaction(client, message_text):
    """Send ``message_text`` as a chat-completion request and return the reply.

    The deployment name is read from the ``AZURE_OPENAI_DEPLOYNAME``
    environment variable; the sampling parameters are fixed.
    """
    request = {
        "model": os.getenv("AZURE_OPENAI_DEPLOYNAME"),
        "messages": message_text,
        "temperature": 0.7,
        "max_tokens": 800,
        "top_p": 0.95,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "stop": None,
    }
    return client.chat.completions.create(**request)
35
+
36
+
37
def annotate(prediction_set, caption_files, output_dir, args):
    """Ask the chat model for a consistency score for each sample.

    Args:
        prediction_set: Mapping from sample key to a dict with the keys
            ``'q1'``, ``'q2'`` (the two questions), ``'a'`` (shared correct
            answer) and ``'p1'``, ``'p2'`` (the two predicted answers).
        caption_files: File names of the form ``"<key>.json"`` to process.
        output_dir: Directory in which ``<key>.json`` result files are written.
        args: Parsed command-line arguments (not used here).

    For every sample whose response can be parsed, ``[response_dict, qa_set]``
    is written to ``<output_dir>/<key>.json``. Failures are printed and the
    file is not written, so the caller's retry loop picks the sample up again.
    """
    # The module-level `client` only exists in processes that executed the
    # __main__ block. Worker processes started with the "spawn" method
    # re-import the module without it, so fall back to building a client from
    # the environment variables instead of failing with NameError on every file.
    api_client = globals().get("client")
    if api_client is None:
        api_client = init()

    for file in tqdm(caption_files):
        key = file[:-5]  # Strip file extension
        qa_set = prediction_set[key]
        question1 = qa_set['q1']
        question2 = qa_set['q2']
        answer = qa_set['a']
        pred1 = qa_set['p1']
        pred2 = qa_set['p2']
        try:
            # NOTE(review): the example at the end of the user prompt
            # ({''score': 4.8}) is malformed and not an integer; it is kept
            # verbatim so scores stay comparable with earlier runs.
            message = [
                {
                    "role": "system",
                    "content":
                        "You are an intelligent chatbot designed for evaluating the consistency of generative outputs for similar video-based question-answer pairs. "
                        "You will be given two very similar questions, a common answer common to both the questions and predicted answers for the two questions ."
                        "Your task is to compare the predicted answers for two very similar question, with a common correct answer and determine if they are consistent. Here's how you can accomplish the task:"
                        "------"
                        "##INSTRUCTIONS: "
                        "- Focus on the consistency between the two predicted answers and the correct answer. Both predicted answers should correspond to the correct answer and to each other, and should not contain any contradictions or significant differences in the conveyed information.\n"
                        "- Both predicted answers must be consistent with each other and the correct answer, in terms of the information they provide about the video content.\n"
                        "- Consider synonyms or paraphrases as valid matches, but only if they maintain the consistency in the conveyed information.\n"
                        "- Evaluate the consistency of the two predicted answers compared to the correct answer."
                },
                {
                    "role": "user",
                    "content":
                        "Please evaluate the following video-based question-answer pair:\n\n"
                        f"Question 1: {question1}\n"
                        f"Question 2: {question2}\n"
                        f"Correct Answer: {answer}\n"
                        f"Predicted Answer to Question 1: {pred1}\n"
                        f"Predicted Answer to Question 2: {pred2}\n\n"
                        "Provide your evaluation only as a consistency score where the consistency score is an integer value between 0 and 5, with 5 indicating the highest level of consistency. "
                        "Please generate the response in the form of a Python dictionary string with keys 'score', where its value is the consistency score in INTEGER, not STRING."
                        "DO NOT PROVIDE ANY OTHER OUTPUT TEXT OR EXPLANATION. Only provide the Python dictionary string. "
                        "For example, your response should look like this: {''score': 4.8}."
                }
            ]

            completion = interaction(api_client, message)
            # Convert response to a Python dictionary.
            response_message = completion.choices[0].message.content
            response_dict = ast.literal_eval(response_message)

            # main() later computes int(result[0]['score']); reject responses
            # that would make that fail so they are retried instead of saved.
            if not isinstance(response_dict, dict) or 'score' not in response_dict:
                raise ValueError(f"response has no 'score' entry: {response_message!r}")
            int(response_dict['score'])

            result_qa_pair = [response_dict, qa_set]

            # Save the question-answer pairs to a json file.
            with open(f"{output_dir}/{key}.json", "w") as f:
                json.dump(result_qa_pair, f)

        except Exception as e:
            print(f"Error processing file '{key}': {e}")
94
+
95
+
96
def main(args):
    """Score all predictions for consistency and print the average.

    Reads one prediction record per line from ``args.pred_path``, makes the
    ``video_name`` of each record unique by appending an occurrence index,
    runs ``annotate`` in a process pool until every sample has a result file
    in ``args.output_dir``, merges those files into ``args.output_json`` and
    prints the mean score.

    Args:
        args: Namespace with ``pred_path``, ``output_dir``, ``output_json``
            and ``num_tasks``.
    """
    # SECURITY NOTE: eval() executes arbitrary Python found in the prediction
    # file. Only run this script on prediction files you produced yourself.
    with open(args.pred_path, 'r') as pred_file:
        # Blank lines (e.g. a trailing newline) are skipped rather than eval'd.
        pred_contents = [eval(line) for line in pred_file if line.strip()]

    # Dictionary to store the count of occurrences for each video_id
    video_id_counts = {}
    new_pred_contents = []

    # Iterate through each sample in pred_contents
    for sample in pred_contents:
        video_id = sample['video_name']
        if video_id in video_id_counts:
            video_id_counts[video_id] += 1
        else:
            video_id_counts[video_id] = 0

        # The record is renamed in place (no copy is made).
        new_sample = sample
        new_sample['video_name'] = f"{video_id}_{video_id_counts[video_id]}"
        new_pred_contents.append(new_sample)

    # Generating list of id's and corresponding files
    id_list = [x['video_name'] for x in new_pred_contents]
    caption_files = [f"{sample_id}.json" for sample_id in id_list]

    output_dir = args.output_dir
    # Generate output directory if not exists.
    os.makedirs(output_dir, exist_ok=True)

    # Preparing dictionary of question-answer sets
    prediction_set = {}
    for sample in new_pred_contents:
        prediction_set[sample['video_name']] = {
            "q1": sample['Q1'],
            "q2": sample['Q2'],
            "a": sample['A'],
            "p1": sample['P1'],
            "p2": sample['P2'],
        }

    # A value below 1 would make the split below divide by zero on every pass.
    num_tasks = max(1, args.num_tasks)

    # Loop until every sample has a result file. Samples whose request or
    # parsing fails are simply retried on the next pass, so a sample that
    # always fails keeps this loop running.
    while True:
        try:
            # Files that have already been processed (set for O(1) lookups).
            completed_files = set(os.listdir(output_dir))
            print(f"completed_files: {len(completed_files)}")

            # Files that have not been processed yet.
            incomplete_files = [f for f in caption_files if f not in completed_files]
            print(f"incomplete_files: {len(incomplete_files)}")

            # Break the loop when there are no incomplete files
            if len(incomplete_files) == 0:
                break
            if len(incomplete_files) <= num_tasks:
                num_tasks = 1

            # Split tasks into parts.
            part_len = len(incomplete_files) // num_tasks
            all_parts = [incomplete_files[i:i + part_len] for i in range(0, len(incomplete_files), part_len)]
            task_args = [(prediction_set, part, args.output_dir, args) for part in all_parts]

            # Use a pool of workers to process the files in parallel.
            with Pool() as pool:
                pool.starmap(annotate, task_args)

        except Exception as e:
            print(f"Error: {e}")

    # Combine all the processed files into one
    combined_contents = {}
    json_path = args.output_json

    # Iterate through json files
    for file_name in os.listdir(output_dir):
        if file_name.endswith(".json"):
            file_path = os.path.join(output_dir, file_name)
            with open(file_path, "r") as json_file:
                content = json.load(json_file)
                combined_contents[file_name[:-5]] = content

    # Write combined content to a json file
    with open(json_path, "w") as json_file:
        json.dump(combined_contents, json_file)
    print("All evaluation completed!")

    # Without any result there is nothing to average.
    if not combined_contents:
        print("No evaluation results found; average score is undefined.")
        return

    # Calculate average score
    score_sum = sum(int(result[0]['score']) for result in combined_contents.values())
    average_score = score_sum / len(combined_contents)

    print("Average score for consistency:", average_score)
198
+
199
+
200
if __name__ == "__main__":
    # Command-line entry point: every option is mandatory.
    parser = argparse.ArgumentParser(description="question-answer-generation-using-gpt-3")
    cli_options = (
        ("--pred-path", {}, "The path to file containing prediction."),
        ("--output-dir", {}, "The path to save annotation json files."),
        ("--output-json", {}, "The path to save annotation final combined json file."),
        ("--num-tasks", {"type": int}, "Number of splits."),
        ("--api-key", {"type": str}, "Azure Openai API key."),
        ("--api-endpoint", {"type": str}, "Azure Openai API endpoint."),
        ("--api-deployname", {"type": str}, "Azure Openai API deployname."),
    )
    for flag, extra, help_text in cli_options:
        parser.add_argument(flag, required=True, help=help_text, **extra)
    args = parser.parse_args()

    # Publish the Azure OpenAI credentials through the environment, which is
    # where init() and interaction() read them from.
    os.environ.update({
        "AZURE_OPENAI_KEY": args.api_key,
        "AZURE_OPENAI_ENDPOINT": args.api_endpoint,
        "AZURE_OPENAI_DEPLOYNAME": args.api_deployname,
    })

    # Module-level client used by annotate().
    client = init()

    main(args)
videollama2/eval/inference_video_cap_msvc.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import os
3
+ import argparse
4
+ import json
5
+ import warnings
6
+ from tqdm import tqdm
7
+
8
+ from torch.utils.data import Dataset, DataLoader
9
+
10
+ import sys
11
+ sys.path.append('./')
12
+ from videollama2 import model_init, mm_infer
13
+ from videollama2.utils import disable_torch_init
14
+
15
+ # NOTE: Ignore TypedStorage warning, which refers to this link~(https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560)
16
+ warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
17
+
18
+
19
def split_list(lst, n):
    """Split a list into at most n (roughly) equal-sized chunks.

    Args:
        lst: The list to split.
        n: Desired number of chunks.

    Returns:
        A list of consecutive slices of ``lst``. Fewer than ``n`` chunks are
        returned when ``lst`` is too short; an empty ``lst`` yields ``[]``.
    """
    # An empty list would give a chunk size of 0, which range() rejects as a step.
    if not lst:
        return []
    chunk_size = math.ceil(len(lst) / n)  # ceiling division
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
23
+
24
+
25
def get_chunk(lst, n, k):
    """Return the k-th chunk produced by ``split_list(lst, n)``."""
    return split_list(lst, n)[k]
28
+
29
+
30
class MSVCDataset(Dataset):
    """Dataset yielding one processed video plus its question and captions."""

    video_formats = ['.mp4', '.webm', '.avi', '.mov', '.mkv']

    def __init__(self, folder, questions, processor):
        # folder: directory that the samples' 'video_path' entries are joined to.
        # questions: sequence of dicts with 'video_path', 'question', 'captions'.
        # processor: callable applied to the joined video path.
        self.folder = folder
        self.questions = questions
        self.processor = processor

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        """Return the processed video, its name, the question and the captions."""
        record = self.questions[idx]
        name = record['video_path']
        prompt = record['question']
        captions = record['captions']

        processed = self.processor(os.path.join(self.folder, name))

        item = {}
        item['video'] = processed
        item['video_name'] = name
        item['question'] = prompt
        item['answer'] = captions
        return item
58
+
59
+
60
def collate_fn(batch):
    """Regroup a batch of sample dicts into four parallel lists.

    Returns ``(videos, video_names, questions, answers)``.
    """
    fields = ('video', 'video_name', 'question', 'answer')
    vid, v_id, qus, ans = ([item[name] for item in batch] for name in fields)
    return vid, v_id, qus, ans
66
+
67
+
68
def run_inference(args):
    """Run video captioning inference and write one JSON line per sample.

    Args:
        args: Namespace with ``model_path``, ``question_file``,
            ``video_folder``, ``output_file``, ``num_chunks``, ``chunk_idx``,
            ``batch_size`` (must be 1) and ``num_workers``.
    """
    disable_torch_init()

    model, processor, tokenizer = model_init(args.model_path)

    with open(args.question_file, "r") as question_file:
        gt_questions = json.load(question_file)
    gt_questions = get_chunk(gt_questions, args.num_chunks, args.chunk_idx)

    # Checked before the output file is opened so an invalid batch size does
    # not truncate an existing result file.
    assert args.batch_size == 1, "Batch size must be 1 for inference"
    dataset = MSVCDataset(args.video_folder, gt_questions, processor['video'])
    dataloader = DataLoader(dataset, shuffle=False, batch_size=args.batch_size, num_workers=args.num_workers, collate_fn=collate_fn)

    # os.makedirs('') raises, so only create the directory when the output
    # path actually has a directory component.
    output_dir = os.path.dirname(args.output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # The context manager closes the file even if inference fails midway.
    with open(args.output_file, "w") as ans_file:
        # Iterate over each sample in the ground truth file
        for video_tensors, video_names, questions, answers in tqdm(dataloader):
            video_tensor = video_tensors[0]
            video_name = video_names[0]
            question = questions[0]
            answer = answers[0]

            output = mm_infer(
                video_tensor,
                question,
                model=model,
                tokenizer=tokenizer,
                modal='video',
                do_sample=False,
            )

            sample_set = {'video_name': video_name, 'question': question, 'answer': answer, 'pred': output}
            ans_file.write(json.dumps(sample_set) + "\n")
104
+
105
+
106
if __name__ == "__main__":
    # Command-line entry point.
    parser = argparse.ArgumentParser()

    # Mandatory path arguments.
    for flag, help_text in (
        ('--model-path', ''),
        ('--video-folder', 'Directory containing video files.'),
        ('--question-file', 'Path to the ground truth file containing question.'),
        ('--output-file', 'Directory to save the model results JSON.'),
    ):
        parser.add_argument(flag, help=help_text, required=True)

    # Optional arguments with their type and default.
    for flag, kind, fallback in (
        ("--num-chunks", int, 1),
        ("--chunk-idx", int, 0),
        ("--device", str, 'cuda:0'),
        ("--batch-size", int, 1),
        ("--num-workers", int, 8),
    ):
        parser.add_argument(flag, type=kind, default=fallback)

    args = parser.parse_args()

    run_inference(args)
videollama2/eval/inference_video_mcqa_egoschema.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import math
4
+ import json
5
+ import argparse
6
+ import warnings
7
+ import traceback
8
+
9
+ from tqdm import tqdm
10
+ from torch.utils.data import Dataset, DataLoader
11
+
12
+ import sys
13
+ sys.path.append('./')
14
+ from videollama2 import model_init, mm_infer
15
+ from videollama2.utils import disable_torch_init
16
+
17
+ # NOTE: Ignore TypedStorage warning, which refers to this link~(https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560)
18
+ warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
19
+
20
+
21
def split_list(lst, n):
    """Split a list into at most n (roughly) equal-sized chunks.

    Args:
        lst: The list to split.
        n: Desired number of chunks.

    Returns:
        A list of consecutive slices of ``lst``. Fewer than ``n`` chunks are
        returned when ``lst`` is too short; an empty ``lst`` yields ``[]``.
    """
    # An empty list would give a chunk size of 0, which range() rejects as a step.
    if not lst:
        return []
    chunk_size = math.ceil(len(lst) / n)  # ceiling division
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
25
+
26
+
27
def get_chunk(lst, n, k):
    """Return the k-th chunk produced by ``split_list(lst, n)``."""
    return split_list(lst, n)[k]
30
+
31
+
32
class EgoschemaDataset(Dataset):
    """Dataset yielding a processed video and a five-option MCQ prompt."""

    # Extensions tried, in order, when locating the video for a question.
    video_formats = ['.mp4', '.avi', '.mov', '.mkv']

    def __init__(self, data_folder, data_list, processor):
        # data_folder: directory searched for "<q_uid><ext>" video files.
        # data_list: sequence of dicts with 'q_uid', 'question' and
        #   'option 0' .. 'option 4'.
        # processor: callable applied to the resolved video path.
        self.data_folder = data_folder
        self.data_list = data_list
        self.processor = processor

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, idx):
        """Return ``{'q_uid', 'video', 'instruct'}`` for sample ``idx``.

        Raises:
            FileNotFoundError: If no file named ``<q_uid><ext>`` exists in
                ``data_folder`` for any extension in ``video_formats``.
        """
        line = self.data_list[idx]
        q_uid = line['q_uid']

        for fmt in self.video_formats:
            temp_path = os.path.join(self.data_folder, f"{q_uid}{fmt}")
            if os.path.exists(temp_path):
                video_path = temp_path
                break
        else:
            # Without this, `video_path` would be unbound below and the
            # failure would surface as an opaque UnboundLocalError.
            raise FileNotFoundError(
                f"No video found for q_uid '{q_uid}' in '{self.data_folder}' "
                f"(tried extensions: {', '.join(self.video_formats)})"
            )

        video_tensor = self.processor(video_path)

        question = line['question']
        a0 = line['option 0']
        a1 = line['option 1']
        a2 = line['option 2']
        a3 = line['option 3']
        a4 = line['option 4']

        instruct = f'Select the best answer to the following multiple-choice question based on the video.\n{question}\nOptions:\n(A) {a0}\n(B) {a1}\n(C) {a2}\n(D) {a3}\n(E) {a4}\nAnswer with the option\'s letter from the given choices directly and only give the best option. The best answer is: '

        return {
            'q_uid': q_uid,
            'video': video_tensor,
            'instruct': instruct,
        }
72
+
73
+
74
def build_egoschema_eval(args, processor):
    """Build the DataLoader over this worker's chunk of the question file.

    Args:
        args: Namespace with ``question_file``, ``num_chunks``, ``chunk_idx``,
            ``video_folder``, ``batch_size`` and ``num_workers``.
        processor: Video processor handed to ``EgoschemaDataset``.

    Returns:
        A non-shuffling ``DataLoader`` over the selected questions.
    """
    # Use a context manager so the question file is closed after loading.
    with open(args.question_file, "r") as question_file:
        questions = json.load(question_file)
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
    dataset = EgoschemaDataset(args.video_folder, questions, processor)
    dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)

    return dataloader
81
+
82
+
83
def egoschema_dump(ans_file, line, outputs):
    """Parse the predicted option letters and append them to ``ans_file``.

    Args:
        ans_file: Writable text file object.
        line: Batch dict whose ``'q_uid'`` and ``'instruct'`` entries are
            indexable per output.
        outputs: Model outputs, one string per sample.

    For each output a line ``"<q_uid>, <index>"`` is written, where the index
    is the position of the first detected letter in ``A``-``E``. When no
    letter can be parsed the traceback is printed and index 2 ('C') is used.
    """
    letters = ['A', 'B', 'C', 'D', 'E']
    # Raw string: the original non-raw literal relied on invalid escape
    # sequences, which newer Python versions warn about.
    letter_pattern = re.compile(r'[\(\ ]*[A-E][\)\ ]*')

    for idx, output in enumerate(outputs):
        q_uid = line['q_uid'][idx]
        instruct = line['instruct'][idx]

        # Drop the word "answer" so its capital 'A' is not taken as option A.
        output = output.replace('answer', '')
        output = output.replace('Answer', '')
        pred_answer = letter_pattern.findall(output)
        try:
            if len(pred_answer) < 1:
                raise ValueError('The video \"{}\" instruct: \n\"{}\"\n output: \n\"{}\"\n is not in the expected format'.format(line['q_uid'], instruct, output))
            # The pattern also matches spaces inside the parentheses, e.g.
            # "( A )", so spaces are stripped together with the parentheses.
            pred_answer = pred_answer[0].strip('() ')
            pred_idx = letters.index(pred_answer)
        except Exception:
            traceback.print_exc()
            pred_idx = 2

        ans_file.write(f'{q_uid}, {pred_idx}\n')
103
+
104
+
105
def run_inference(args):
    """Run multiple-choice inference and write ``"<q_uid>, <index>"`` lines.

    Args:
        args: Namespace with ``model_path``, ``answer_file`` and the fields
            read by ``build_egoschema_eval``.
    """
    disable_torch_init()

    model, processor, tokenizer = model_init(args.model_path)

    answer_file = os.path.expanduser(args.answer_file)
    # os.makedirs('') raises, so only create the directory when the answer
    # path actually has a directory component.
    answer_dir = os.path.dirname(answer_file)
    if answer_dir:
        os.makedirs(answer_dir, exist_ok=True)

    # The context manager closes the file even if the loop fails midway.
    with open(answer_file, "w") as ans_file:
        val_loader = build_egoschema_eval(args, processor['video'])

        # Iterate over each sample in the ground truth file
        for line in tqdm(val_loader):
            video_tensor = line['video'][0]
            instruct = line['instruct'][0]

            try:
                pred = mm_infer(
                    video_tensor,
                    instruct,
                    model=model,
                    tokenizer=tokenizer,
                    modal='video',
                    do_sample=False,
                )
            except Exception:
                # Best effort: log the failure and fall back to option C.
                # (A bare `except:` would also swallow KeyboardInterrupt.)
                traceback.print_exc()
                pred = 'C'

            egoschema_dump(ans_file, line, [pred])
137
+
138
+
139
if __name__ == "__main__":
    # Command-line entry point.
    parser = argparse.ArgumentParser(description='Multiple-Choice Video QA Evaluation Script.')

    # Mandatory path arguments.
    for flag, help_text in (
        ('--model-path', ''),
        ('--video-folder', 'Directory containing video files.'),
        ('--question-file', 'Path to the ground truth file containing question.'),
        ('--answer-file', 'Path to the ground truth file containing answers.'),
    ):
        parser.add_argument(flag, help=help_text, required=True)

    # Optional arguments with their type and default.
    for flag, kind, fallback in (
        ("--num-chunks", int, 1),
        ("--chunk-idx", int, 0),
        ("--device", str, 'cuda:0'),
        ("--batch-size", int, 1),
        ("--num-workers", int, 8),
    ):
        parser.add_argument(flag, type=kind, default=fallback)

    args = parser.parse_args()

    run_inference(args)
videollama2/eval/inference_video_mcqa_mvbench.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import math
4
+ import json
5
+ import argparse
6
+ import warnings
7
+ import traceback
8
+
9
+ import torch
10
+ import numpy as np
11
+ from PIL import Image
12
+ from tqdm import tqdm
13
+ from decord import VideoReader, cpu
14
+ from torch.utils.data import Dataset, DataLoader
15
+
16
+ import sys
17
+ sys.path.append('./')
18
+ from videollama2 import model_init, mm_infer
19
+ from videollama2.utils import disable_torch_init
20
+
21
+ # NOTE: Ignore TypedStorage warning, which refers to this link~(https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560)
22
+ warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
23
+
24
+
25
def split_list(lst, n):
    """Split a list into at most n (roughly) equal-sized chunks.

    Args:
        lst: The list to split.
        n: Desired number of chunks.

    Returns:
        A list of consecutive slices of ``lst``. Fewer than ``n`` chunks are
        returned when ``lst`` is too short; an empty ``lst`` yields ``[]``.
    """
    # An empty list would give a chunk size of 0, which range() rejects as a step.
    if not lst:
        return []
    chunk_size = math.ceil(len(lst) / n)  # ceiling division
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
29
+
30
+
31
def get_chunk(lst, n, k):
    """Return the k-th chunk produced by ``split_list(lst, n)``."""
    return split_list(lst, n)[k]
34
+
35
+
36
class MVBenchDataset(Dataset):
    """Dataset yielding a processed clip and a lettered multiple-choice prompt."""

    def __init__(self, data_list, processor):
        # data_list: sequence of dicts with 'bound', 'prefix', 'task_type' and
        #   a nested 'data' dict (video, question, candidates, answer and,
        #   when 'bound' is truthy, start/end).
        # processor: callable invoked as processor(path, s=start, e=end).
        self.data_list = data_list
        self.processor = processor

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, idx):
        """Return the processed video, prompt, option letters and answer index.

        ``answer_idx`` is -1 when the answer is not among the candidates.
        """
        entry = self.data_list[idx]

        # Only entries flagged as bounded carry a start/end pair.
        if entry['bound']:
            start, end = entry['data']['start'], entry['data']['end']
        else:
            start, end = None, None

        video_path = os.path.join(entry['prefix'], entry['data']['video'])
        torch_imgs = self.processor(video_path, s=start, e=end)

        record = entry['data']
        question = record['question']
        options = record['candidates']
        answer = record['answer']
        task_type = entry['task_type']

        letters = [f"{chr(ord('A') + position)}" for position in range(len(options))]
        options_string = ''.join(
            f"({letter}) {candidate}\n" for letter, candidate in zip(letters, options)
        )

        # If the answer text appears more than once, the last position wins.
        answer_idx = -1
        for position, candidate in enumerate(options):
            if candidate == answer:
                answer_idx = position

        instruct = f'Question: {question}\nOptions:\n{options_string}Answer with the option\'s letter from the given choices directly and only give the best option.'

        return {
            'video': torch_imgs,
            'video_path': video_path,
            'instruct': instruct,
            'letters': letters,
            'options': options,
            'answer_idx': answer_idx,
            'task_type': task_type
        }
76
+
77
+
78
# MVBench task table.
# Each value is (annotation JSON name, media sub-folder, media type, bound flag):
#   * the JSON name is joined onto ``args.question_file`` and the sub-folder
#     onto ``args.video_folder`` by ``build_mvbench_eval``;
#   * the media type is "video" or "frame" (frame = read extracted frames);
#   * the bound flag is True for tasks whose samples have a start & end time.
tasks = {
    task_name: (json_name, sub_folder, media_type, has_bound)
    for task_name, json_name, sub_folder, media_type, has_bound in (
        ("Action Sequence", "action_sequence.json", "star/Charades_v1_480/", "video", True),
        ("Action Prediction", "action_prediction.json", "star/Charades_v1_480/", "video", True),
        ("Action Antonym", "action_antonym.json", "ssv2_video/", "video", False),
        ("Fine-grained Action", "fine_grained_action.json", "Moments_in_Time_Raw/videos/", "video", False),
        ("Unexpected Action", "unexpected_action.json", "FunQA_test/test/", "video", False),
        ("Object Existence", "object_existence.json", "clevrer/video_validation/", "video", False),
        ("Object Interaction", "object_interaction.json", "star/Charades_v1_480/", "video", True),
        ("Object Shuffle", "object_shuffle.json", "perception/videos/", "video", False),
        ("Moving Direction", "moving_direction.json", "clevrer/video_validation/", "video", False),
        ("Action Localization", "action_localization.json", "sta/sta_video/", "video", True),
        ("Scene Transition", "scene_transition.json", "scene_qa/video/", "video", False),
        ("Action Count", "action_count.json", "perception/videos/", "video", False),
        ("Moving Count", "moving_count.json", "clevrer/video_validation/", "video", False),
        ("Moving Attribute", "moving_attribute.json", "clevrer/video_validation/", "video", False),
        ("State Change", "state_change.json", "perception/videos/", "video", False),
        ("Fine-grained Pose", "fine_grained_pose.json", "nturgbd/", "video", False),
        ("Character Order", "character_order.json", "perception/videos/", "video", False),
        ("Egocentric Navigation", "egocentric_navigation.json", "vlnqa/", "video", False),
        ("Episodic Reasoning", "episodic_reasoning.json", "tvqa/frames_fps3_hq/", "frame", True),
        ("Counterfactual Inference", "counterfactual_inference.json", "clevrer/video_validation/", "video", False),
    )
}
100
+
101
+
102
def build_mvbench_eval(args, processor):
    """Create the MVBench DataLoader for the chunk selected by ``args``.

    Every annotation file listed in ``tasks`` is read from
    ``args.question_file`` (used as a directory) and each entry is wrapped
    with its task name, media folder, media type and bound flag. The combined
    list is then reduced to chunk ``args.chunk_idx`` of ``args.num_chunks``.

    Args:
        args: Parsed CLI namespace (question_file, video_folder, num_chunks,
            chunk_idx, batch_size, num_workers).
        processor: Video processor handed to ``MVBenchDataset``.

    Returns:
        A non-shuffling ``DataLoader`` over the selected samples.
    """
    samples = []
    for task_type, task in tasks.items():
        annotation_path = os.path.join(args.question_file, task[0])
        media_prefix = os.path.join(args.video_folder, task[1])
        with open(annotation_path, 'r') as fp:
            entries = json.load(fp)
        samples.extend(
            {
                'task_type': task_type,
                'prefix': media_prefix,
                'data_type': task[2],
                'bound': task[3],
                'data': entry,
            }
            for entry in entries
        )

    samples = get_chunk(samples, args.num_chunks, args.chunk_idx)
    return DataLoader(
        MVBenchDataset(samples, processor),
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers,
    )
122
+
123
+
124
def mvbench_dump(vid, instruct, letters, options, output):
    """Map a raw model response to the index of the predicted option.

    The response is first searched for an option letter; if none is present,
    the option texts themselves are searched (case-insensitively) in the
    response.

    Args:
        vid: Video identifier, used only in the diagnostic message.
        instruct: Prompt given to the model, used only in the diagnostic.
        letters: Sequence of option letters in order (e.g. ``('A', 'B', 'C')``).
        options: Sequence of option texts aligned with ``letters``.
        output: Raw text generated by the model.

    Returns:
        The 0-based index of the predicted option. When nothing can be
        parsed, the problem is printed and index ``2`` is returned as a
        fixed fallback guess.
    """
    # Drop the word "answer" so that its capital "A" is not read as option A.
    output = output.replace('answer', '')
    output = output.replace('Answer', '')

    try:
        # Capture ONLY the letter. The surrounding brackets/commas/spaces are
        # matched but discarded, so "(A), because ..." parses as "A" instead of
        # leaving "A)," behind and failing the lookup.
        pattern = rf'[\(,\ ]*([{letters[0]}-{letters[-1]}])[\),\ ]*'
        matched_letters = re.findall(pattern, output)

        pred_idx = None
        if matched_letters:
            pred_idx = letters.index(matched_letters[0])
        else:
            # No letter in the response: fall back to matching the option text.
            lowered_output = output.lower()
            for idx, opt in enumerate(options):
                if opt.lower() in lowered_output:
                    pred_idx = idx
                    break

        if pred_idx is None:
            # Explicit raise instead of ``assert`` so the check survives ``python -O``.
            raise ValueError(
                'The video \"{}\" instruct: \n\"{}\"\n output: \n\"{}\"\n is not in the expected format'.format(vid, instruct, output)
            )
    except Exception:
        # Best effort: report the unparsable response and keep the run going.
        traceback.print_exc()
        pred_idx = 2

    return pred_idx
150
+
151
+
152
def run_inference(args):
    """Run the model on the selected MVBench chunk and write predictions.

    One JSON object per sample (``vid``, ``task_type``, ``pred``, ``gt``) is
    written as a line to ``args.answer_file``.

    Args:
        args: Parsed CLI namespace; see the argument parser in this script.
    """
    disable_torch_init()

    model, processor, tokenizer = model_init(args.model_path)

    answer_file = os.path.expanduser(args.answer_file)
    answer_dir = os.path.dirname(answer_file)
    if answer_dir:
        # A bare filename has an empty dirname, and os.makedirs('') raises.
        os.makedirs(answer_dir, exist_ok=True)

    # The context manager closes (and flushes) the file even if inference
    # fails part-way, so already-written predictions are not lost.
    with open(answer_file, "w") as ans_file:
        val_loader = build_mvbench_eval(args, processor['video'])

        # NOTE: only support batch size 1 for now
        for line in tqdm(val_loader):
            vid = line['video_path'][0]
            video_tensor = line['video'][0]
            task_type = line['task_type'][0]
            instruct = line['instruct'][0]
            # 'letters' / 'options' arrive grouped per option position across
            # the batch; zip(*) regroups them per sample and [0] takes the
            # single sample of the batch.
            letters = list(zip(*line['letters']))[0]
            options = list(zip(*line['options']))[0]
            answer_idx = line['answer_idx'][0].item()

            output = mm_infer(
                video_tensor,
                instruct,
                model=model,
                tokenizer=tokenizer,
                modal='video',
                do_sample=False,
            )

            pred_idx = mvbench_dump(vid, instruct, letters, options, output)

            ans_file.write(json.dumps({"vid": vid, "task_type": task_type, "pred": pred_idx, "gt": answer_idx}) + '\n')
187
+
188
+
189
if __name__ == "__main__":
    # Command-line entry point for MVBench inference.
    parser = argparse.ArgumentParser()

    parser.add_argument('--model-path', help='Model path passed to model_init.', required=True)
    parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
    # The value is joined with each task's JSON name, so it must be a directory.
    parser.add_argument('--question-file', help='Directory containing the per-task MVBench annotation JSON files.', required=True)
    # This file is written by the script, it is not an input.
    parser.add_argument('--answer-file', help='Path of the output file; one JSON prediction is written per line.', required=True)
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    # NOTE(review): --device is accepted but not read by run_inference.
    parser.add_argument("--device", type=str, required=False, default='cuda:0')
    parser.add_argument("--batch-size", type=int, default=1)
    parser.add_argument("--num-workers", type=int, default=8)
    args = parser.parse_args()

    run_inference(args)
videollama2/eval/inference_video_mcqa_perception_test_mcqa.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import math
4
+ import json
5
+ import argparse
6
+ import warnings
7
+ import traceback
8
+ from tqdm import tqdm
9
+
10
+ import torch
11
+ from torch.utils.data import Dataset, DataLoader
12
+
13
+ import sys
14
+ sys.path.append('./')
15
+ from videollama2 import model_init, mm_infer
16
+ from videollama2.utils import disable_torch_init
17
+
18
+
19
def split_list(lst, n):
    """Split a list into at most ``n`` contiguous, roughly equal-sized chunks.

    Args:
        lst: Sequence to split.
        n: Requested number of chunks.

    Returns:
        A list of slices of ``lst``. The chunk size is rounded up, so fewer
        than ``n`` chunks may be returned; an empty ``lst`` yields ``[]``.
    """
    chunk_size = math.ceil(len(lst) / n)  # ceiling division: no element is dropped
    if chunk_size == 0:
        # Nothing to split; range() would reject a zero step.
        return []
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
23
+
24
+
25
def get_chunk(lst, n, k):
    """Return chunk ``k`` of ``lst`` split into ``n`` chunks.

    ``split_list`` rounds the chunk size up, so it may produce fewer than
    ``n`` chunks (5 items over 4 chunks gives only 3). A worker whose index is
    within the requested ``n`` but has no chunk receives an empty list rather
    than an ``IndexError``; indices outside ``n`` still raise.
    """
    chunks = split_list(lst, n)
    if len(chunks) <= k < n:
        return []
    return chunks[k]
28
+
29
+
30
class PerceptionTestMCQADataset(Dataset):
    """Perception Test multiple-choice samples, one item per video.

    Each item carries the processed video together with one prompt, id and
    option list for every multiple-choice question of that video.
    """

    # Extensions tried, in order, when locating a video file.
    video_formats = ['.mp4', '.avi', '.mov', '.mkv']

    def __init__(self, data_list, processor, video_folder=None):
        """
        Args:
            data_list: Records with ``metadata.video_id`` and ``mc_question``.
            processor: Callable turning a video path into the model input.
            video_folder: Directory holding the videos. When omitted, the
                script-level ``args.video_folder`` is used (legacy behavior).
        """
        self.data_list = data_list
        self.processor = processor
        if video_folder is None:
            # Resolve the folder once here instead of reading the global
            # ``args`` inside __getitem__, which runs in DataLoader workers.
            video_folder = args.video_folder
        self.video_folder = video_folder

    def __len__(self):
        return len(self.data_list)

    def _find_video(self, video_name):
        """Return the first existing ``<video_folder>/<video_name><ext>``.

        Raises:
            FileNotFoundError: If no supported extension matches.
        """
        for fmt in self.video_formats:
            candidate = os.path.join(self.video_folder, f"{video_name}{fmt}")
            if os.path.exists(candidate):
                return candidate
        raise FileNotFoundError(
            f"No video named '{video_name}' with extensions {self.video_formats} in '{self.video_folder}'"
        )

    def __getitem__(self, idx):
        line = self.data_list[idx]
        video_name = line['metadata']['video_id']
        mc_questions = line['mc_question']

        video_tensor = self.processor(self._find_video(video_name))

        instructs = []
        qids = []
        ops = []
        for q in mc_questions:
            question = q['question']
            qid = q['id']
            options = q['options']
            # The prompt always lists exactly three options (A-C).
            instruct = f'Question: {question}\nOptions:\n(A) {options[0]}\n(B) {options[1]}\n(C) {options[2]}\nAnswer with the option\'s letter from the given choices directly and only give the best option.'

            instructs.append(instruct)
            qids.append(qid)
            ops.append(options)

        return {
            'video': video_tensor,
            'video_id': video_name,
            'instructs': instructs,
            'question_ids': qids,
            'options': ops,
        }
74
+
75
+
76
def collate_fn(batch):
    """Collate dataset items into per-field lists plus one stacked video tensor.

    Returns:
        ``(videos, video_ids, instructs, question_ids, options)`` where
        ``videos`` is the items' video tensors stacked along a new first
        dimension and the other four are plain lists, one entry per item.
    """
    videos, video_ids, instructs, question_ids, options = [], [], [], [], []
    for sample in batch:
        videos.append(sample['video'])
        video_ids.append(sample['video_id'])
        instructs.append(sample['instructs'])
        question_ids.append(sample['question_ids'])
        options.append(sample['options'])
    return torch.stack(videos, dim=0), video_ids, instructs, question_ids, options
84
+
85
+
86
def run_inference(args):
    """Answer every Perception Test question of the selected chunk.

    For each video a line of the form ``"<video_id>": [<answers>],`` is
    written to ``args.answer_file``; each answer holds the question id, the
    predicted option index and the predicted option text.

    Args:
        args: Parsed CLI namespace; see the argument parser in this script.
    """
    disable_torch_init()

    model, processor, tokenizer = model_init(args.model_path)

    with open(args.question_file, "r") as f:
        questions = json.load(f)
    questions = list(questions.values())
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)

    assert args.batch_size == 1, "Batch size must be 1 for inference"
    dataset = PerceptionTestMCQADataset(questions, processor['video'])
    dataloader = DataLoader(dataset, shuffle=False, batch_size=args.batch_size, num_workers=args.num_workers, collate_fn=collate_fn)

    answer_file = os.path.expanduser(args.answer_file)
    answer_dir = os.path.dirname(answer_file)
    if answer_dir:
        # A bare filename has an empty dirname, and os.makedirs('') raises.
        os.makedirs(answer_dir, exist_ok=True)

    letters = ['(A)', '(B)', '(C)']

    # The context manager closes the file even if inference fails part-way.
    with open(answer_file, "w") as ans_file:
        # Iterate over each sample in the ground truth file
        for video_tensor, video_id, instructs, question_ids, options in tqdm(dataloader):

            # reduce batch dimension
            video_tensor = video_tensor[0]
            video_id = video_id[0]
            instructs = instructs[0]
            question_ids = question_ids[0]
            options = options[0]

            qas = []
            for idx, instruct in enumerate(instructs):
                question_id = question_ids[idx]
                _options = options[idx]

                output = mm_infer(
                    video_tensor,
                    instruct,
                    model=model,
                    tokenizer=tokenizer,
                    modal='video',
                    do_sample=False,
                )

                # Drop the word "answer" so its capital "A" is not read as option A.
                output = output.replace('answer', '')
                output = output.replace('Answer', '')
                pred_answer = re.findall(r'\(*[A-C]\)*', output)
                try:
                    if not pred_answer:
                        # Explicit raise instead of ``assert`` so the check survives ``python -O``.
                        raise ValueError(
                            'The video \"{}\" instruct: \n\"{}\"\n output: \n\"{}\"\n is not in the expected format'.format(video_id, instruct, output)
                        )
                    # Normalize "A", "(A", "A)" ... to the canonical "(A)" form.
                    letter = pred_answer[0].strip().strip('()')
                    pred_idx = letters.index(f'({letter})')
                except ValueError:
                    traceback.print_exc()
                    # No letter found: accept a response that is exactly an
                    # option text, otherwise fall back to index 2.
                    tmp_options = [x.lower() for x in _options]
                    if output.lower() in tmp_options:
                        pred_idx = tmp_options.index(output.lower())
                    else:
                        pred_idx = 2

                qas.append({'id': question_id, 'answer_id': pred_idx, 'answer': _options[pred_idx]})

            ans_file.write('\"{}\": {},\n'.format(video_id, json.dumps(qas)))
152
+
153
+
154
if __name__ == "__main__":
    # Command-line entry point for Perception Test MCQA inference.
    # ``args`` stays a module-level name because PerceptionTestMCQADataset in
    # this file reads ``args.video_folder``.
    parser = argparse.ArgumentParser()

    parser.add_argument('--model-path', help='Model path passed to model_init.', required=True)
    parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
    parser.add_argument('--question-file', help='Path to the JSON file containing the questions.', required=True)
    # This file is written by the script, it is not an input.
    parser.add_argument('--answer-file', help='Path of the output file the predictions are written to.', required=True)
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    # NOTE(review): --device and --model_max_length are accepted but not read
    # by run_inference.
    parser.add_argument("--device", type=str, required=False, default='cuda:0')
    parser.add_argument("--model_max_length", type=int, required=False, default=2048)
    parser.add_argument("--batch-size", type=int, required=False, default=1)
    parser.add_argument("--num-workers", type=int, required=False, default=8)
    args = parser.parse_args()

    run_inference(args)
videollama2/eval/inference_video_mcqa_videomme.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import math
4
+ import json
5
+ import copy
6
+ import argparse
7
+ import warnings
8
+ import traceback
9
+
10
+ import cv2
11
+ import torch
12
+ import pysubs2
13
+ import numpy as np
14
+ import pyarrow.parquet as pq
15
+ from tqdm import tqdm
16
+ from torch.utils.data import Dataset, DataLoader
17
+
18
+ import sys
19
+ sys.path.append('./')
20
+ from videollama2 import model_init, mm_infer
21
+ from videollama2.utils import disable_torch_init
22
+
23
+ # NOTE: Ignore TypedStorage warning, which refers to this link~(https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560)
24
+ warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
25
+
26
+
27
def split_list(lst, n):
    """Split a list into at most ``n`` contiguous, roughly equal-sized chunks.

    Args:
        lst: Sequence to split.
        n: Requested number of chunks.

    Returns:
        A list of slices of ``lst``. The chunk size is rounded up, so fewer
        than ``n`` chunks may be returned; an empty ``lst`` yields ``[]``.
    """
    chunk_size = math.ceil(len(lst) / n)  # ceiling division: no element is dropped
    if chunk_size == 0:
        # Nothing to split; range() would reject a zero step.
        return []
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
31
+
32
+
33
def get_chunk(lst, n, k):
    """Return chunk ``k`` of ``lst`` split into ``n`` chunks.

    ``split_list`` rounds the chunk size up, so it may produce fewer than
    ``n`` chunks (5 items over 4 chunks gives only 3). A worker whose index is
    within the requested ``n`` but has no chunk receives an empty list rather
    than an ``IndexError``; indices outside ``n`` still raise.
    """
    chunks = split_list(lst, n)
    if len(chunks) <= k < n:
        return []
    return chunks[k]
36
+
37
+
38
def get_seq_frames(total_num_frames, desired_num_frames):
    """
    Calculate the indices of frames to extract from a video.

    The range ``[0, total_num_frames - 1]`` is cut into ``desired_num_frames``
    equal segments and the middle index of each segment is returned.

    Parameters:
    total_num_frames (int): Total number of frames in the video.
    desired_num_frames (int): Desired number of frames to extract.

    Returns:
    list: List of indices of frames to extract; empty when
    ``desired_num_frames`` is not positive.
    """
    if desired_num_frames <= 0:
        # Avoids dividing by zero below; there is nothing to sample.
        return []

    # Calculate the size of each segment from which a frame will be extracted
    seg_size = float(total_num_frames - 1) / desired_num_frames

    seq = []
    for i in range(desired_num_frames):
        # Calculate the start and end indices of each segment
        start = int(np.round(seg_size * i))
        end = int(np.round(seg_size * (i + 1)))

        # Append the middle index of the segment to the list
        seq.append((start + end) // 2)

    return seq
63
+
64
+
65
class VideoMMEDataset(Dataset):
    """Video-MME samples: processed video, aligned subtitles and the raw record.

    A video that cannot be found or processed yields ``'video': None`` and an
    empty subtitle string, so the caller can mark the record as missing.
    """

    # Extensions tried, in order, when locating a video file.
    video_formats = ['.mp4', '.avi', '.mov', '.mkv']

    def __init__(self, video_folder, subtitle_folder, data_list, processor):
        self.video_folder = video_folder
        self.subtitle_folder = subtitle_folder
        self.data_list = data_list
        self.processor = processor

    def __len__(self):
        return len(self.data_list)

    def _find_video(self, video_ytid):
        """Return the first existing ``<video_folder>/<id><ext>``, or ``None``."""
        for fmt in self.video_formats:
            candidate = os.path.join(self.video_folder, f'{video_ytid}{fmt}')
            if os.path.exists(candidate):
                return candidate
        return None

    def _collect_subtitles(self, video_path, subtitle_path, num_frames):
        """Return the subtitle lines active at the sampled frames, newline-joined."""
        capture = cv2.VideoCapture(video_path)
        try:
            total_frames = int(capture.get(cv2.CAP_PROP_FRAME_COUNT))
            fps = capture.get(cv2.CAP_PROP_FPS)
        finally:
            # Release the decoder handle; it is only needed for the metadata.
            capture.release()

        if num_frames <= 0 or fps <= 0:
            # Frame indices cannot be converted to timestamps without a valid
            # frame rate, and there are no frames to align with.
            return ""

        subs = pysubs2.load(subtitle_path, encoding="utf-8")
        texts = []
        for frame_id in get_seq_frames(total_frames, num_frames):
            cur_time = pysubs2.make_time(fps=fps, frames=frame_id)
            # Take the first subtitle whose interval strictly contains the timestamp.
            for sub in subs:
                if sub.start < cur_time and sub.end > cur_time:
                    text = sub.text.replace("\\N", " ")
                    if text.strip():
                        texts.append(text)
                    break
        return "\n".join(texts)

    def __getitem__(self, idx):
        line = self.data_list[idx]

        # The YouTube id is whatever follows "watch?v=" in the record URL.
        video_ytid = line['url'].split('watch?v=')[-1]
        video_path = self._find_video(video_ytid)
        subtitle_path = os.path.join(self.subtitle_folder, f'{video_ytid}.srt')

        video_tensor = None
        num_frames = 0
        if video_path is None:
            print(f'No video file found for {video_ytid} in {self.video_folder}')
        else:
            try:
                video_tensor = self.processor(video_path)
                num_frames = video_tensor.shape[0]
            except Exception:
                # Best effort: a broken video is reported and treated as missing.
                traceback.print_exc()
                print(f'It occurs error when reading {video_ytid}')
                video_tensor = None
                num_frames = 0

        if video_tensor is not None and os.path.exists(subtitle_path):
            subtitles = self._collect_subtitles(video_path, subtitle_path, num_frames)
        else:
            subtitles = ""

        return {
            'video': video_tensor,
            'subtitle': subtitles,
            'record': line,
        }
126
+
127
+
128
def collate_fn(batch):
    """Regroup dataset items into three parallel lists.

    Returns:
        ``(videos, subtitles, records)``, each a list with one entry per item.
        Videos are left unstacked; in this file an entry may be ``None`` for a
        video that could not be loaded.
    """
    videos, subtitles, records = [], [], []
    for sample in batch:
        videos.append(sample['video'])
        subtitles.append(sample['subtitle'])
        records.append(sample['record'])
    return videos, subtitles, records
133
+
134
+
135
def load_parquet(parquet_file):
    """Load the question parquet file and group its rows by video.

    Each row describes one question. Consecutive rows that share a
    ``video_id`` are merged into a single record whose ``questions`` list
    holds them in file order.

    Args:
        parquet_file: Path to the parquet file.

    Returns:
        A list of per-video dicts (``video_id``, ``youtube_id``, ``url``,
        ``duration``, ``domain``, ``sub_category``, ``questions``).
    """
    table = pq.read_table(parquet_file)

    # Convert PyArrow Table to pandas DataFrame
    df = table.to_pandas()

    jsons = []
    for record in df.itertuples():
        question = {
            "question_id": record.question_id,
            "task_type": record.task_type,
            "question": record.question,
            "choices": list(record.options),
            "answer": record.answer,
        }

        # Start a new video record whenever the id changes. Comparing with the
        # previous id (rather than with the number of records so far) does not
        # require ids to be consecutive integers starting at 1.
        if not jsons or jsons[-1]["video_id"] != record.video_id:
            jsons.append({
                "video_id": record.video_id,
                "youtube_id": record.videoID,
                "url": record.url,
                "duration": record.duration,
                "domain": record.domain,
                "sub_category": record.sub_category,
                "questions": [question],
            })
        else:
            jsons[-1]['questions'].append(question)

    return jsons
172
+
173
+
174
def build_videomme_eval(args, processor):
    """Create the Video-MME DataLoader for the chunk selected by ``args``.

    The questions are read from the parquet file ``args.question_file``,
    reduced to chunk ``args.chunk_idx`` of ``args.num_chunks`` and wrapped in
    a ``VideoMMEDataset``.

    Returns:
        A non-shuffling ``DataLoader`` that uses this file's ``collate_fn``.
    """
    # convert parquet to json
    records = get_chunk(load_parquet(args.question_file), args.num_chunks, args.chunk_idx)
    return DataLoader(
        VideoMMEDataset(args.video_folder, args.subtitle_folder, records, processor),
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers,
        collate_fn=collate_fn,
    )
183
+
184
+
185
def videomme_dump(record, instruct, options, output):
    """Map a raw model response to one of the option letters ``A``-``D``.

    The response is first searched for an option letter; if none is present,
    the option texts (and, for single-digit options, their English words) are
    searched case-insensitively in the response.

    Args:
        record: Video record; ``record['youtube_id']`` is used in the diagnostic.
        instruct: Prompt given to the model, used only in the diagnostic.
        options: Option texts in order A, B, C, D.
        output: Raw text generated by the model.

    Returns:
        The predicted letter. When nothing can be parsed, the problem is
        printed and ``'C'`` (index 2) is returned as a fixed fallback guess.
    """
    letters = ['A', 'B', 'C', 'D']

    digit2word = {
        '1': 'one',
        '2': 'two',
        '3': 'three',
        '4': 'four',
        '5': 'five',
        '6': 'six',
        '7': 'seven',
        '8': 'eight',
        '9': 'nine',
        '0': 'zero',
    }

    # Drop the word "answer" so its capital "A" is not read as option A.
    output = output.replace('answer', '')
    output = output.replace('Answer', '')
    # The group captures only the letter; surrounding brackets, dots and
    # spaces are matched but discarded.
    matched_letters = re.findall(r'[\(\ \[]*([A-D])[\)\.\ \]]*', output)

    pred_idx = None
    try:
        if matched_letters:
            pred_idx = letters.index(matched_letters[0])
        else:
            lowered_output = output.lower()
            for idx, opt in enumerate(options):
                # Arabic numerals -> English words
                opt2 = digit2word.get(opt, opt)
                if opt.lower() in lowered_output or opt2.lower() in lowered_output:
                    pred_idx = idx
                    break

        if pred_idx is None:
            # Explicit raise instead of ``assert`` so the check survives ``python -O``.
            raise ValueError(
                'The video \"{}\" instruct: \n\"{}\"\n output: \n\"{}\"\n is not in the expected format'.format(record['youtube_id'], instruct, output)
            )
    except Exception:
        # Best effort: report the unparsable response and keep the run going.
        traceback.print_exc()
        pred_idx = 2

    return letters[pred_idx]
228
+
229
+
230
def run_inference(args):
    """Answer every Video-MME question of the selected chunk, twice.

    Each question is asked once without and once with the video's subtitles
    prepended to the prompt. Records with the predicted letters are written
    to ``args.answer_file`` and to a sibling file whose name carries a
    ``_sub`` suffix before the extension, one JSON object followed by a comma
    per line. Records whose video could not be loaded are written with
    ``"missing": true`` and no responses.

    Args:
        args: Parsed CLI namespace; see the argument parser in this script.
    """
    disable_torch_init()

    # Initialize the model
    model, processor, tokenizer = model_init(args.model_path)

    answer_file = os.path.expanduser(args.answer_file)
    # Insert "_sub" before the extension. splitext always yields a distinct
    # path, whereas replacing '.json' is a no-op for names without it (both
    # outputs would then be the same file) and could also rewrite a directory
    # name containing '.json'.
    answer_root, answer_ext = os.path.splitext(answer_file)
    answer_sub_file = f"{answer_root}_sub{answer_ext}"
    answer_dir = os.path.dirname(answer_file)
    if answer_dir:
        # A bare filename has an empty dirname, and os.makedirs('') raises.
        os.makedirs(answer_dir, exist_ok=True)

    # The context manager closes both files even if inference fails part-way.
    with open(answer_file, "w") as ans_file, open(answer_sub_file, "w") as ans_sub_file:
        val_loader = build_videomme_eval(args, processor['video'])

        # Iterate over each sample in the ground truth file
        for videos, subtitles, records in tqdm(val_loader):
            video_tensor = videos[0]
            subtitle = subtitles[0]
            record = records[0]

            new_record = copy.deepcopy(record)
            new_record_sub = copy.deepcopy(record)

            if video_tensor is None:
                new_record['missing'] = True
                ans_file.write(json.dumps(new_record) + ",\n")
                new_record_sub['missing'] = True
                ans_sub_file.write(json.dumps(new_record_sub) + ",\n")
                continue

            new_record['missing'] = False
            new_record_sub['missing'] = False

            questions = record['questions']
            for idx, question in enumerate(questions):
                q = question['question']
                choices = question['choices']
                # Strip the "X. " prefix and the final character of each choice
                # (presumably a trailing period - verify against the data).
                options = [re.findall(r'[A-D]\. (.*).', c)[0] for c in choices]

                instruct = "Select the best answer to the following multiple-choice question based on the video. Respond with only the letter (A, B, C, or D) of the correct option.\n"
                instruct += f"{q}\n"
                for cho in choices:
                    instruct += f"{cho}\n"
                instruct += "Answer with the option\'s letter from the given choices directly and only give the best option. The best answer is: "
                output = mm_infer(video_tensor, instruct, model=model, tokenizer=tokenizer, modal='video', do_sample=False)
                new_record['questions'][idx]['response'] = videomme_dump(record, instruct, options, output)

                # Second pass: same question with the subtitles prepended.
                instruct = f"This video's subtitles are listed below:\n{subtitle}\n" + instruct
                output = mm_infer(video_tensor, instruct, model=model, tokenizer=tokenizer, modal='video', do_sample=False)
                new_record_sub['questions'][idx]['response'] = videomme_dump(record, instruct, options, output)

            ans_file.write(json.dumps(new_record) + ",\n")
            ans_sub_file.write(json.dumps(new_record_sub) + ",\n")
287
+
288
+
289
if __name__ == "__main__":
    # Command-line entry point for Video-MME inference.
    parser = argparse.ArgumentParser()

    parser.add_argument('--model-path', help='Model path passed to model_init.', required=True)
    parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
    parser.add_argument('--subtitle-folder', help='Directory containing subtitle files.', required=True)
    # The file is read with pyarrow, so it must be a parquet file.
    parser.add_argument('--question-file', help='Path to the parquet file containing the questions.', required=True)
    # This file is written by the script; a sibling "_sub" file is written too.
    parser.add_argument('--answer-file', help='Path of the output file the predictions are written to.', required=True)
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    # NOTE(review): --device is accepted but not read by run_inference.
    parser.add_argument("--device", type=str, required=False, default='cuda:0')
    parser.add_argument("--batch-size", type=int, default=1)
    parser.add_argument("--num-workers", type=int, default=8)
    args = parser.parse_args()

    run_inference(args)
videollama2/eval/inference_video_oqa_activitynet.py ADDED
@@ -0,0 +1,146 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import math
4
+ import argparse
5
+ import warnings
6
+ import traceback
7
+ from tqdm import tqdm
8
+
9
+ from torch.utils.data import Dataset, DataLoader
10
+
11
+ import sys
12
+ sys.path.append('./')
13
+ from videollama2 import model_init, mm_infer
14
+ from videollama2.utils import disable_torch_init
15
+
16
+ # NOTE: Ignore TypedStorage warning, which refers to this link~(https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560)
17
+ warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
18
+
19
+
20
def split_list(lst, n):
    """Split a list into at most ``n`` contiguous, roughly equal-sized chunks.

    Args:
        lst: Sequence to split.
        n: Requested number of chunks.

    Returns:
        A list of slices of ``lst``. The chunk size is rounded up, so fewer
        than ``n`` chunks may be returned; an empty ``lst`` yields ``[]``.
    """
    chunk_size = math.ceil(len(lst) / n)  # ceiling division: no element is dropped
    if chunk_size == 0:
        # Nothing to split; range() would reject a zero step.
        return []
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
24
+
25
+
26
def get_chunk(lst, n, k):
    """Return chunk ``k`` of ``lst`` split into ``n`` chunks.

    ``split_list`` rounds the chunk size up, so it may produce fewer than
    ``n`` chunks (5 items over 4 chunks gives only 3). A worker whose index is
    within the requested ``n`` but has no chunk receives an empty list rather
    than an ``IndexError``; indices outside ``n`` still raise.
    """
    chunks = split_list(lst, n)
    if len(chunks) <= k < n:
        return []
    return chunks[k]
29
+
30
+
31
class ActivitynetDataset(Dataset):
    """Open-ended video QA samples pairing each question with its answer.

    ``questions[i]`` and ``answers[i]`` are assumed to describe the same
    sample; they are looked up with the same index.
    """

    # Extensions tried, in order, when locating a video file.
    video_formats = ['.mp4', '.webm', '.avi', '.mov', '.mkv']

    def __init__(self, questions, answers, processor, video_folder=None):
        """
        Args:
            questions: Records with ``video_name``, ``question``, ``question_id``.
            answers: Records with ``answer``, aligned with ``questions``.
            processor: Callable turning a video path into the model input.
            video_folder: Directory holding the videos. When omitted, the
                script-level ``args.video_folder`` is used (legacy behavior).
        """
        self.questions = questions
        self.answers = answers
        self.processor = processor
        if video_folder is None:
            # Resolve the folder once here instead of reading the global
            # ``args`` inside __getitem__, which runs in DataLoader workers.
            video_folder = args.video_folder
        self.video_folder = video_folder

    def __len__(self):
        return len(self.questions)

    def _find_video(self, video_name):
        """Return the first existing video file for ``video_name``.

        For every extension the ``v_``-prefixed name is tried first, then the
        bare name (compatibility with MSVD, MSRVTT and TGIF file naming).

        Raises:
            FileNotFoundError: If no candidate exists.
        """
        for fmt in self.video_formats:
            for file_name in (f"v_{video_name}{fmt}", f"{video_name}{fmt}"):
                candidate = os.path.join(self.video_folder, file_name)
                if os.path.exists(candidate):
                    return candidate
        raise FileNotFoundError(
            f"No video named '{video_name}' with extensions {self.video_formats} in '{self.video_folder}'"
        )

    def __getitem__(self, idx):
        sample = self.questions[idx]
        answer = self.answers[idx]

        video_name = sample['video_name']
        question = sample['question']
        question_id = sample['question_id']
        answer = answer['answer']

        video_tensor = self.processor(self._find_video(video_name))

        return {
            'video': video_tensor,
            'video_name': video_name,
            'question': question,
            'question_id': question_id,
            'answer': answer,
        }
72
+
73
+
74
def collate_fn(batch):
    """Regroup dataset items into five parallel lists.

    Returns:
        ``(videos, video_names, questions, question_ids, answers)``, each a
        list with one entry per item; videos are left unstacked.
    """
    videos, video_names, questions, question_ids, answers = [], [], [], [], []
    for sample in batch:
        videos.append(sample['video'])
        video_names.append(sample['video_name'])
        questions.append(sample['question'])
        question_ids.append(sample['question_id'])
        answers.append(sample['answer'])
    return videos, video_names, questions, question_ids, answers
81
+
82
+
83
def run_inference(args):
    """Answer every open-ended question of the selected chunk.

    One JSON object per question (``id``, ``question``, ``answer``, ``pred``)
    is written as a line to ``args.output_file``. A question whose inference
    raises is recorded with the prediction ``"error"``.

    Args:
        args: Parsed CLI namespace; see the argument parser in this script.
    """
    disable_torch_init()

    # Initialize the model
    model, processor, tokenizer = model_init(args.model_path)

    # Questions and answers are chunked identically so they stay aligned.
    with open(args.question_file, "r") as f:
        gt_questions = json.load(f)
    gt_questions = get_chunk(gt_questions, args.num_chunks, args.chunk_idx)
    with open(args.answer_file, "r") as f:
        gt_answers = json.load(f)
    gt_answers = get_chunk(gt_answers, args.num_chunks, args.chunk_idx)

    assert args.batch_size == 1, "Batch size must be 1 for inference"
    dataset = ActivitynetDataset(gt_questions, gt_answers, processor['video'])
    dataloader = DataLoader(dataset, shuffle=False, batch_size=args.batch_size, num_workers=args.num_workers, collate_fn=collate_fn)

    answer_file = os.path.expanduser(args.output_file)
    answer_dir = os.path.dirname(answer_file)
    if answer_dir:
        # A bare filename has an empty dirname, and os.makedirs('') raises.
        os.makedirs(answer_dir, exist_ok=True)

    # The context manager closes the file even if iteration fails part-way.
    with open(answer_file, "w") as ans_file:
        # Iterate over each sample in the ground truth file
        for video_tensors, video_names, questions, question_ids, answers in tqdm(dataloader):
            video_tensor = video_tensors[0]
            question = questions[0]
            question_id = question_ids[0]
            answer = answers[0]

            try:
                output = mm_infer(
                    video_tensor,
                    question,
                    model=model,
                    tokenizer=tokenizer,
                    modal='video',
                    do_sample=False,
                )
            except Exception:
                # Best effort: keep the run going and mark this sample as failed.
                traceback.print_exc()
                output = "error"

            sample_set = {'id': question_id, 'question': question, 'answer': answer, 'pred': output}
            ans_file.write(json.dumps(sample_set) + "\n")
129
+
130
+
131
if __name__ == "__main__":
    # Command-line entry point for open-ended video QA inference.
    # ``args`` stays a module-level name because ActivitynetDataset in this
    # file reads ``args.video_folder``.
    parser = argparse.ArgumentParser()

    parser.add_argument('--model-path', help='Model path passed to model_init.', required=True)
    parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
    parser.add_argument('--question-file', help='Path to the ground truth file containing question.', required=True)
    parser.add_argument('--answer-file', help='Path to the ground truth file containing answers.', required=True)
    # The value is opened as a file; only its parent directory is created.
    parser.add_argument('--output-file', help='Path of the output file; one JSON result is written per line.', required=True)
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    # NOTE(review): --device is accepted but not read by run_inference.
    parser.add_argument("--device", type=str, required=False, default='cuda:0')
    parser.add_argument("--batch-size", type=int, required=False, default=1)
    parser.add_argument("--num-workers", type=int, required=False, default=8)
    args = parser.parse_args()

    run_inference(args)
videollama2/eval/inference_video_oqa_vcgpt_consistency.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import math
4
+ import json
5
+ import argparse
6
+ import warnings
7
+ from tqdm import tqdm
8
+
9
+ import torch
10
+ from torch.utils.data import Dataset, DataLoader
11
+
12
+ import sys
13
+ sys.path.append('./')
14
+ from videollama2 import model_init, mm_infer
15
+ from videollama2.utils import disable_torch_init
16
+
17
+ # NOTE: Ignore TypedStorage warning, which refers to this link~(https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560)
18
+ warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
19
+
20
+
21
def split_list(lst, n):
    """Split a list into at most ``n`` contiguous, roughly equal-sized chunks.

    Args:
        lst: Sequence to split.
        n: Requested number of chunks.

    Returns:
        A list of slices of ``lst``. The chunk size is rounded up, so fewer
        than ``n`` chunks may be returned; an empty ``lst`` yields ``[]``.
    """
    chunk_size = math.ceil(len(lst) / n)  # ceiling division: no element is dropped
    if chunk_size == 0:
        # Nothing to split; range() would reject a zero step.
        return []
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
25
+
26
+
27
def get_chunk(lst, n, k):
    """Return chunk ``k`` of ``lst`` split into ``n`` chunks.

    ``split_list`` rounds the chunk size up, so it may produce fewer than
    ``n`` chunks (5 items over 4 chunks gives only 3). A worker whose index is
    within the requested ``n`` but has no chunk receives an empty list rather
    than an ``IndexError``; indices outside ``n`` still raise.
    """
    chunks = split_list(lst, n)
    if len(chunks) <= k < n:
        return []
    return chunks[k]
30
+
31
+
32
class VCGPTDataset(Dataset):
    """Consistency-benchmark samples: one video, two questions, one answer."""

    # Extensions tried, in order, when locating a video file.
    video_formats = ['.mp4', '.webm', '.avi', '.mov', '.mkv']

    def __init__(self, data_list, processor, video_folder=None):
        """
        Args:
            data_list: Records with ``Q1``, ``Q2``, ``A`` and ``video_name``.
            processor: Callable turning a video path into the model input.
            video_folder: Directory holding the videos. When omitted, the
                script-level ``args.video_folder`` is used (legacy behavior).
        """
        self.data_list = data_list
        self.processor = processor
        if video_folder is None:
            # Resolve the folder once here instead of reading the global
            # ``args`` inside __getitem__, which runs in DataLoader workers.
            video_folder = args.video_folder
        self.video_folder = video_folder

    def __len__(self):
        return len(self.data_list)

    def _find_video(self, video_name):
        """Return the first existing ``<video_folder>/<video_name><ext>``.

        Raises:
            FileNotFoundError: If no supported extension matches.
        """
        for fmt in self.video_formats:
            candidate = os.path.join(self.video_folder, f"{video_name}{fmt}")
            if os.path.exists(candidate):
                return candidate
        raise FileNotFoundError(
            f"No video named '{video_name}' with extensions {self.video_formats} in '{self.video_folder}'"
        )

    def __getitem__(self, idx):
        line = self.data_list[idx]
        question1 = line['Q1']
        question2 = line['Q2']
        answer = line['A']
        video_name = line['video_name']

        video_tensor = self.processor(self._find_video(video_name))

        return {
            'video': video_tensor,
            'video_name': video_name,
            'question1': question1,
            'question2': question2,
            'answer': answer,
        }
65
+
66
+
67
def collate_fn(batch):
    """Collate dataset items into per-field lists plus one stacked video tensor.

    Returns:
        ``(videos, video_names, questions1, questions2, answers)`` where
        ``videos`` is the items' video tensors stacked along a new first
        dimension and the other four are plain lists, one entry per item.
    """
    videos, video_names, first_questions, second_questions, answers = [], [], [], [], []
    for sample in batch:
        videos.append(sample['video'])
        video_names.append(sample['video_name'])
        first_questions.append(sample['question1'])
        second_questions.append(sample['question2'])
        answers.append(sample['answer'])
    return torch.stack(videos, dim=0), video_names, first_questions, second_questions, answers
75
+
76
+
77
def run_inference(args):
    """Ask both questions of every consistency sample in the selected chunk.

    One JSON object per sample (``video_name``, ``Q1``, ``Q2``, ``A``, ``P1``,
    ``P2``) is written as a line to ``args.answer_file``, where ``P1``/``P2``
    are the model responses to ``Q1``/``Q2``.

    Args:
        args: Parsed CLI namespace; see the argument parser in this script.
    """
    disable_torch_init()

    # Initialize the model
    model, processor, tokenizer = model_init(args.model_path)

    with open(args.question_file, "r") as f:
        questions = json.load(f)
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)

    assert args.batch_size == 1, "Batch size must be 1 for inference"
    dataset = VCGPTDataset(questions, processor['video'])
    dataloader = DataLoader(dataset, shuffle=False, batch_size=args.batch_size, num_workers=args.num_workers, collate_fn=collate_fn)

    answer_file = os.path.expanduser(args.answer_file)
    answer_dir = os.path.dirname(answer_file)
    if answer_dir:
        # A bare filename has an empty dirname, and os.makedirs('') raises.
        os.makedirs(answer_dir, exist_ok=True)

    # The context manager closes the file even if inference fails part-way.
    with open(answer_file, "w") as ans_file:
        # Iterate over each sample in the ground truth file
        for video_tensors, video_names, questions1, questions2, answers in tqdm(dataloader):

            # reduce batch dimension
            video_tensor = video_tensors[0]
            video_name = video_names[0]
            question1 = questions1[0]
            question2 = questions2[0]
            answer = answers[0]

            output1 = mm_infer(
                video_tensor,
                question1,
                model=model,
                tokenizer=tokenizer,
                modal='video',
                do_sample=False,
            )

            output2 = mm_infer(
                video_tensor,
                question2,
                model=model,
                tokenizer=tokenizer,
                do_sample=False,
                modal='video',
            )

            qa = {'video_name': video_name, 'Q1': question1, 'Q2': question2, 'A': answer, 'P1': output1, 'P2': output2}

            ans_file.write(json.dumps(qa) + "\n")
129
+
130
+
131
if __name__ == "__main__":
    # Command-line entry point for consistency-benchmark inference.
    # ``args`` stays a module-level name because VCGPTDataset in this file
    # reads ``args.video_folder``.
    parser = argparse.ArgumentParser()

    # Define the command-line arguments
    parser.add_argument('--model-path', help='Model path passed to model_init.', required=True)
    # NOTE(review): --model_base, --conv-mode, --device and --model_max_length
    # are accepted but not read by run_inference.
    parser.add_argument('--model_base', help='', default=None, type=str, required=False)
    parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
    parser.add_argument('--question-file', help='Path to the ground truth file containing question.', required=True)
    # This file is written by the script, it is not an input.
    parser.add_argument('--answer-file', help='Path of the output file; one JSON result is written per line.', required=True)
    parser.add_argument("--conv-mode", type=str, default="llava_v1")
    parser.add_argument("--num-chunks", type=int, default=1)
    parser.add_argument("--chunk-idx", type=int, default=0)
    parser.add_argument("--device", type=str, required=False, default='cuda:0')
    parser.add_argument("--model_max_length", type=int, required=False, default=2048)
    parser.add_argument("--batch-size", type=int, required=False, default=1)
    parser.add_argument("--num-workers", type=int, required=False, default=8)

    args = parser.parse_args()

    run_inference(args)
videollama2/eval/inference_video_oqa_vcgpt_general.py ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import math
4
+ import json
5
+ import argparse
6
+ import warnings
7
+ from tqdm import tqdm
8
+
9
+ import torch
10
+ from torch.utils.data import Dataset, DataLoader
11
+
12
+ import sys
13
+ sys.path.append('./')
14
+ from videollama2 import model_init, mm_infer
15
+ from videollama2.utils import disable_torch_init
16
+
17
+ # NOTE: Suppress the "TypedStorage is deprecated" UserWarning; see https://github.com/pytorch/pytorch/issues/97207#issuecomment-1494781560
18
+ warnings.filterwarnings('ignore', category=UserWarning, message='TypedStorage is deprecated')
19
+
20
+
21
def split_list(lst, n):
    """Split a list into at most ``n`` contiguous, (roughly) equal-sized chunks.

    The chunk size is ``ceil(len(lst) / n)``, so the last chunk may be shorter
    and fewer than ``n`` chunks may be returned (e.g. 5 items with ``n=4``
    yields 3 chunks of sizes 2, 2, 1).

    Args:
        lst: Sequence to split; must support ``len`` and slicing.
        n: Desired number of chunks.

    Returns:
        A list of slices of ``lst``. An empty input yields an empty list.
    """
    if len(lst) == 0:
        # A zero chunk size would make the range() step 0 and raise ValueError.
        return []
    chunk_size = math.ceil(len(lst) / n)  # ceiling division
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
25
+
26
+
27
def get_chunk(lst, n, k):
    """Return the ``k``-th of ``n`` chunks of ``lst`` (see ``split_list``).

    Because chunking uses a ceiling chunk size, fewer than ``n`` chunks can be
    produced. A valid worker index ``k < n`` that falls past the produced
    chunks gets an empty list instead of an IndexError.

    Args:
        lst: Sequence to split.
        n: Total number of chunks requested.
        k: Index of the chunk to return.

    Returns:
        The selected chunk, or ``[]`` when there is no data for index ``k``.

    Raises:
        IndexError: If ``k`` is outside the range of chunks and not below ``n``.
    """
    if len(lst) == 0:
        # Nothing to distribute; every worker gets an empty share.
        return []
    chunks = split_list(lst, n)
    if len(chunks) <= k < n:
        # Fewer chunks than workers: this worker simply has nothing to do.
        return []
    return chunks[k]
30
+
31
+
32
class VCGPTDataset(Dataset):
    """Dataset yielding one processed video plus its question/answer per item.

    Each entry of ``data_list`` is a mapping with the keys ``'Q'``, ``'A'`` and
    ``'video_name'``. The video file is looked up by trying each extension in
    ``video_formats`` inside the video folder.
    """

    # Extensions probed, in this order, when resolving a video name to a file.
    video_formats = ['.mp4', '.webm', '.avi', '.mov', '.mkv']

    def __init__(self, data_list, processor, video_folder=None):
        """Store the samples and the video processor.

        Args:
            data_list: Sequence of sample mappings (``'Q'``, ``'A'``, ``'video_name'``).
            processor: Callable that takes a video path and returns the processed video.
            video_folder: Directory containing the videos. When ``None`` the
                module-level ``args.video_folder`` is used, as before.
        """
        self.data_list = data_list
        self.processor = processor
        self.video_folder = video_folder

    def __len__(self):
        return len(self.data_list)

    def _resolve_video_path(self, video_name):
        """Return the first existing ``<folder>/<video_name><ext>`` path.

        Raises:
            FileNotFoundError: If no file with a supported extension exists.
        """
        # Fall back to the script-level CLI namespace for backward compatibility.
        folder = self.video_folder if self.video_folder is not None else args.video_folder
        for fmt in self.video_formats:
            candidate = os.path.join(folder, f"{video_name}{fmt}")
            if os.path.exists(candidate):
                return candidate
        raise FileNotFoundError(
            f"No video named '{video_name}' with any of the extensions "
            f"{self.video_formats} found in '{folder}'"
        )

    def __getitem__(self, idx):
        """Return a dict with keys ``'video'``, ``'video_name'``, ``'question'``, ``'answer'``."""
        line = self.data_list[idx]
        question = line['Q']
        answer = line['A']
        video_name = line['video_name']

        video_path = self._resolve_video_path(video_name)
        video_tensor = self.processor(video_path)

        return {
            'video': video_tensor,
            'video_name': video_name,
            'question': question,
            'answer': answer,
        }
63
+
64
+
65
def collate_fn(batch):
    """Collate dataset samples into a stacked video tensor and parallel lists.

    Args:
        batch: List of sample dicts with keys ``'video'``, ``'video_name'``,
            ``'question'`` and ``'answer'``.

    Returns:
        Tuple ``(videos, video_names, questions, answers)`` where ``videos`` is
        the per-sample video tensors stacked along a new leading dimension and
        the other three are plain lists in batch order.
    """
    videos, names, questions, answers = [], [], [], []
    for sample in batch:
        videos.append(sample['video'])
        names.append(sample['video_name'])
        questions.append(sample['question'])
        answers.append(sample['answer'])
    return torch.stack(videos, dim=0), names, questions, answers
72
+
73
+
74
def run_inference(args):
    """Run video QA inference over one chunk of the question file.

    Loads the model, reads the JSON question file, selects the chunk given by
    ``args.num_chunks`` / ``args.chunk_idx``, runs ``mm_infer`` on every sample
    and writes one JSON object per line (keys ``video_name``, ``Q``, ``A``,
    ``P``) to ``args.answer_file``.

    Args:
        args: Namespace providing ``model_path``, ``question_file``,
            ``answer_file``, ``num_chunks``, ``chunk_idx``, ``batch_size`` and
            ``num_workers``.

    Raises:
        AssertionError: If ``args.batch_size`` is not 1.
    """
    disable_torch_init()

    # Initialize the model
    model, processor, tokenizer = model_init(args.model_path)

    # Close the question file as soon as it has been parsed.
    with open(args.question_file, "r") as question_fp:
        questions = json.load(question_fp)
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)

    assert args.batch_size == 1, "Batch size must be 1 for inference"
    dataset = VCGPTDataset(questions, processor['video'])
    dataloader = DataLoader(dataset, shuffle=False, batch_size=args.batch_size, num_workers=args.num_workers, collate_fn=collate_fn)

    answer_file = os.path.expanduser(args.answer_file)
    answer_dir = os.path.dirname(answer_file)
    if answer_dir:
        # os.makedirs('') raises when the answer file is in the working directory.
        os.makedirs(answer_dir, exist_ok=True)

    # The context manager closes the output even if inference fails midway.
    with open(answer_file, "w") as ans_file:
        # Iterate over each sample in the ground truth file
        for video_tensors, video_names, batch_questions, answers in tqdm(dataloader):

            # reduce batch dimension
            video_tensor = video_tensors[0]
            video_name = video_names[0]
            question = batch_questions[0]
            answer = answers[0]

            output = mm_infer(
                video_tensor,
                question,
                model=model,
                tokenizer=tokenizer,
                modal='video',
                do_sample=False,
            )

            qa = {'video_name': video_name, 'Q': question, 'A': answer, 'P': output}

            ans_file.write(json.dumps(qa) + "\n")
114
+
115
+
116
if __name__ == "__main__":
    # Command-line entry point: parse the options and run inference on one chunk.
    parser = argparse.ArgumentParser()

    parser.add_argument('--model-path', help='Model path passed to model_init.', required=True)
    parser.add_argument('--video-folder', help='Directory containing video files.', required=True)
    parser.add_argument('--question-file', help='Path to the JSON file containing the questions and ground truth answers.', required=True)
    # The answer file is opened for writing by run_inference; it is an output.
    parser.add_argument('--answer-file', help='Path of the output file to which predictions are written (one JSON object per line).', required=True)
    parser.add_argument("--num-chunks", type=int, default=1, help='Number of chunks the question list is split into.')
    parser.add_argument("--chunk-idx", type=int, default=0, help='Index of the chunk processed by this run.')
    # NOTE(review): --device is parsed but not read by run_inference in this file.
    parser.add_argument("--device", type=str, required=False, default='cuda:0')
    parser.add_argument("--batch-size", type=int, required=False, default=1, help='Must be 1.')
    parser.add_argument("--num-workers", type=int, required=False, default=8, help='Number of DataLoader worker processes.')
    # `args` is intentionally module-level: VCGPTDataset reads args.video_folder.
    args = parser.parse_args()

    run_inference(args)
videollama2/mm_utils.py ADDED
@@ -0,0 +1,357 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ast
2
+ import os
3
+ import math
4
+ import base64
5
+ import traceback
6
+ from io import BytesIO
7
+
8
+ import cv2
9
+ import torch
10
+ import imageio
11
+ import numpy as np
12
+ from PIL import Image
13
+ from decord import VideoReader, cpu
14
+ from moviepy.editor import VideoFileClip
15
+ from transformers import StoppingCriteria
16
+
17
+ from .constants import NUM_FRAMES, MAX_FRAMES, NUM_FRAMES_PER_SECOND, MODAL_INDEX_MAP, DEFAULT_IMAGE_TOKEN
18
+
19
+
20
def chunk_list(input_list, chunk_size):
    """Split ``input_list`` into consecutive slices of at most ``chunk_size`` items.

    Returns:
        A list of slices; the final slice may be shorter than ``chunk_size``.
    """
    chunks = []
    for start in range(0, len(input_list), chunk_size):
        stop = start + chunk_size
        chunks.append(input_list[start:stop])
    return chunks
22
+
23
+
24
def load_image_from_base64(image):
    """Decode a base64-encoded image payload and open it with PIL.

    Args:
        image: Base64 data accepted by ``base64.b64decode``.

    Returns:
        The result of ``Image.open`` on the decoded bytes.
    """
    decoded_bytes = base64.b64decode(image)
    buffer = BytesIO(decoded_bytes)
    return Image.open(buffer)
26
+
27
+
28
def expand2square(pil_img, background_color):
    """Pad an image to a square canvas, centering it along the shorter axis.

    Args:
        pil_img: Image exposing ``size`` (width, height) and ``mode``.
        background_color: Fill colour for the padded area.

    Returns:
        ``pil_img`` itself when it is already square; otherwise a new square
        image whose side equals the longer edge, with the original pasted in
        the middle of the shorter axis.
    """
    width, height = pil_img.size
    if width == height:
        return pil_img

    side = max(width, height)
    if width > height:
        # Landscape: pad above and below.
        paste_at = (0, (width - height) // 2)
    else:
        # Portrait: pad left and right.
        paste_at = ((height - width) // 2, 0)

    canvas = Image.new(pil_img.mode, (side, side), background_color)
    canvas.paste(pil_img, paste_at)
    return canvas
40
+
41
+
42
def create_photo_grid(arr, rows=None, cols=None):
    """
    Tile a stack of frames into a single grid image, filled row by row.

    Parameters:
    arr (numpy.ndarray | list): Array with shape [t, h, w, c], or a list of PIL Images / numpy arrays that is stacked into one.
    rows (int): Optional. Number of grid rows. Derived from `cols`, or from the square root of `t`, when omitted.
    cols (int): Optional. Number of grid columns. Derived from `rows` when omitted.

    Returns:
    numpy.ndarray: A 3D array of shape [h * rows, w * cols, c]; unused cells stay zero.

    Raises:
    ValueError: If a list of unsupported elements is given, or rows * cols < t.
    """

    if isinstance(arr, list):
        first = arr[0]
        if isinstance(first, Image.Image):
            arr = np.stack([np.array(img) for img in arr])
        elif isinstance(first, np.ndarray):
            arr = np.stack(arr)
        else:
            raise ValueError("Invalid input type. Expected list of Images or numpy arrays.")

    t, h, w, c = arr.shape

    # Derive whichever grid dimension was not supplied
    if rows is None:
        if cols is None:
            rows = math.ceil(math.sqrt(t))
            cols = math.ceil(t / rows)
        else:
            rows = math.ceil(t / cols)
    elif cols is None:
        cols = math.ceil(t / rows)

    # The grid must have a cell for every frame
    if rows * cols < t:
        raise ValueError(f"Not enough grid cells ({rows}x{cols}) to hold all images ({t}).")

    grid = np.zeros((h * rows, w * cols, c), dtype=arr.dtype)

    # Place frame i at (row, column) = divmod(i, cols)
    for i, frame in enumerate(arr):
        row_idx, col_idx = divmod(i, cols)
        top, left = row_idx * h, col_idx * w
        grid[top:top + h, left:left + w, :] = frame

    return grid
90
+
91
+
92
def process_image(image_path, processor, aspect_ratio='pad'):
    """Load one image and run it through the image processor.

    Args:
        image_path: Path (or file object) accepted by ``Image.open``.
        processor: Object providing ``image_mean`` and ``preprocess``.
        aspect_ratio: ``'pad'`` pads the image to a square filled with the
            processor's mean colour first; any other value skips the padding.

    Returns:
        The ``'pixel_values'`` entry of ``processor.preprocess(..., return_tensors='pt')``.
    """
    rgb_image = Image.open(image_path).convert('RGB')

    # Round-trip through numpy so the frames are plain in-memory images.
    frames = [Image.fromarray(pixels) for pixels in [np.array(rgb_image)]]

    if aspect_ratio == 'pad':
        frames = [
            expand2square(frame, tuple(int(x*255) for x in processor.image_mean))
            for frame in frames
        ]

    return processor.preprocess(frames, return_tensors='pt')['pixel_values']
105
+
106
+
107
def frame_sample(duration, mode='uniform', num_frames=None, fps=None):
    """Choose frame positions inside a clip of ``duration`` frames.

    Args:
        duration: Number of frames available to sample from.
        mode: ``'uniform'`` splits ``[0, duration - 1]`` into ``num_frames``
            equal segments and takes the middle of each; ``'fps'`` takes one
            frame every ``fps // NUM_FRAMES_PER_SECOND`` frames.
        num_frames: Number of frames to return (required for ``'uniform'``).
        fps: Frame rate of the source (required for ``'fps'``).

    Returns:
        A numpy integer array of frame positions.

    Raises:
        AssertionError: If the argument required by ``mode`` is missing.
        ImportError: If ``mode`` is not supported. The exception type is kept
            from the original for backward compatibility.
    """
    if mode == 'uniform':
        assert num_frames is not None, "Number of frames must be provided for uniform sampling."
        # NOTE: v1 version
        # Size of each segment from which one frame will be extracted
        seg_size = float(duration - 1) / num_frames

        # Start/end of every segment; the sampled frame is the segment middle.
        starts = seg_size * np.arange(num_frames)
        ends = seg_size * np.arange(1, num_frames + 1)

        # The small epsilon pushes exact .5 midpoints up instead of to even.
        return np.round((starts + ends) / 2 + 1e-6).astype(int)
        # NOTE: v0 version
        # return np.linspace(0, duration-1, num_frames, dtype=int)
    elif mode == 'fps':
        assert fps is not None, "FPS must be provided for FPS sampling."
        # Clamp to >= 1: a frame rate below NUM_FRAMES_PER_SECOND (or an empty
        # clip) would otherwise give a zero step and make np.arange fail.
        segment_len = max(int(min(fps // NUM_FRAMES_PER_SECOND, duration)), 1)
        return np.arange(segment_len // 2, duration, segment_len, dtype=int)
    else:
        raise ImportError(f'Unsupported frame sampling mode: {mode}')
131
+
132
+
133
def process_video(video_path, processor, s=None, e=None, aspect_ratio='pad', num_frames=NUM_FRAMES):
    """Load, sample and preprocess the frames of a video.

    Args:
        video_path: One of: a path string (directory of frame images, ``.gif``
            file, or any file readable by decord's ``VideoReader``); a numpy
            array of frames; or a non-empty list of numpy frames, image paths
            or PIL Images.
        processor: Object providing ``image_mean`` and ``preprocess``.
        s: Optional start time in seconds (only used for path inputs).
        e: Optional end time in seconds (only used for path inputs).
        aspect_ratio: ``'pad'`` pads each frame to a square filled with the
            processor's mean colour; any other value skips the padding.
        num_frames: Number of frames to sample uniformly. ``None`` switches to
            fps-based sampling and disables padding.

    Returns:
        The ``'pixel_values'`` entry of ``processor.preprocess(..., return_tensors='pt')``.

    Raises:
        ValueError: If the input type is unsupported, the requested time range
            contains no frames, or no frames could be loaded.
    """
    if isinstance(video_path, str):
        if s is not None and e is not None:
            # Clamp to non-negative times and make sure the range is non-empty.
            s = s if s >= 0. else 0.
            e = e if e >= 0. else 0.
            if s > e:
                s, e = e, s
            elif s == e:
                e = s + 1

        is_frame_dir = os.path.isdir(video_path)
        is_gif = (not is_frame_dir) and video_path.endswith('.gif')

        # 1. Loading Video
        if is_frame_dir:
            frame_files = sorted(os.listdir(video_path))

            fps = 3
            num_frames_of_video = len(frame_files)
        elif is_gif:
            gif_reader = imageio.get_reader(video_path)

            fps = 25
            num_frames_of_video = len(gif_reader)
        else:
            # num_threads=1, as in process_video_old, to avoid deadlock in multiprocessing
            vreader = VideoReader(video_path, ctx=cpu(0), num_threads=1)

            fps = vreader.get_avg_fps()
            num_frames_of_video = len(vreader)

        # 2. Determine frame range & Calculate frame indices
        f_start = 0 if s is None else max(int(s * fps) - 1, 0)
        f_end = num_frames_of_video - 1 if e is None else min(int(e * fps) - 1, num_frames_of_video - 1)
        frame_indices = list(range(f_start, f_end + 1))

        duration = len(frame_indices)
        if duration == 0:
            # Previously this surfaced as an IndexError / ZeroDivisionError
            # from the sampling code below.
            raise ValueError(
                f"No frames in the requested range (s={s}, e={e}) of video "
                f"'{video_path}' with {num_frames_of_video} frames."
            )

        # 3. Sampling frame indices
        if num_frames is None:
            sampled_frame_indices = [frame_indices[i] for i in frame_sample(duration, mode='fps', fps=fps)]
        else:
            sampled_frame_indices = [frame_indices[i] for i in frame_sample(duration, mode='uniform', num_frames=num_frames)]

        # 4. Acquire frame data
        if is_frame_dir:
            video_data = [Image.open(os.path.join(video_path, frame_files[f_idx])) for f_idx in sampled_frame_indices]
        elif is_gif:
            # Decode only the needed frames, then emit them in sampling order.
            # Repeated indices (clip shorter than num_frames) are kept, which
            # matches the decord branch below.
            wanted = {int(f_idx) for f_idx in sampled_frame_indices}
            decoded = {idx: frame for idx, frame in enumerate(gif_reader) if idx in wanted}
            video_data = [
                Image.fromarray(cv2.cvtColor(decoded[int(f_idx)], cv2.COLOR_RGBA2RGB))
                for f_idx in sampled_frame_indices
                if int(f_idx) in decoded
            ]
        else:
            video_data = [Image.fromarray(frame) for frame in vreader.get_batch(sampled_frame_indices).asnumpy()]

    elif isinstance(video_path, np.ndarray):
        video_data = [Image.fromarray(f) for f in video_path]
    elif isinstance(video_path, list) and video_path and isinstance(video_path[0], np.ndarray):
        video_data = [Image.fromarray(f) for f in video_path]
    elif isinstance(video_path, list) and video_path and isinstance(video_path[0], str):
        video_data = [Image.open(f) for f in video_path]
    elif isinstance(video_path, list) and video_path and isinstance(video_path[0], Image.Image):
        # Copy so that the padding below never mutates the caller's list.
        video_data = list(video_path)
    else:
        raise ValueError(f"Unsupported video path type: {type(video_path)}")

    if not video_data:
        raise ValueError("No frames could be loaded from the given video input.")

    # Pad with black frames up to num_frames. PIL's `size` is (width, height),
    # so build the frame with Image.new rather than using `size` as an array
    # shape, which would transpose the padding frames.
    if num_frames is not None and len(video_data) < num_frames:
        pad_size = video_data[-1].size
        missing = num_frames - len(video_data)
        video_data.extend(Image.new('RGB', pad_size) for _ in range(missing))

    # MAX_FRAMES filter
    video_data = video_data[:MAX_FRAMES]

    if aspect_ratio == 'pad':
        images = [expand2square(f, tuple(int(x*255) for x in processor.image_mean)) for f in video_data]
    else:
        images = list(video_data)
    video = processor.preprocess(images, return_tensors='pt')['pixel_values']
    return video
204
+
205
+
206
def process_video_old(video_path, processor, aspect_ratio='pad', num_frames=NUM_FRAMES, image_grid=False, sample_scheme='uniform'):
    """Legacy video loader: sample frames and run them through the processor.

    Args:
        video_path: Path string (``.gif``, ``.webm`` or any file readable by
            decord), a numpy array of exactly ``num_frames`` frames, or a list
            of exactly ``num_frames`` frames.
        processor: Object providing ``image_mean`` and ``preprocess``.
        aspect_ratio: ``'pad'`` pads each frame to a square filled with the
            processor's mean colour; any other value skips the padding.
        num_frames: Number of frames for uniform sampling.
        image_grid: When true, a photo grid of all frames is prepended.
        sample_scheme: ``'uniform'`` or ``'fps'``.

    Returns:
        The ``'pixel_values'`` entry of ``processor.preprocess(..., return_tensors='pt')``.

    Raises:
        ValueError: If ``video_path`` has an unsupported type.
        ImportError: If ``sample_scheme`` is not supported (type kept from the original).
    """
    def _sample_frame_ids(duration, mode='uniform', local_fps=None):
        # Local sampler; named differently from the module-level frame_sample
        # so that it no longer shadows it.
        if mode == 'uniform':
            # Size of each segment from which one frame will be extracted
            seg_size = float(duration - 1) / num_frames

            frame_ids = []
            for i in range(num_frames):
                # Rounded start/end of the segment; keep its middle index
                start = int(np.round(seg_size * i))
                end = int(np.round(seg_size * (i + 1)))
                frame_ids.append((start + end) // 2)

            return frame_ids
            # NOTE: old version
            # return np.linspace(0, duration-1, num_frames, dtype=int)
        elif mode == 'fps':
            assert local_fps is not None
            # Clamp to >= 1 so np.arange never receives a zero step.
            segment_len = max(int(min(local_fps // NUM_FRAMES_PER_SECOND, duration)), 1)
            return np.arange(segment_len // 2, duration, segment_len, dtype=int)
        else:
            raise ImportError(f'Unsupported frame sampling mode: {mode}')

    def _limit_frames(frame_id_list, duration):
        # limit the max input frames
        if len(frame_id_list) > MAX_FRAMES:
            return np.linspace(0, duration-1, MAX_FRAMES, dtype=int)
        return frame_id_list

    if isinstance(video_path, str):
        if video_path.endswith('.gif'):
            video_gif = imageio.get_reader(video_path)
            duration, local_fps = len(video_gif), 10

            frame_id_list = _limit_frames(_sample_frame_ids(duration, mode=sample_scheme, local_fps=local_fps), duration)
            # Set lookup instead of scanning the id list for every frame.
            wanted = {int(frame_id) for frame_id in frame_id_list}
            video_data = [frame for index, frame in enumerate(video_gif) if index in wanted]
        # added by lixin4ever, include the support of .webm files from sthsthv2
        elif video_path.endswith('.webm'):
            video_webm = VideoFileClip(video_path)
            try:
                video_frames = np.array(list(video_webm.iter_frames()))
                duration, local_fps = len(video_frames), video_webm.fps
            finally:
                # Release the clip once all frames are in memory.
                video_webm.close()

            frame_id_list = _limit_frames(_sample_frame_ids(duration, mode=sample_scheme, local_fps=local_fps), duration)
            video_data = video_frames[frame_id_list]
        else:
            # NOTE: num_threads=1 is required to avoid deadlock in multiprocessing
            decord_vr = VideoReader(uri=video_path, ctx=cpu(0), num_threads=1)
            duration, local_fps = len(decord_vr), float(decord_vr.get_avg_fps())

            frame_id_list = _limit_frames(_sample_frame_ids(duration, mode=sample_scheme, local_fps=local_fps), duration)
            batch = decord_vr.get_batch(frame_id_list)
            # The batch exposes either `.numpy()` or `.asnumpy()` (presumably
            # depending on the active decord bridge, TODO confirm). Pick the
            # available one instead of a bare `except:` that hid every error.
            video_data = batch.numpy() if hasattr(batch, 'numpy') else batch.asnumpy()

    elif isinstance(video_path, np.ndarray):
        assert len(video_path) == num_frames
        video_data = video_path
    elif isinstance(video_path, list):
        assert len(video_path) == num_frames
        video_data = np.stack([np.array(x) for x in video_path])
    else:
        # Previously fell through and failed later with an unbound `video_data`.
        raise ValueError(f"Unsupported video path type: {type(video_path)}")

    if image_grid:
        grid_h = grid_w = math.ceil(math.sqrt(num_frames))
        pg = create_photo_grid(video_data, grid_h, grid_w)
        video_data = [pg, *video_data]

    images = [Image.fromarray(f.numpy() if isinstance(f, torch.Tensor) else f) for f in video_data]
    if aspect_ratio == 'pad':
        images = [expand2square(image, tuple(int(x*255) for x in processor.image_mean)) for image in images]
    video = processor.preprocess(images, return_tensors='pt')['pixel_values']

    return video
287
+
288
+
289
def tokenizer_multimodal_token(prompt, tokenizer, multimodal_token=DEFAULT_IMAGE_TOKEN, return_tensors=None):
    """Tokenize a prompt, replacing every multimodal tag with its placeholder index.

    Args:
        prompt (str): Text prompt, optionally containing the multimodal tag
            (e.g. a '<video>' tag followed by the question).
        tokenizer (transformers.PreTrainedTokenizer): Tokenizer object.
        multimodal_token (str): The tag to look for in ``prompt``. Its
            placeholder index is read from ``MODAL_INDEX_MAP``; when the tag has
            no entry there, the prompt is tokenized as plain text.
        return_tensors (str | None): ``'pt'`` returns a ``torch.long`` tensor,
            ``None`` returns a Python list.

    Returns:
        The token ids, with one placeholder index between the tokenized pieces
        of text around each tag.

    Raises:
        ValueError: If ``return_tensors`` is neither ``None`` nor ``'pt'``.
    """
    placeholder_index = MODAL_INDEX_MAP.get(multimodal_token, None)
    if placeholder_index is None:
        input_ids = tokenizer(prompt, add_special_tokens=False).input_ids
    else:
        # Tokenize the text around the tags, then join with the placeholder.
        pieces = [
            tokenizer(piece, add_special_tokens=False).input_ids
            for piece in prompt.split(multimodal_token)
        ]

        input_ids = []
        for position, piece_ids in enumerate(pieces):
            if position > 0:
                input_ids.append(placeholder_index)
            input_ids.extend(piece_ids)

    if return_tensors is None:
        return input_ids
    if return_tensors == 'pt':
        return torch.tensor(input_ids, dtype=torch.long)
    raise ValueError(f'Unsupported tensor type: {return_tensors}')
315
+
316
+
317
def get_model_name_from_path(model_path):
    """Derive a model name from the last component(s) of ``model_path``.

    Leading and trailing slashes are ignored. When the last component starts
    with ``checkpoint-``, the parent component is prepended with an underscore
    (``run/checkpoint-100`` gives ``run_checkpoint-100``).
    """
    parts = model_path.strip("/").split("/")
    leaf = parts[-1]
    if not leaf.startswith('checkpoint-'):
        return leaf
    return parts[-2] + "_" + leaf
324
+
325
+
326
class KeywordsStoppingCriteria(StoppingCriteria):
    """Stopping criterion that fires once every sequence ends with a keyword.

    A sequence matches when its trailing token ids equal a keyword's token ids,
    or when the decoded text of its last generated tokens contains a keyword.
    """

    def __init__(self, keywords, tokenizer, input_ids):
        """Pre-tokenize the keywords and remember the prompt length.

        Args:
            keywords: Iterable of stop strings.
            tokenizer: Tokenizer used to encode the keywords and decode outputs.
            input_ids: Prompt ids; ``input_ids.shape[1]`` is stored as the
                length after which generated tokens start.
        """
        self.keywords = keywords
        self.tokenizer = tokenizer
        self.start_len = input_ids.shape[1]
        self.keyword_ids = []
        self.max_keyword_len = 0
        for keyword in keywords:
            ids = tokenizer(keyword).input_ids
            # Drop a leading BOS token added by the tokenizer.
            if len(ids) > 1 and ids[0] == tokenizer.bos_token_id:
                ids = ids[1:]
            self.max_keyword_len = max(self.max_keyword_len, len(ids))
            self.keyword_ids.append(torch.tensor(ids))

    def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True when the single sequence in ``output_ids`` hits a keyword."""
        tail_len = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
        # Keep the keyword tensors on the same device as the outputs.
        self.keyword_ids = [ids.to(output_ids.device) for ids in self.keyword_ids]

        # 1) Exact match on the trailing token ids.
        for ids in self.keyword_ids:
            if (output_ids[0, -ids.shape[0]:] == ids).all():
                return True

        # 2) Substring match on the decoded tail.
        decoded_tail = self.tokenizer.batch_decode(output_ids[:, -tail_len:], skip_special_tokens=True)[0]
        return any(keyword in decoded_tail for keyword in self.keywords)

    def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Return True only when every sequence in the batch hits a keyword."""
        per_sequence = [
            self.call_for_batch(output_ids[row].unsqueeze(0), scores)
            for row in range(output_ids.shape[0])
        ]
        return all(per_sequence)
videollama2/model/__init__.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ import os
18
+ import warnings
19
+ import shutil
20
+ import logging
21
+ import torch
22
+ from transformers import PretrainedConfig, AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
23
+
24
+ from .projector import load_mm_projector
25
+ from .videollama2_llama import Videollama2LlamaForCausalLM, Videollama2LlamaConfig
26
+ from .videollama2_mistral import Videollama2MistralForCausalLM, Videollama2MistralConfig
27
+ from .videollama2_mixtral import Videollama2MixtralForCausalLM, Videollama2MixtralConfig
28
+ from .videollama2_qwen2 import Videollama2Qwen2ForCausalLM, Videollama2Qwen2Config
29
+ from .videollama2_gemma2 import Videollama2Gemma2ForCausalLM, Videollama2Gemma2Config
30
+ from .videollama2_phi3 import Videollama2Phi3ForCausalLM, Videollama2Phi3Config
31
+
32
+
33
# Maps a `model_type` string to the causal-LM class that implements it.
# The plain "videollama2" type is an alias for the Mistral-based class.
VLLMs = {
    "videollama2": Videollama2MistralForCausalLM,
    "videollama2_llama": Videollama2LlamaForCausalLM,
    "videollama2_mistral": Videollama2MistralForCausalLM,
    "videollama2_mixtral": Videollama2MixtralForCausalLM,
    "videollama2_qwen2": Videollama2Qwen2ForCausalLM,
    "videollama2_gemma2": Videollama2Gemma2ForCausalLM,
    "videollama2_phi3": Videollama2Phi3ForCausalLM,
}

# Maps the same `model_type` strings to the matching configuration class.
# Keys mirror those of `VLLMs`.
VLLMConfigs = {
    "videollama2": Videollama2MistralConfig,
    "videollama2_llama": Videollama2LlamaConfig,
    "videollama2_mistral": Videollama2MistralConfig,
    "videollama2_mixtral": Videollama2MixtralConfig,
    "videollama2_qwen2": Videollama2Qwen2Config,
    "videollama2_gemma2": Videollama2Gemma2Config,
    "videollama2_phi3": Videollama2Phi3Config,
}
52
+
53
+
54
def _videollama2_model_class(model_type):
    """Return the causal-LM class that load_pretrained_model uses for ``model_type``."""
    # NOTE(review): 'videollama2_llama' is not listed here and therefore falls
    # back to the Mistral class, exactly as the original if/elif chains did.
    # Confirm whether that is intended.
    dispatch = {
        'videollama2': Videollama2MistralForCausalLM,
        'videollama2_mistral': Videollama2MistralForCausalLM,
        'videollama2_mixtral': Videollama2MixtralForCausalLM,
        'videollama2_qwen2': Videollama2Qwen2ForCausalLM,
        'videollama2_gemma2': Videollama2Gemma2ForCausalLM,
        'videollama2_phi3': Videollama2Phi3ForCausalLM,
    }
    return dispatch.get(model_type, Videollama2MistralForCausalLM)


def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, **kwargs):
    """Load a tokenizer, model and image/video processor from a checkpoint.

    Four loading paths are supported, chosen in this order:
      1. LoRA/QLoRA checkpoints (``'lora'`` in ``model_name``): load the base
         model, the non-LoRA trainables, then merge the LoRA weights.
      2. Base/pretrain checkpoints (``model_base`` given, ``'-base'`` in
         ``model_name``, or ``config.tune_mm_mlp_adapter`` set): load the base
         model and the multimodal projector weights.
      3. SFT checkpoints (``'videollama2'`` in the config's ``model_type``).
      4. Anything else: plain ``AutoModelForCausalLM``.

    Args:
        model_path: Local directory or hub id of the checkpoint.
        model_base: Optional base model; defaults to the config's ``_name_or_path``
            on the LoRA and base paths.
        model_name: Name used to detect the LoRA and ``-base`` paths.
        load_8bit: Load with ``load_in_8bit=True``.
        load_4bit: Load with a 4-bit NF4 ``BitsAndBytesConfig``.
        device_map: Device map passed to ``from_pretrained``.
        device: Target device; anything other than ``"cuda"`` overrides
            ``device_map`` with ``{"": device}``.
        use_flash_attn: Request the ``flash_attention_2`` attention implementation.
        **kwargs: Extra ``from_pretrained`` arguments; ``token`` is also used
            for tokenizer/config loading.

    Returns:
        Tuple ``(tokenizer, model, processor, context_len)``. ``processor`` is
        ``None`` unless ``'videollama'`` is in the model type; ``context_len``
        is ``model.config.max_sequence_length`` when present, else 2048.
    """
    logging.info(f"Loading model from path: {model_path}")
    logging.info(f"Model base: {model_base}, Model name: {model_name}")
    logging.info(f"Device: {device}, Device map: {device_map}")

    token = kwargs.get('token')

    kwargs = {"device_map": device_map, **kwargs}

    if device != "cuda":
        kwargs['device_map'] = {"": device}

    if load_8bit:
        kwargs['load_in_8bit'] = True
    elif load_4bit:
        # NOTE: High-version Transformers will report: """ValueError: You can't pass `load_in_4bit`or `load_in_8bit` as a kwarg when passing `quantization_config` argument at the same time."""
        # kwargs['load_in_4bit'] = True
        kwargs['quantization_config'] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4'
        )
    else:
        kwargs['torch_dtype'] = torch.float16

    if use_flash_attn:
        kwargs['attn_implementation'] = 'flash_attention_2'

    try:
        # Pass the token here as well so gated/private checkpoints can be read.
        config = AutoConfig.from_pretrained(model_path, token=token)
        logging.info(f"Model configuration loaded successfully.")
    except Exception as e:
        logging.error(f"Error loading model configuration: {e}")
        raise  # re-raise with the original traceback

    # judge model type
    model_type = config.model_type

    # judge pretrain/finetune
    is_pretraining = getattr(config, 'tune_mm_mlp_adapter', False)

    # NOTE: lora/qlora model loading ('qlora' also contains 'lora')
    if 'lora' in model_name.lower():
        logging.info(f"inside lora if")
        cfg_pretrained = PretrainedConfig.from_pretrained(model_path, token=token)
        # NOTE: AutoConfig will modify `_name_or_path` property to `model_path` if `model_path` is not None.
        # cfg_pretrained = AutoConfig.from_pretrained(model_path, token=token)
        model_base = model_base if model_base is not None else cfg_pretrained._name_or_path

        # NOTE: remove qlora training quantization config.
        # The original referenced an undefined `lora_cfg_pretrained` here
        # (NameError on every LoRA load). `config` is the object actually
        # handed to from_pretrained below, so strip the attribute from it.
        if hasattr(config, 'quantization_config'):
            del config.quantization_config
        tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False, token=token)
        print('Loading VideoLLaMA from base model...')

        if 'vicuna' in model_base.lower():
            model = Videollama2LlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=config, **kwargs)
        else:
            # 'mistral' bases and every other base use the Mistral class.
            model = Videollama2MistralForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=config, **kwargs)

        token_num, token_dim = model.lm_head.out_features, model.lm_head.in_features
        if model.lm_head.weight.shape[0] != token_num:
            # Re-allocate head/embeddings when the vocabulary size differs.
            model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))
            model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))

        print('Loading additional VideoLLaMA weights...')
        if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')):
            non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu')
        else:
            # this is probably from HF Hub
            from huggingface_hub import hf_hub_download
            def load_from_hf(repo_id, filename, subfolder=None):
                cache_file = hf_hub_download(
                    repo_id=repo_id,
                    filename=filename,
                    subfolder=subfolder)
                return torch.load(cache_file, map_location='cpu')
            non_lora_trainables = load_from_hf(model_path, 'non_lora_trainables.bin')
        # Strip the 'base_model.' (11 chars) and, if present, extra 'model.' (6 chars) key prefixes.
        non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()}
        if any(k.startswith('model.model.') for k in non_lora_trainables):
            non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()}
        model.load_state_dict(non_lora_trainables, strict=False)

        from peft import PeftModel
        print('Loading LoRA weights...')
        model = PeftModel.from_pretrained(model, model_path)
        print('Merging LoRA weights...')
        model = model.merge_and_unload()
        print('Model is loaded...')
    elif model_base is not None or '-base' in model_name.lower() or is_pretraining:
        # NOTE: Base/Pretrain model loading
        logging.info(f"inside else if base model")
        print('Loading VideoLLaMA 2 from base model...')
        cfg_pretrained = PretrainedConfig.from_pretrained(model_path, token=token)
        # NOTE: AutoConfig will modify `_name_or_path` property to `model_path` if `model_path` is not None.
        # cfg_pretrained = AutoConfig.from_pretrained(model_path, token=token)
        model_base = model_base if model_base is not None else cfg_pretrained._name_or_path

        tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False, token=token)

        model_cls = _videollama2_model_class(model_type)
        model = model_cls.from_pretrained(model_base, low_cpu_mem_usage=True, config=config, **kwargs)

        # NOTE: loading vision-language projector
        # * old codes for loading local mm_projector.bin
        # mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu')
        # mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()}
        # model.load_state_dict(mm_projector_weights, strict=False)
        # * new codes which supports loading mm_projector.bin both offline and online
        mm_projector_weights = load_mm_projector(model_path, token=token)
        model.load_state_dict(mm_projector_weights, strict=False)
    elif 'videollama2' in model_type:
        # NOTE: SFT model loading
        logging.info(f"inside AutoTokenizer else if")
        tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False, token=token)

        if model_type == 'videollama2_mixtral':
            logging.info(f"Loading videollama2_mixtral")
            logging.info(f"Config: {config}")
        model_cls = _videollama2_model_class(model_type)
        model = model_cls.from_pretrained(model_path, low_cpu_mem_usage=True, config=config, **kwargs)
    else:
        logging.info(f"inside else")
        tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, token=token)
        model = AutoModelForCausalLM.from_pretrained(model_path, config=config, **kwargs)

    processor = None

    if "videollama" in model_type:
        vision_tower = model.get_vision_tower()
        if not vision_tower.is_loaded:
            vision_tower.load_model()
        vision_tower.to(device=device, dtype=torch.float16)
        # NOTE: videollama2 adopts the same processor for processing image and video.
        processor = vision_tower.image_processor

    if hasattr(model.config, "max_sequence_length"):
        context_len = model.config.max_sequence_length
    else:
        context_len = 2048
    logging.info(f"Model: {model}")
    logging.info(f"context_len: {context_len}")
    return tokenizer, model, processor, context_len
videollama2/model/encoder.py ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ from transformers import (
7
+ CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig,
8
+ SiglipVisionModel, SiglipImageProcessor, SiglipVisionConfig
9
+ )
10
+
11
+
12
class CLIPVisionTower(nn.Module):
    """Frozen CLIP vision encoder that exposes one selected hidden-state layer.

    Args:
        vision_tower: name or path passed to the CLIP ``from_pretrained`` loaders.
        args: object providing ``mm_vision_select_layer`` and, optionally,
            ``mm_vision_select_feature`` (defaults to ``'patch'``).
        delay_load: when true, only the vision config is fetched here and
            ``load_model`` has to be called before ``forward``.
    """

    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__()

        self.is_loaded = False

        self.vision_tower_name = vision_tower
        self.select_layer = args.mm_vision_select_layer
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')

        if not delay_load:
            self.load_model()
        else:
            self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)

    def load_model(self):
        """Load the image processor and the encoder weights, then freeze the encoder."""
        self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)

        self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name)
        self.vision_tower.requires_grad_(False)

        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Pick the configured hidden-state layer and token subset.

        Raises:
            ValueError: if ``select_feature`` is neither ``'patch'`` nor ``'cls_patch'``.
        """
        image_features = image_forward_outs.hidden_states[self.select_layer]
        if self.select_feature == 'patch':
            # Drop the leading token (presumably the class token, given the
            # 'cls_patch' alternative that keeps it).
            return image_features[:, 1:]
        if self.select_feature == 'cls_patch':
            return image_features
        raise ValueError(f'Unexpected select feature: {self.select_feature}')

    def _encode(self, pixels, out_dtype):
        """Run the encoder on ``pixels`` and return the selected features cast to ``out_dtype``."""
        outputs = self.vision_tower(
            pixels.to(device=self.device, dtype=self.dtype), output_hidden_states=True
        )
        return self.feature_select(outputs).to(out_dtype)

    @torch.no_grad()
    def forward(self, images):
        """Encode a batch tensor, or a list/tuple of single images.

        Returns:
            A tensor for tensor input; a list with one entry per image (each
            encoded with a leading batch dimension of 1) for list/tuple input.
        """
        # isinstance (rather than an exact type check) also accepts tuples and
        # list subclasses, which previously fell through to the tensor branch.
        if isinstance(images, (list, tuple)):
            return [self._encode(image.unsqueeze(0), image.dtype) for image in images]
        return self._encode(images, images.dtype)

    @property
    def dummy_feature(self):
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        # Before the weights are loaded only the stand-alone config is available.
        if self.is_loaded:
            return self.vision_tower.config
        return self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches(self):
        return (self.config.image_size // self.config.patch_size) ** 2

    @property
    def num_patches_per_side(self):
        return self.config.image_size // self.config.patch_size

    @property
    def image_size(self):
        return self.config.image_size
94
+
95
+
96
class SiglipVisionTower(nn.Module):
    """Frozen SigLIP vision encoder that exposes one selected hidden-state layer.

    Args:
        vision_tower: name or path passed to the SigLIP ``from_pretrained`` loaders.
        args: object providing ``mm_vision_select_layer`` and, optionally,
            ``mm_vision_select_feature`` (defaults to ``'patch'``).
        delay_load: when true, only the vision config is fetched here and
            ``load_model`` has to be called before ``forward``.
    """

    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__()

        self.is_loaded = False

        self.vision_tower_name = vision_tower
        self.select_layer = args.mm_vision_select_layer
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')

        if not delay_load:
            self.load_model()
        else:
            self.cfg_only = SiglipVisionConfig.from_pretrained(self.vision_tower_name)

    def load_model(self):
        """Load the image processor and the encoder weights, then freeze the encoder."""
        self.image_processor = SiglipImageProcessor.from_pretrained(self.vision_tower_name)

        self.vision_tower = SiglipVisionModel.from_pretrained(self.vision_tower_name)
        self.vision_tower.requires_grad_(False)

        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Pick the configured hidden-state layer; all of its tokens are kept.

        Raises:
            ValueError: if ``select_feature`` is anything other than ``'patch'``.
        """
        if self.select_feature != 'patch':
            raise ValueError(f'Unexpected select feature: {self.select_feature}')
        return image_forward_outs.hidden_states[self.select_layer]

    def _encode(self, pixels, out_dtype):
        """Run the encoder on ``pixels`` and return the selected features cast to ``out_dtype``."""
        outputs = self.vision_tower(
            pixels.to(device=self.device, dtype=self.dtype), output_hidden_states=True
        )
        return self.feature_select(outputs).to(out_dtype)

    @torch.no_grad()
    def forward(self, images):
        """Encode a batch tensor, or a list/tuple of single images.

        Returns:
            A tensor for tensor input; a list with one entry per image (each
            encoded with a leading batch dimension of 1) for list/tuple input.
        """
        # isinstance (rather than an exact type check) also accepts tuples and
        # list subclasses, which previously fell through to the tensor branch.
        if isinstance(images, (list, tuple)):
            return [self._encode(image.unsqueeze(0), image.dtype) for image in images]
        return self._encode(images, images.dtype)

    @property
    def dummy_feature(self):
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        # Before the weights are loaded only the stand-alone config is available.
        if self.is_loaded:
            return self.vision_tower.config
        return self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches(self):
        return (self.config.image_size // self.config.patch_size) ** 2

    @property
    def num_patches_per_side(self):
        return self.config.image_size // self.config.patch_size

    @property
    def image_size(self):
        return self.config.image_size
176
+
177
+
178
def build_vision_tower(vision_tower_cfg, **kwargs):
    """Instantiate the vision tower named by the config.

    The name is read from ``mm_vision_tower`` and, when that is missing or
    empty, from ``vision_tower``. Extra keyword arguments are forwarded to the
    tower constructor.

    Raises:
        ValueError: if no tower name is configured or the name matches neither
            ``'clip'`` nor ``'siglip'``.
    """
    vision_tower = (
        getattr(vision_tower_cfg, 'mm_vision_tower', None)
        or getattr(vision_tower_cfg, 'vision_tower', None)
    )

    # Without this guard a missing name fails on the substring test below with
    # an opaque "argument of type 'NoneType' is not iterable".
    if not vision_tower:
        raise ValueError(f'Unknown vision tower: {vision_tower}')

    if 'clip' in vision_tower:
        return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
    if 'siglip' in vision_tower:
        return SiglipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
    raise ValueError(f'Unknown vision tower: {vision_tower}')
videollama2/model/projector.py ADDED
@@ -0,0 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2024 Alibaba DAMO Academy
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import os
16
+ import re
17
+
18
+ import einops
19
+ import torch
20
+ import torch.nn as nn
21
+ import torch.nn.functional as F
22
+ from timm.models.regnet import RegStage
23
+ from timm.models.layers import LayerNorm, LayerNorm2d
24
+ from transformers import TRANSFORMERS_CACHE
25
+
26
+
27
def parse_snapshot_folder(repo_id, cache_dir=None, repo_type="model"):
    """Return the expected local snapshot folder of a hub repo inside a cache.

    Args:
        repo_id: repository id; every ``/`` is replaced by ``--`` in the folder name.
        cache_dir: cache root; falls back to ``TRANSFORMERS_CACHE`` when ``None``.
        repo_type: repository type, used as ``{repo_type}s--`` folder prefix.

    Returns:
        ``<cache>/<repo folder>/snapshots/<revision>``, where the revision is
        the content of ``refs/main`` when that file exists and ``"main"``
        otherwise. The folder itself is not checked for existence.
    """
    revision = "main"
    # 1. parse the downloaded cache folder
    if cache_dir is None:
        cache_dir = TRANSFORMERS_CACHE
    object_id = repo_id.replace("/", "--")
    repo_cache = os.path.join(cache_dir, f"{repo_type}s--{object_id}")
    # 2. resolve refs (for instance to convert main to the associated commit sha)
    revision_file = os.path.join(repo_cache, "refs", revision)
    if os.path.isfile(revision_file):
        with open(revision_file) as f:
            # Strip surrounding whitespace so a trailing newline in the ref
            # file cannot end up inside the folder name; keep "main" if the
            # file turns out to be empty.
            revision = f.read().strip() or revision
    # 3. acquire the snapshot folder
    return os.path.join(repo_cache, "snapshots", revision)
47
+
48
+
49
def load_mm_projector(model_path, cache_dir=None, token=None):
    """Load ``mm_projector.bin`` from a local folder or a hub repo as float16 tensors.

    Args:
        model_path: local directory containing ``mm_projector.bin``, or a hub
            repo id to look up in the cache / download.
        cache_dir: cache root forwarded to the cache lookup and the download.
        token: access token forwarded to ``snapshot_download``.

    Returns:
        dict mapping each stored key to its value converted to ``torch.float16``.
    """
    weights_name = 'mm_projector.bin'
    if os.path.exists(os.path.join(model_path, weights_name)):
        folder = model_path
    else:
        folder = parse_snapshot_folder(model_path, cache_dir=cache_dir, repo_type="model")
        if not os.path.exists(os.path.join(folder, weights_name)):
            # downloading from remote repo
            from huggingface_hub import snapshot_download
            # Use the folder reported by the download: the path guessed above
            # was computed before the refs existed and may point at a
            # non-existent "snapshots/main" directory.
            folder = snapshot_download(repo_id=model_path, cache_dir=cache_dir, token=token)

    # NOTE(review): torch.load unpickles the file; only load projector weights
    # from sources you trust.
    mm_projector_weights = torch.load(os.path.join(folder, weights_name), map_location='cpu')
    return {k: v.to(torch.float16) for k, v in mm_projector_weights.items()}
64
+
65
+
66
class IdentityMap(nn.Module):
    """Projector that hands its input back untouched."""

    def __init__(self):
        super(IdentityMap, self).__init__()

    def forward(self, x, *args, **kwargs):
        # Extra positional/keyword arguments are accepted and ignored.
        return x

    @property
    def config(self):
        """Config dict describing this projector type."""
        projector_config = {"mm_projector_type": 'identity'}
        return projector_config
77
+
78
+
79
class SimpleResBlock(nn.Module):
    """LayerNorm followed by a two-layer GELU MLP added back onto the normed input."""

    def __init__(self, channels):
        super().__init__()
        self.pre_norm = nn.LayerNorm(channels)

        first = nn.Linear(channels, channels)
        second = nn.Linear(channels, channels)
        self.proj = nn.Sequential(first, nn.GELU(), second)

    def forward(self, x):
        normed = self.pre_norm(x)
        # The residual branch starts from the normalised tensor, not the raw input.
        update = self.proj(normed)
        return normed + update
93
+
94
+
95
def build_vision_projector(config, delay_load=False, **kwargs):
    """Create the vision-to-language projector selected by ``config.mm_projector_type``.

    The type defaults to ``'linear'`` when the attribute is absent.
    ``delay_load`` and extra keyword arguments are accepted but not used here.

    Raises:
        ValueError: if the projector type is not one of the supported names.
    """
    projector_type = getattr(config, 'mm_projector_type', 'linear')

    # "mlp<N>x_gelu": one input projection followed by N-1 GELU + Linear pairs.
    mlp_spec = re.match(r'^mlp(\d+)x_gelu$', projector_type)
    if mlp_spec is not None:
        num_linear = int(mlp_spec.group(1))
        layers = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
        for _ in range(num_linear - 1):
            layers.append(nn.GELU())
            layers.append(nn.Linear(config.hidden_size, config.hidden_size))
        return nn.Sequential(*layers)

    if projector_type == "linear":
        # NOTE: for both linear and mlp2x_gelu projector type, mean pooling is adopted to aggregate video features
        return nn.Linear(config.mm_hidden_size, config.hidden_size)
    if projector_type == "stc_connector":
        return STCConnector(config)
    if projector_type == "stp_connector":
        return STPConnector(config)
    if projector_type == "stc_connector_v35":
        return STCConnectorV35(config)
    if projector_type == "spatial_conv":
        return SpatialConv(config)
    if projector_type == "spatial_pool":
        return SpatialPool(config)
    if projector_type == 'identity':
        return IdentityMap()

    raise ValueError(f'Unknown projector type: {projector_type}')
123
+
124
+
125
def build_mlp(depth, hidden_size, output_hidden_size):
    """Stack ``depth`` linear layers with GELU in between.

    The first layer maps ``hidden_size`` to ``output_hidden_size``; every
    further layer keeps ``output_hidden_size``. A depth below 1 still yields
    the first layer.
    """
    layers = [nn.Linear(hidden_size, output_hidden_size)]
    remaining = depth - 1
    while remaining > 0:
        layers.extend((nn.GELU(), nn.Linear(output_hidden_size, output_hidden_size)))
        remaining -= 1
    return nn.Sequential(*layers)
131
+
132
+
133
class STCConnector(nn.Module):

    def __init__(self, config, downsample=(2, 2, 2), depth=4, mlp_depth=2):
        """Temporal Convolutional Vision-Language Connector.

        Args:
            config: config object providing ``mm_hidden_size`` and ``hidden_size``.
            downsample: (temporal, height, width) downsample rate.
            depth: depth of the spatial interaction blocks; ``0`` replaces both
                stages with identity layers.
            mlp_depth: depth of the vision-language projector layers.
        """
        super().__init__()
        self.encoder_hidden_size = encoder_hidden_size = config.mm_hidden_size
        self.hidden_size = hidden_size = config.hidden_size
        self.output_hidden_size = output_hidden_size = config.hidden_size
        # TODO: make these as config arguments
        self.depth = depth
        self.mlp_depth = mlp_depth
        self.downsample = downsample
        if depth != 0:
            self.s1 = RegStage(
                depth=depth,
                in_chs=encoder_hidden_size,
                out_chs=hidden_size,
                stride=1,
                dilation=1,
                act_layer=nn.SiLU,
                norm_layer=LayerNorm2d,
            )
            sampler_in_channels = hidden_size
        else:
            self.s1 = nn.Identity()
            # With an identity first stage the sampler sees the raw encoder
            # channels, so it has to perform the width change itself.
            # Identical to the previous layout whenever both widths match.
            sampler_in_channels = encoder_hidden_size
        self.sampler = nn.Sequential(
            nn.Conv3d(
                in_channels=sampler_in_channels,
                out_channels=hidden_size,
                kernel_size=downsample,
                stride=downsample,
                padding=1,
                bias=True
            ),
            nn.SiLU()
        )
        if depth != 0:
            self.s2 = RegStage(
                depth=depth,
                in_chs=hidden_size,
                out_chs=hidden_size,
                stride=1,
                dilation=1,
                act_layer=nn.SiLU,
                norm_layer=LayerNorm2d,
            )
        else:
            self.s2 = nn.Identity()
        self.readout = build_mlp(mlp_depth, hidden_size, output_hidden_size)

    def forward(self, x):
        """Aggregate tokens on the temporal and spatial dimensions.

        Args:
            x: input tokens [b, t, h, w, d] / [b, t, l, d]; in the 4-D form
                ``l`` must be a perfect square.
        Returns:
            aggregated tokens [b, l, d]
        Raises:
            ValueError: if ``x`` is neither 4-D nor 5-D, or if the 4-D token
                count is not a perfect square.
        """
        t = x.size(1)
        if x.ndim == 4:
            num_tokens = x.size(2)
            # round() guards against a float square root landing just below
            # the integer value.
            hw = int(round(num_tokens ** 0.5))
            if hw * hw != num_tokens:
                raise ValueError(
                    f'Expected a square token grid, got {num_tokens} tokens per frame'
                )
            x = einops.rearrange(x, "b t (h w) d -> b d t h w", h=hw, w=hw)
        elif x.ndim == 5:
            x = einops.rearrange(x, "b t h w d -> b d t h w")
        else:
            raise ValueError(f'Expected a 4-D or 5-D input, got {x.ndim} dimensions')

        x = einops.rearrange(x, "b d t h w -> (b t) d h w")
        # 1. the first stage of the adapter
        x = self.s1(x)
        x = einops.rearrange(x, "(b t) d h w -> b d t h w", t=t)
        # 2. downsampler
        x = self.sampler(x)
        new_t = x.size(2)
        # 3. the second stage of the adapter
        x = einops.rearrange(x, "b d t h w -> (b t) d h w")
        x = self.s2(x)
        x = einops.rearrange(x, "(b t) d h w -> b (t h w) d", t=new_t)
        x = self.readout(x)
        return x
216
+
217
+
218
class STPConnector(STCConnector):
    """Variant of ``STCConnector`` whose downsampler is average pooling plus SiLU."""

    def __init__(self, config, downsample=(2, 2, 2), depth=4, mlp_depth=2):
        super().__init__(config=config, downsample=downsample, depth=depth, mlp_depth=mlp_depth)
        self.sampler = nn.Sequential(nn.AvgPool3d(downsample), nn.SiLU())
        # Pooling keeps the channel count. With depth == 0 both stages are
        # identities, so the readout receives encoder-width features and must
        # be sized for them. Nothing changes when the two widths are equal.
        if depth == 0 and self.encoder_hidden_size != self.hidden_size:
            self.readout = build_mlp(mlp_depth, self.encoder_hidden_size, self.output_hidden_size)
223
+
224
+
225
class STCConnectorV35(STCConnector):
    """``STCConnector`` whose downsampling convolution uses no padding."""

    def __init__(self, config, downsample=(2, 2, 2), depth=4, mlp_depth=2):
        super().__init__(config=config, downsample=downsample, depth=depth, mlp_depth=mlp_depth)
        # Replace the parent's sampler with an unpadded convolution of the
        # same kernel and stride.
        unpadded_conv = nn.Conv3d(
            in_channels=self.hidden_size,
            out_channels=self.hidden_size,
            kernel_size=downsample,
            stride=downsample,
            padding=0,
            bias=True,
        )
        self.sampler = nn.Sequential(unpadded_conv, nn.SiLU())
239
+
240
+
241
class SpatialConv(STCConnector):
    """``STCConnector`` preset: downsample (1, 2, 2) and no interaction stages."""

    def __init__(self, config, downsample=(1, 2, 2), depth=0, mlp_depth=2):
        parent_kwargs = dict(config=config, downsample=downsample, depth=depth, mlp_depth=mlp_depth)
        super().__init__(**parent_kwargs)
245
+
246
+
247
class SpatialPool(STPConnector):
    """``STPConnector`` preset: downsample (1, 2, 2) and no interaction stages."""

    def __init__(self, config, downsample=(1, 2, 2), depth=0, mlp_depth=2):
        parent_kwargs = dict(config=config, downsample=downsample, depth=depth, mlp_depth=mlp_depth)
        super().__init__(**parent_kwargs)
videollama2/model/videollama2_arch.py ADDED
@@ -0,0 +1,264 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adapted from https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import os
17
+ from abc import ABC, abstractmethod
18
+
19
+ import einops
20
+ import torch
21
+ import torch.nn as nn
22
+
23
+ from .projector import load_mm_projector, build_vision_projector
24
+ from .encoder import build_vision_tower
25
+ from ..constants import IGNORE_INDEX, NUM_FRAMES, MODAL_INDEX_MAP
26
+
27
+
28
class Videollama2MetaModel:
    """Mixin that attaches a vision tower and a projector to a language model.

    Meant to be combined with a model class whose ``__init__`` accepts the
    config (``super().__init__(config)`` is forwarded to it).
    """

    def __init__(self, config):
        super(Videollama2MetaModel, self).__init__(config)

        if hasattr(config, "mm_vision_tower"):
            self.vision_tower = build_vision_tower(config, delay_load=True)
            self.mm_projector = build_vision_projector(config)

    def get_vision_tower(self):
        """Return the vision tower, unwrapping the single-element list used with FSDP."""
        vision_tower = getattr(self, 'vision_tower', None)
        if isinstance(vision_tower, list):
            vision_tower = vision_tower[0]
        return vision_tower

    @staticmethod
    def _select_prefixed_weights(weights, keyword):
        """Keep entries whose key contains ``keyword + '.'`` and strip everything up to it.

        Filtering and splitting use the same prefix, and the split happens only
        at the first occurrence, so keys that merely contain ``keyword`` (no
        dot) are skipped instead of raising and a repeated prefix no longer
        produces a wrong key.
        """
        prefix = keyword + '.'
        return {k.split(prefix, 1)[1]: v for k, v in weights.items() if prefix in k}

    def initialize_vision_modules(self, model_args, fsdp=None):
        """Build or reload the vision tower and projector from ``model_args``.

        Reads ``vision_tower``, ``mm_vision_select_layer``,
        ``mm_vision_select_feature`` and ``pretrain_mm_mlp_adapter`` from
        ``model_args`` and mirrors the relevant values onto ``self.config``.
        When ``pretrain_mm_mlp_adapter`` is set, projector weights are loaded
        from that local file/directory or, if the path does not exist, from
        the hub repo of that name.
        """
        vision_tower = model_args.vision_tower
        mm_vision_select_layer = model_args.mm_vision_select_layer
        mm_vision_select_feature = model_args.mm_vision_select_feature
        pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter

        self.config.mm_vision_tower = vision_tower

        use_fsdp = fsdp is not None and len(fsdp) > 0
        if self.get_vision_tower() is None:
            vision_tower = build_vision_tower(model_args)

            # Wrapped in a list when FSDP is active (see get_vision_tower).
            if use_fsdp:
                self.vision_tower = [vision_tower]
            else:
                self.vision_tower = vision_tower
        else:
            if use_fsdp:
                vision_tower = self.vision_tower[0]
            else:
                vision_tower = self.vision_tower
            vision_tower.load_model()

        self.config.use_mm_proj = True
        self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'linear')
        self.config.mm_hidden_size = vision_tower.hidden_size
        self.config.mm_vision_select_layer = mm_vision_select_layer
        self.config.mm_vision_select_feature = mm_vision_select_feature

        if getattr(self, 'mm_projector', None) is None:
            self.mm_projector = build_vision_projector(self.config)
        else:
            # In case it is frozen by LoRA
            for p in self.mm_projector.parameters():
                p.requires_grad = True

        if pretrain_mm_mlp_adapter is not None:
            if os.path.exists(pretrain_mm_mlp_adapter):
                if os.path.isdir(pretrain_mm_mlp_adapter):
                    mm_projector_weights = load_mm_projector(pretrain_mm_mlp_adapter)
                else:
                    # NOTE(review): torch.load unpickles the file; only use
                    # adapter checkpoints from trusted sources.
                    mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
            else:
                # Support loading projector weights from remote HuggingFace model hub
                pretrain_mm_mlp_adapter = pretrain_mm_mlp_adapter.replace('mm_projector.bin', '')
                pretrain_mm_mlp_adapter = pretrain_mm_mlp_adapter.strip('/').strip('\\').strip()
                mm_projector_weights = load_mm_projector(pretrain_mm_mlp_adapter)

            # set strict=False to avoid missing key error regarding bert.embeddings.position_ids
            self.mm_projector.load_state_dict(
                self._select_prefixed_weights(mm_projector_weights, 'mm_projector'), strict=False
            )
98
+
99
+
100
class Videollama2MetaForCausalLM(ABC):
    """Mixin that splices vision features into a causal LM's input embeddings.

    Concrete classes implement ``get_model`` and are expected to expose the
    ``config`` and ``device`` attributes read by the methods below.
    """

    @abstractmethod
    def get_model(self):
        pass

    def num_frames(self):
        """Return ``config.num_frames`` when set, otherwise the ``NUM_FRAMES`` constant."""
        if hasattr(self.config, 'num_frames'):
            return self.config.num_frames
        return NUM_FRAMES

    def get_vision_tower(self):
        return self.get_model().get_vision_tower()

    def encode_images_or_videos(self, images):
        """Encode ``(data, modal)`` pairs with the vision tower and aggregate over frames.

        Entries whose ``modal`` is ``'image'`` are expanded along their first
        dimension to ``num_frames`` so every sample can be stacked into one
        5-D batch.
        """
        num_frames = self.num_frames()

        data_batch = []
        for data, modal in images:
            if modal == 'image':
                data = data.expand(num_frames, -1, -1, -1)
            data_batch.append(data)

        data_batch = torch.stack(data_batch, dim=0)

        assert len(data_batch.size()) == 5
        batch_size = data_batch.size(0)

        frames = einops.rearrange(data_batch, 'b t c h w -> (b t) c h w')
        frames_features = self.get_model().get_vision_tower()(frames)
        frames_features = einops.rearrange(frames_features, '(b t) n h -> b t n h', b = batch_size)

        return self.temporal_aggregator(frames_features)

    def temporal_aggregator(self, frames_features):
        """Temporal aggregation of frame features.

        Args:
            frames_features (torch.Tensor): Frame features with shape (b, t, n, h).
        Returns:
            torch.Tensor: output of the configured projector. ``linear`` and
            ``mlp2x_gelu`` receive the mean over dimension 1; every other
            supported type receives the frame features unchanged.
        Raises:
            Exception: if ``config.mm_projector_type`` is not supported.
        """
        # TODO: improve the merging method.
        projector_type = self.config.mm_projector_type
        # *********** mean pooling *************
        if projector_type in ("mlp2x_gelu", "linear"):
            return self.get_model().mm_projector(frames_features.mean(1))
        # *********** spatial convolution / spatial pooling / time *************
        if (
            projector_type in ("spatial_conv", "spatial_pool")
            or "tc_connector" in projector_type
            or "tp_connector" in projector_type
        ):
            return self.get_model().mm_projector(frames_features)
        raise Exception(f"Unsupported projector type {self.config.mm_projector_type}!!!")

    def prepare_inputs_labels_for_multimodal(
        self, input_ids, attention_mask, past_key_values, labels, images
    ):
        """Replace multimodal placeholder tokens with encoded vision features.

        Returns:
            ``(input_ids, attention_mask, past_key_values, inputs_embeds, labels)``.
            In the text-only case the inputs are returned untouched with
            ``inputs_embeds=None``; otherwise ``input_ids`` is ``None`` and the
            embeddings, labels and attention mask are padded to a common length.
        """
        vision_tower = self.get_vision_tower()
        # NOTE: text-only situation
        if vision_tower is None or images is None or input_ids.shape[1] == 1:
            return input_ids, attention_mask, past_key_values, None, labels

        mm_features = self.encode_images_or_videos(images)

        new_input_embeds = []
        new_labels = [] if labels is not None else None
        cur_mm_idx = 0
        # replace image/video/audio tokens with pre-computed embeddings
        for batch_idx, cur_input_ids in enumerate(input_ids):
            num_multimodals = sum((cur_input_ids == mm_token_idx).sum() for mm_token_idx in MODAL_INDEX_MAP.values())
            # pure text input
            if num_multimodals == 0:
                half_len = cur_input_ids.shape[0] // 2
                cur_mm_features = mm_features[cur_mm_idx]
                cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids[:half_len])
                cur_input_embeds_2 = self.get_model().embed_tokens(cur_input_ids[half_len:])
                # The empty slice contributes no rows but still routes the
                # multimodal features through the concatenation.
                cur_input_embeds = torch.cat([cur_input_embeds_1, cur_mm_features[0:0], cur_input_embeds_2], dim=0)
                new_input_embeds.append(cur_input_embeds)
                if labels is not None:
                    new_labels.append(labels[batch_idx])
                cur_mm_idx += 1
                continue

            cur_new_input_embeds = []
            if labels is not None:
                cur_labels = labels[batch_idx]
                cur_new_labels = []
                assert cur_labels.shape == cur_input_ids.shape

            mm_token_indices = torch.where(sum([cur_input_ids == mm_token_idx for mm_token_idx in MODAL_INDEX_MAP.values()]))[0]
            while mm_token_indices.numel() > 0:
                cur_mm_features = mm_features[cur_mm_idx]
                mm_token_start = mm_token_indices[0]

                cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[:mm_token_start]))
                cur_new_input_embeds.append(cur_mm_features)
                if labels is not None:
                    cur_new_labels.append(cur_labels[:mm_token_start])
                    # Feature positions never contribute to the loss.
                    cur_new_labels.append(torch.full((cur_mm_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype))
                    cur_labels = cur_labels[mm_token_start+1:]

                cur_mm_idx += 1
                cur_input_ids = cur_input_ids[mm_token_start+1:]
                mm_token_indices = torch.where(sum([cur_input_ids == mm_token_idx for mm_token_idx in MODAL_INDEX_MAP.values()]))[0]

            if cur_input_ids.numel() > 0:
                cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids))
                if labels is not None:
                    cur_new_labels.append(cur_labels)
            cur_new_input_embeds = [x.to(device=self.device) for x in cur_new_input_embeds]
            # NOTE: one cur_new_input_embeds per each
            cur_new_input_embeds = torch.cat(cur_new_input_embeds, dim=0)
            new_input_embeds.append(cur_new_input_embeds)
            if labels is not None:
                cur_new_labels = torch.cat(cur_new_labels, dim=0)
                new_labels.append(cur_new_labels)

        # padding
        if any(x.shape != new_input_embeds[0].shape for x in new_input_embeds):
            # Lengths are taken from the embeddings so the attention mask can
            # be rebuilt even when no labels were supplied (e.g. batched
            # inference); the label-based bookkeeping used before is only
            # available when labels are given.
            unpadded_lens = [x.shape[0] for x in new_input_embeds]
            max_len = max(unpadded_lens)

            new_input_embeds_align = []
            for cur_new_embed in new_input_embeds:
                cur_new_embed = torch.cat((cur_new_embed, torch.zeros((max_len - cur_new_embed.shape[0], cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)), dim=0)
                new_input_embeds_align.append(cur_new_embed)
            new_input_embeds = torch.stack(new_input_embeds_align, dim=0)

            if labels is not None:
                new_labels_align = []
                for cur_new_label in new_labels:
                    cur_new_label = torch.cat((cur_new_label, torch.full((max_len - cur_new_label.shape[0],), IGNORE_INDEX, dtype=cur_new_label.dtype, device=cur_new_label.device)), dim=0)
                    new_labels_align.append(cur_new_label)
                new_labels = torch.stack(new_labels_align, dim=0)

            if attention_mask is not None:
                new_attention_mask = []
                for cur_attention_mask, cur_len in zip(attention_mask, unpadded_lens):
                    # Left: positions gained by inserting features (attended).
                    # Right: padding up to the batch maximum (masked).
                    new_attn_mask_pad_left = torch.full((cur_len - input_ids.shape[1],), True, dtype=attention_mask.dtype, device=attention_mask.device)
                    new_attn_mask_pad_right = torch.full((max_len - cur_len,), False, dtype=attention_mask.dtype, device=attention_mask.device)
                    cur_new_attention_mask = torch.cat((new_attn_mask_pad_left, cur_attention_mask, new_attn_mask_pad_right), dim=0)
                    new_attention_mask.append(cur_new_attention_mask)
                attention_mask = torch.stack(new_attention_mask, dim=0)
                assert attention_mask.shape == new_input_embeds.shape[:2]
        else:
            new_input_embeds = torch.stack(new_input_embeds, dim=0)
            if labels is not None:
                new_labels = torch.stack(new_labels, dim=0)

            if attention_mask is not None:
                new_attn_mask_pad_left = torch.full((attention_mask.shape[0], new_input_embeds.shape[1] - input_ids.shape[1]), True, dtype=attention_mask.dtype, device=attention_mask.device)
                attention_mask = torch.cat((new_attn_mask_pad_left, attention_mask), dim=1)
                assert attention_mask.shape == new_input_embeds.shape[:2]

        return None, attention_mask, past_key_values, new_input_embeds, new_labels
videollama2/model/videollama2_gemma2.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adapted from: https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ from typing import List, Optional, Tuple, Union
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+ from torch.nn import CrossEntropyLoss
22
+
23
+ from transformers import AutoConfig, AutoModelForCausalLM, \
24
+ Gemma2Config, Gemma2Model, Gemma2ForCausalLM
25
+
26
+ from transformers.modeling_outputs import CausalLMOutputWithPast
27
+ from transformers.generation.utils import GenerateOutput
28
+
29
+ from .videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
30
+
31
+
32
class Videollama2Gemma2Config(Gemma2Config):
    """Gemma2 config registered under the ``videollama2_gemma2`` model type."""

    model_type = "videollama2_gemma2"

    def __init__(self, **kwargs):
        super(Videollama2Gemma2Config, self).__init__(**kwargs)
        # Re-assert the model type on the instance after the parent initialiser ran.
        self.model_type = "videollama2_gemma2"
38
+
39
+
40
class Videollama2Gemma2Model(Videollama2MetaModel, Gemma2Model):
    """Gemma2 backbone combined with the VideoLLaMA2 vision-module mixin."""

    config_class = Videollama2Gemma2Config

    def __init__(self, config: Gemma2Config):
        # Cooperative init: the mixin runs first and forwards the config along the MRO.
        super().__init__(config)
45
+
46
+
47
class Videollama2Gemma2ForCausalLM(Gemma2ForCausalLM, Videollama2MetaForCausalLM):
    """Gemma2 causal LM that accepts ``images`` alongside the text inputs."""

    config_class = Videollama2Gemma2Config

    def __init__(self, config, **kwargs):
        # Deliberately start the init chain *after* Gemma2ForCausalLM so its
        # own initialiser is skipped; the backbone and head are created
        # explicitly below.
        super(Gemma2ForCausalLM, self).__init__(config)
        self.model = Videollama2Gemma2Model(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the LM, first merging ``images`` into the embeddings when needed.

        When ``inputs_embeds`` is not supplied, the inputs go through
        ``prepare_inputs_labels_for_multimodal``. ``position_ids`` and extra
        keyword arguments are accepted but not forwarded to the parent.
        The returned output object additionally carries the (possibly
        expanded) ``labels`` unless a plain tuple is returned.
        """
        if inputs_embeds is None:
            (
                input_ids,
                attention_mask,
                past_key_values,
                inputs_embeds,
                labels
            ) = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                attention_mask,
                past_key_values,
                labels,
                images
            )

        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        # With return_dict=False the parent returns a plain tuple, which
        # cannot take attributes; only annotate output objects.
        if not isinstance(outputs, tuple):
            outputs.labels = labels

        return outputs

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        """Generate from token ids, optionally conditioned on ``images``.

        Generation always runs on ``inputs_embeds`` computed here.

        Raises:
            NotImplementedError: if the caller passes ``inputs_embeds``.
        """
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images is not None:
            (
                _,
                attention_mask,
                _,
                inputs_embeds,
                _
            ) = self.prepare_inputs_labels_for_multimodal(
                input_ids=inputs,
                attention_mask=attention_mask,
                past_key_values=None,
                labels=None,
                images=images
            )
        else:
            inputs_embeds = self.get_model().embed_tokens(inputs)

        return super().generate(
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs
        )

    def _prepare_generated_length(self, model_input_name, inputs_tensor, **kwargs):
        # Remember the prompt length in embeddings so _get_cache can enlarge
        # the cache accordingly.
        if model_input_name == "inputs_embeds":
            self.inputs_embeds_length = inputs_tensor.size(1)
        else:
            self.inputs_embeds_length = 0
        return super()._prepare_generated_length(
            model_input_name=model_input_name,
            inputs_tensor=inputs_tensor,
            **kwargs)

    def _get_cache(self, cache_implementation: str, max_batch_size: int, max_cache_len: int, **kwargs):
        # Fall back to 0 when _prepare_generated_length has not run yet, so
        # this hook cannot fail on a missing attribute.
        extra_len = getattr(self, "inputs_embeds_length", 0)
        return super()._get_cache(
            cache_implementation=cache_implementation,
            max_batch_size=max_batch_size,
            max_cache_len=max_cache_len + extra_len,
            **kwargs)

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        """Delegate to the parent and pass ``images`` through when present."""
        images = kwargs.pop("images", None)
        _inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
        )
        if images is not None:
            _inputs['images'] = images
        return _inputs
173
+
174
+
175
+ AutoConfig.register("videollama2_gemma2", Videollama2Gemma2Config)
176
+ AutoModelForCausalLM.register(Videollama2Gemma2Config, Videollama2Gemma2ForCausalLM)
videollama2/model/videollama2_llama.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from: https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ from typing import List, Optional, Tuple, Union
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+
22
+ from transformers import AutoConfig, AutoModelForCausalLM, \
23
+ LlamaConfig, LlamaModel, LlamaForCausalLM
24
+ from transformers.modeling_outputs import CausalLMOutputWithPast
25
+ from transformers.generation.utils import GenerateOutput
26
+
27
+ from .videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
28
+
29
+
30
class Videollama2LlamaConfig(LlamaConfig):
    """``LlamaConfig`` subclass identified by the ``videollama2_llama`` model type."""

    model_type = "videollama2_llama"

    def __init__(self, **kwargs):
        super(Videollama2LlamaConfig, self).__init__(**kwargs)
        # Re-assert the identifier on the instance once the parent constructor has run.
        self.model_type = "videollama2_llama"
36
+
37
+
38
class Videollama2LlamaModel(Videollama2MetaModel, LlamaModel):
    """``LlamaModel`` backbone combined with the ``Videollama2MetaModel`` mixin."""

    config_class = Videollama2LlamaConfig

    def __init__(self, config: LlamaConfig):
        # Cooperative initialisation along the MRO (mixin first, then LlamaModel).
        super().__init__(config)
43
+
44
+
45
class Videollama2LlamaForCausalLM(LlamaForCausalLM, Videollama2MetaForCausalLM):
    """LLaMA causal language model that also accepts visual inputs via ``images``.

    Multimodal inputs are converted to embeddings with
    ``prepare_inputs_labels_for_multimodal`` (provided by the mixin) before the
    stock LLaMA ``forward``/``generate`` are invoked.
    """

    config_class = Videollama2LlamaConfig

    def __init__(self, config, **kwargs):
        # Start the MRO lookup *after* LlamaForCausalLM so that its own
        # __init__ is not run; the backbone and head are built explicitly below.
        super(LlamaForCausalLM, self).__init__(config)
        self.model = Videollama2LlamaModel(config)
        self.pretraining_tp = config.pretraining_tp
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        """Return the wrapped backbone model."""
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the language model, merging ``images`` into the inputs first.

        When ``inputs_embeds`` is not supplied, the ids/mask/cache/labels are
        rewritten by ``prepare_inputs_labels_for_multimodal``. The (possibly
        rewritten) labels are attached to the returned output object as
        ``outputs.labels``; tuple outputs are returned unchanged.

        NOTE(review): ``position_ids`` and extra ``**kwargs`` are accepted but
        not forwarded to the parent ``forward`` — confirm this is intended.
        """
        if inputs_embeds is None:
            (
                input_ids,
                attention_mask,
                past_key_values,
                inputs_embeds,
                labels
            ) = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                attention_mask,
                past_key_values,
                labels,
                images
            )

        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        # With return_dict=False the parent hands back a plain tuple, which
        # cannot carry extra attributes; only annotate output objects.
        if not isinstance(outputs, tuple):
            outputs.labels = labels

        return outputs

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        """Generate from token ids, optionally conditioned on ``images``.

        Raises:
            NotImplementedError: if ``inputs_embeds`` is passed by the caller.
        """
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images is not None:
            # Only the rewritten mask and the embeddings are needed here.
            (
                _,
                attention_mask,
                _,
                inputs_embeds,
                _
            ) = self.prepare_inputs_labels_for_multimodal(
                input_ids=inputs,
                attention_mask=attention_mask,
                past_key_values=None,
                labels=None,
                images=images
            )
        else:
            inputs_embeds = self.get_model().embed_tokens(inputs)

        return super().generate(
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        """Delegate to the parent and re-attach ``images`` when it was supplied."""
        images = kwargs.pop("images", None)
        _inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
        )
        if images is not None:
            _inputs['images'] = images
        return _inputs
154
+
155
+
156
+ AutoConfig.register("videollama2_llama", Videollama2LlamaConfig)
157
+ AutoModelForCausalLM.register(Videollama2LlamaConfig, Videollama2LlamaForCausalLM)
videollama2/model/videollama2_mistral.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from: https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ from typing import List, Optional, Tuple, Union
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+ from torch.nn import CrossEntropyLoss
22
+
23
+ from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, \
24
+ MistralConfig, MistralModel, MistralForCausalLM
25
+
26
+ from transformers.modeling_outputs import CausalLMOutputWithPast
27
+ from transformers.generation.utils import GenerateOutput
28
+
29
+ from .videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
30
+
31
+
32
class Videollama2MistralConfig(MistralConfig):
    """``MistralConfig`` subclass identified by the ``videollama2_mistral`` model type."""

    model_type = "videollama2_mistral"

    def __init__(self, **kwargs):
        super(Videollama2MistralConfig, self).__init__(**kwargs)
        # Re-assert the identifier on the instance once the parent constructor has run.
        self.model_type = "videollama2_mistral"
38
+
39
+
40
class Videollama2MistralModel(Videollama2MetaModel, MistralModel):
    """``MistralModel`` backbone combined with the ``Videollama2MetaModel`` mixin."""

    config_class = Videollama2MistralConfig

    def __init__(self, config: MistralConfig):
        # Cooperative initialisation along the MRO (mixin first, then MistralModel).
        super().__init__(config)
45
+
46
+
47
class Videollama2MistralForCausalLM(MistralForCausalLM, Videollama2MetaForCausalLM):
    """Mistral causal language model that also accepts visual inputs via ``images``.

    Multimodal inputs are converted to embeddings with
    ``prepare_inputs_labels_for_multimodal`` (provided by the mixin) before the
    stock Mistral ``forward``/``generate`` are invoked.
    """

    config_class = Videollama2MistralConfig

    def __init__(self, config, **kwargs):
        # Start the MRO lookup *after* MistralForCausalLM so that its own
        # __init__ is not run; the backbone and head are built explicitly below.
        super(MistralForCausalLM, self).__init__(config)
        self.model = Videollama2MistralModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        """Return the wrapped backbone model."""
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the language model, merging ``images`` into the inputs first.

        When ``inputs_embeds`` is not supplied, the ids/mask/cache/labels are
        rewritten by ``prepare_inputs_labels_for_multimodal``. The (possibly
        rewritten) labels are attached to the returned output object as
        ``outputs.labels``; tuple outputs are returned unchanged.

        NOTE(review): ``position_ids`` and extra ``**kwargs`` are accepted but
        not forwarded to the parent ``forward`` — confirm this is intended.
        """
        if inputs_embeds is None:
            (
                input_ids,
                attention_mask,
                past_key_values,
                inputs_embeds,
                labels
            ) = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                attention_mask,
                past_key_values,
                labels,
                images
            )

        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        # With return_dict=False the parent hands back a plain tuple, which
        # cannot carry extra attributes; only annotate output objects.
        if not isinstance(outputs, tuple):
            outputs.labels = labels

        return outputs

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        """Generate from token ids, optionally conditioned on ``images``.

        Raises:
            NotImplementedError: if ``inputs_embeds`` is passed by the caller.
        """
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images is not None:
            # Only the rewritten mask and the embeddings are needed here.
            (
                _,
                attention_mask,
                _,
                inputs_embeds,
                _
            ) = self.prepare_inputs_labels_for_multimodal(
                input_ids=inputs,
                attention_mask=attention_mask,
                past_key_values=None,
                labels=None,
                images=images
            )
        else:
            inputs_embeds = self.get_model().embed_tokens(inputs)

        return super().generate(
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        """Delegate to the parent and re-attach ``images`` when it was supplied."""
        images = kwargs.pop("images", None)
        _inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
        )
        if images is not None:
            _inputs['images'] = images
        return _inputs
156
+
157
+
158
+ AutoConfig.register("videollama2_mistral", Videollama2MistralConfig)
159
+ AutoModelForCausalLM.register(Videollama2MistralConfig, Videollama2MistralForCausalLM)
videollama2/model/videollama2_mixtral.py ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from typing import List, Optional, Tuple, Union
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+ from torch.nn import CrossEntropyLoss
21
+
22
+ from transformers import AutoConfig, AutoModelForCausalLM, \
23
+ MixtralConfig, MixtralModel, MixtralForCausalLM
24
+
25
+ from transformers.modeling_outputs import CausalLMOutputWithPast
26
+ from transformers.generation.utils import GenerateOutput
27
+
28
+ from .videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
29
+
30
+
31
class Videollama2MixtralConfig(MixtralConfig):
    """``MixtralConfig`` subclass identified by the ``videollama2_mixtral`` model type."""

    model_type = "videollama2_mixtral"

    def __init__(self, **kwargs):
        super(Videollama2MixtralConfig, self).__init__(**kwargs)
        # Re-assert the identifier on the instance once the parent constructor has run.
        self.model_type = "videollama2_mixtral"
37
+
38
+
39
class Videollama2MixtralModel(Videollama2MetaModel, MixtralModel):
    """``MixtralModel`` backbone combined with the ``Videollama2MetaModel`` mixin."""

    config_class = Videollama2MixtralConfig

    def __init__(self, config: MixtralConfig):
        # Cooperative initialisation along the MRO (mixin first, then MixtralModel).
        super().__init__(config)
44
+
45
+
46
class Videollama2MixtralForCausalLM(MixtralForCausalLM, Videollama2MetaForCausalLM):
    """Mixtral causal language model that also accepts visual inputs via ``images``."""

    config_class = Videollama2MixtralConfig

    def __init__(self, config, **kwargs):
        # MRO lookup starts after MixtralForCausalLM, so its own __init__ is
        # not run; the backbone and head are created explicitly instead.
        super(MixtralForCausalLM, self).__init__(config)
        self.model = Videollama2MixtralModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        """Return the wrapped backbone model."""
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[int] = None,
        **kwargs
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Merge ``images`` into the inputs (if needed) and call the parent ``forward``.

        NOTE(review): ``position_ids`` and ``**kwargs`` are accepted but not
        forwarded, and ``cache_position`` is annotated ``Optional[int]`` —
        confirm both are intended.
        """
        if inputs_embeds is None:
            prepared = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                attention_mask,
                past_key_values,
                labels,
                images,
            )
            input_ids, attention_mask, past_key_values, inputs_embeds, labels = prepared

        return super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        """Generate from token ids, optionally conditioned on ``images``.

        Raises:
            NotImplementedError: if ``inputs_embeds`` is passed by the caller.
        """
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images is None:
            inputs_embeds = self.get_model().embed_tokens(inputs)
        else:
            # Only the rewritten mask and the embeddings are used afterwards.
            _, attention_mask, _, inputs_embeds, _ = self.prepare_inputs_labels_for_multimodal(
                input_ids=inputs,
                attention_mask=attention_mask,
                past_key_values=None,
                labels=None,
                images=images,
            )

        return super().generate(
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        """Delegate to the parent and re-attach ``images`` when it was supplied."""
        visual_inputs = kwargs.pop("images", None)
        model_inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
        )
        if visual_inputs is not None:
            model_inputs['images'] = visual_inputs
        return model_inputs
151
+
152
+
153
+ AutoConfig.register("videollama2_mixtral", Videollama2MixtralConfig)
154
+ AutoModelForCausalLM.register(Videollama2MixtralConfig, Videollama2MixtralForCausalLM)
videollama2/model/videollama2_phi3.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from: https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ from typing import List, Optional, Tuple, Union
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+ from torch.nn import CrossEntropyLoss
22
+
23
+ from transformers import AutoConfig, AutoModelForCausalLM, PretrainedConfig, \
24
+ Phi3Config, Phi3Model, Phi3ForCausalLM
25
+
26
+ from transformers.modeling_outputs import CausalLMOutputWithPast
27
+ from transformers.generation.utils import GenerateOutput
28
+
29
+ from .videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
30
+
31
+
32
class Videollama2Phi3Config(Phi3Config):
    """``Phi3Config`` subclass identified by the ``videollama2_phi3`` model type."""

    model_type = "videollama2_phi3"

    def __init__(self, **kwargs):
        super(Videollama2Phi3Config, self).__init__(**kwargs)
        # Re-assert the identifier on the instance once the parent constructor has run.
        self.model_type = "videollama2_phi3"
38
+
39
+
40
class Videollama2Phi3Model(Videollama2MetaModel, Phi3Model):
    """``Phi3Model`` backbone combined with the ``Videollama2MetaModel`` mixin."""

    config_class = Videollama2Phi3Config

    def __init__(self, config: Phi3Config):
        # Cooperative initialisation along the MRO (mixin first, then Phi3Model).
        super().__init__(config)
45
+
46
+
47
class Videollama2Phi3ForCausalLM(Phi3ForCausalLM, Videollama2MetaForCausalLM):
    """Phi-3 causal language model that also accepts visual inputs via ``images``.

    Multimodal inputs are converted to embeddings with
    ``prepare_inputs_labels_for_multimodal`` (provided by the mixin) before the
    stock Phi-3 ``forward``/``generate`` are invoked.
    """

    config_class = Videollama2Phi3Config

    def __init__(self, config, **kwargs):
        # Start the MRO lookup *after* Phi3ForCausalLM so that its own
        # __init__ is not run; the backbone and head are built explicitly below.
        super(Phi3ForCausalLM, self).__init__(config)
        self.model = Videollama2Phi3Model(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        """Return the wrapped backbone model."""
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the language model, merging ``images`` into the inputs first.

        When ``inputs_embeds`` is not supplied, the ids/mask/cache/labels are
        rewritten by ``prepare_inputs_labels_for_multimodal``. The (possibly
        rewritten) labels are attached to the returned output object as
        ``outputs.labels``; tuple outputs are returned unchanged.

        NOTE(review): ``position_ids`` and extra ``**kwargs`` are accepted but
        not forwarded to the parent ``forward`` — confirm this is intended.
        """
        if inputs_embeds is None:
            (
                input_ids,
                attention_mask,
                past_key_values,
                inputs_embeds,
                labels
            ) = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                attention_mask,
                past_key_values,
                labels,
                images
            )

        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

        # With return_dict=False the parent hands back a plain tuple, which
        # cannot carry extra attributes; only annotate output objects.
        if not isinstance(outputs, tuple):
            outputs.labels = labels

        return outputs

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        """Generate from token ids, optionally conditioned on ``images``.

        Raises:
            NotImplementedError: if ``inputs_embeds`` is passed by the caller.
        """
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images is not None:
            # Only the rewritten mask and the embeddings are needed here.
            (
                _,
                attention_mask,
                _,
                inputs_embeds,
                _
            ) = self.prepare_inputs_labels_for_multimodal(
                input_ids=inputs,
                attention_mask=attention_mask,
                past_key_values=None,
                labels=None,
                images=images
            )
        else:
            inputs_embeds = self.get_model().embed_tokens(inputs)

        return super().generate(
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        """Delegate to the parent and re-attach ``images`` when it was supplied."""
        images = kwargs.pop("images", None)
        _inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
        )
        if images is not None:
            _inputs['images'] = images
        return _inputs
156
+
157
+
158
+ AutoConfig.register("videollama2_phi3", Videollama2Phi3Config)
159
+ AutoModelForCausalLM.register(Videollama2Phi3Config, Videollama2Phi3ForCausalLM)
videollama2/model/videollama2_qwen2.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from: https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Copyright 2023 Haotian Liu
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+
17
+ from typing import List, Optional, Tuple, Union
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+
22
+ from transformers import AutoConfig, AutoModelForCausalLM, \
23
+ Qwen2Config, Qwen2Model, Qwen2ForCausalLM
24
+ from transformers.modeling_outputs import CausalLMOutputWithPast
25
+ from transformers.generation.utils import GenerateOutput
26
+
27
+ from .videollama2_arch import Videollama2MetaModel, Videollama2MetaForCausalLM
28
+
29
+
30
class Videollama2Qwen2Config(Qwen2Config):
    """``Qwen2Config`` subclass identified by the ``videollama2_qwen2`` model type."""

    model_type = "videollama2_qwen2"

    def __init__(self, **kwargs):
        super(Videollama2Qwen2Config, self).__init__(**kwargs)
        # Re-assert the identifier on the instance once the parent constructor has run.
        self.model_type = "videollama2_qwen2"
36
+
37
+
38
class Videollama2Qwen2Model(Videollama2MetaModel, Qwen2Model):
    """``Qwen2Model`` backbone combined with the ``Videollama2MetaModel`` mixin."""

    config_class = Videollama2Qwen2Config

    def __init__(self, config: Videollama2Qwen2Config):
        # Cooperative initialisation along the MRO (mixin first, then Qwen2Model).
        super().__init__(config)
43
+
44
+
45
class Videollama2Qwen2ForCausalLM(Qwen2ForCausalLM, Videollama2MetaForCausalLM):
    """Qwen2 causal language model that also accepts visual inputs via ``images``."""

    config_class = Videollama2Qwen2Config

    def __init__(self, config, **kwargs):
        # MRO lookup starts after Qwen2ForCausalLM, so its own __init__ is
        # not run; the backbone and head are created explicitly instead.
        super(Qwen2ForCausalLM, self).__init__(config)
        self.model = Videollama2Qwen2Model(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        """Return the wrapped backbone model."""
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[int] = None,
        **kwargs
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Merge ``images`` into the inputs (if needed) and call the parent ``forward``.

        NOTE(review): ``position_ids`` and ``**kwargs`` are accepted but not
        forwarded, and ``cache_position`` is annotated ``Optional[int]`` —
        confirm both are intended.
        """
        if inputs_embeds is None:
            prepared = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                attention_mask,
                past_key_values,
                labels,
                images,
            )
            input_ids, attention_mask, past_key_values, inputs_embeds, labels = prepared

        return super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            cache_position=cache_position,
        )

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        """Generate from token ids, optionally conditioned on ``images``.

        Raises:
            NotImplementedError: if ``inputs_embeds`` is passed by the caller.
        """
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images is None:
            inputs_embeds = self.get_model().embed_tokens(inputs)
        else:
            # Only the rewritten mask and the embeddings are used afterwards.
            _, attention_mask, _, inputs_embeds, _ = self.prepare_inputs_labels_for_multimodal(
                input_ids=inputs,
                attention_mask=attention_mask,
                past_key_values=None,
                labels=None,
                images=images,
            )

        return super().generate(
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        """Delegate to the parent and re-attach ``images`` when it was supplied."""
        visual_inputs = kwargs.pop("images", None)
        model_inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
        )
        if visual_inputs is not None:
            model_inputs['images'] = visual_inputs
        return model_inputs
150
+
151
+
152
+ AutoConfig.register("videollama2_qwen2", Videollama2Qwen2Config)
153
+ AutoModelForCausalLM.register(Videollama2Qwen2Config, Videollama2Qwen2ForCausalLM)
videollama2/serve/cli.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import torch
3
+
4
+ from videollama2.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, NUM_FRAMES
5
+ from videollama2.conversation import conv_templates, SeparatorStyle
6
+ from videollama2.model.builder import load_pretrained_model
7
+ from videollama2.utils import disable_torch_init
8
+ from videollama2.mm_utils import process_images, tokenizer_image_token, get_model_name_from_path, tokenizer_MMODAL_token
9
+
10
+ from PIL import Image
11
+ from decord import VideoReader, cpu
12
+
13
+ import requests
14
+ from io import BytesIO
15
+ from transformers import TextStreamer
16
+
17
+
18
def load_image(image_file):
    """Open an image from a local path or an HTTP(S) URL and convert it to RGB.

    Args:
        image_file: Filesystem path, or a URL starting with ``http://`` or
            ``https://``.

    Returns:
        The image converted with ``.convert('RGB')``.

    Raises:
        requests.HTTPError: if the download answers with an error status.
        requests.Timeout: if the server does not respond in time.
    """
    if image_file.startswith(('http://', 'https://')):
        # Bound the wait and fail on 4xx/5xx instead of handing an error page
        # to PIL as if it were image data.
        response = requests.get(image_file, timeout=30)
        response.raise_for_status()
        source = BytesIO(response.content)
    else:
        source = image_file
    return Image.open(source).convert('RGB')
25
+
26
def load_video(video_file):
    """Sample ``NUM_FRAMES`` evenly spaced frames from a video file.

    Args:
        video_file: Path/URI handed to ``decord.VideoReader``.

    Returns:
        The batch returned by ``VideoReader.get_batch`` for the sampled
        frame indices.
    """
    # numpy is not imported at the top of this module; import it locally so
    # the function does not fail with a NameError.
    import numpy as np

    decord_vr = VideoReader(uri=video_file, ctx=cpu(0))
    duration = len(decord_vr)
    # Indices spread uniformly over [0, duration - 1].
    frame_id_list = np.linspace(0, duration - 1, NUM_FRAMES, dtype=int)
    return decord_vr.get_batch(frame_id_list)
32
+
33
def load_image_or_video(image_or_video_file):
    """Load an image or a video, chosen by the file extension.

    Args:
        image_or_video_file: Path or URL whose extension selects the loader.
            The extension is matched case-insensitively.

    Returns:
        The result of ``load_image`` for ``.jpg/.jpeg/.png/.bmp`` files or of
        ``load_video`` for ``.mp4/.avi/.mov`` files.

    Raises:
        Exception: if the extension is not one of the supported ones.
    """
    # Compare on a lower-cased copy so that e.g. ``clip.MP4`` is recognised;
    # the original string is what gets passed on to the loaders.
    lowered = image_or_video_file.lower()
    if lowered.endswith(('.jpg', '.jpeg', '.png', '.bmp')):
        return load_image(image_file=image_or_video_file)
    if lowered.endswith(('.mp4', '.avi', '.mov')):
        return load_video(video_file=image_or_video_file)
    raise Exception(f"File type of {image_or_video_file} not supported!!!")
40
+
41
+
42
def main(args):
    """Run an interactive single-image chat session on the terminal.

    Reads from ``args``: ``model_path``, ``model_base``, ``load_8bit``,
    ``load_4bit``, ``device``, ``conv_mode``, ``image_file``, ``temperature``,
    ``max_new_tokens`` and ``debug``. The loop ends on an empty line or EOF.
    """
    # Model
    disable_torch_init()

    model_name = get_model_name_from_path(args.model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(
        args.model_path, args.model_base, model_name, args.load_8bit, args.load_4bit, device=args.device)

    conv_mode = "llava_v1"  # fix conversation mode for now

    if args.conv_mode is not None and conv_mode != args.conv_mode:
        print('[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}'.format(conv_mode, args.conv_mode, args.conv_mode))
    else:
        args.conv_mode = conv_mode

    conv = conv_templates[args.conv_mode].copy()
    roles = conv.roles

    image = load_image(args.image_file)
    # Similar operation in model_worker.py
    image_tensor = process_images([image], image_processor, model.config)
    if isinstance(image_tensor, list):
        image_tensor = [tensor.to(model.device, dtype=torch.float16) for tensor in image_tensor]
    else:
        image_tensor = image_tensor.to(model.device, dtype=torch.float16)

    while True:
        try:
            inp = input(f"{roles[0]}: ")
        except EOFError:
            inp = ""
        if not inp:
            print("exit...")
            break

        print(f"{roles[1]}: ", end="")

        if image is not None:
            # first message: prepend the image placeholder token(s)
            if model.config.mm_use_im_start_end:
                inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + inp
            else:
                inp = DEFAULT_IMAGE_TOKEN + '\n' + inp
            conv.append_message(conv.roles[0], inp)
            # The placeholder is only inserted once per session.
            image = None
        else:
            # later messages
            conv.append_message(conv.roles[0], inp)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(model.device)
        streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

        with torch.inference_mode():
            # `image_sizes` is not passed: none of the generate()/forward()
            # implementations in this package accept it, so it would be
            # rejected as an unused model kwarg.
            output_ids = model.generate(
                input_ids,
                images=image_tensor,
                do_sample=args.temperature > 0,
                temperature=args.temperature,
                max_new_tokens=args.max_new_tokens,
                streamer=streamer,
                use_cache=True)

        # The models' generate() feeds `inputs_embeds` to the underlying
        # generator, so the returned ids hold only newly generated tokens;
        # slicing off `input_ids.shape[1]` positions would discard the reply.
        outputs = tokenizer.decode(output_ids[0]).strip()
        conv.messages[-1][-1] = outputs

        if args.debug:
            print("\n", {"prompt": prompt, "outputs": outputs}, "\n")
124
+
125
+
126
+ if __name__ == "__main__":
127
+ parser = argparse.ArgumentParser()
128
+ parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
129
+ parser.add_argument("--model-base", type=str, default=None)
130
+ parser.add_argument("--image-file", type=str, required=True)
131
+ parser.add_argument("--device", type=str, default="cuda")
132
+ parser.add_argument("--conv-mode", type=str, default=None)
133
+ parser.add_argument("--temperature", type=float, default=0.2)
134
+ parser.add_argument("--max-new-tokens", type=int, default=512)
135
+ parser.add_argument("--load-8bit", action="store_true")
136
+ parser.add_argument("--load-4bit", action="store_true")
137
+ parser.add_argument("--debug", action="store_true")
138
+ args = parser.parse_args()
139
+ main(args)
videollama2/serve/controller.py ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A controller manages distributed workers.
3
+ It sends worker addresses to clients.
4
+ """
5
+ import argparse
6
+ import asyncio
7
+ import dataclasses
8
+ from enum import Enum, auto
9
+ import json
10
+ import logging
11
+ import time
12
+ from typing import List, Union
13
+ import threading
14
+
15
+ from fastapi import FastAPI, Request
16
+ from fastapi.responses import StreamingResponse
17
+ import numpy as np
18
+ import requests
19
+ import uvicorn
20
+
21
+ from videollama2.constants import CONTROLLER_HEART_BEAT_EXPIRATION
22
+ from videollama2.utils import build_logger, server_error_msg
23
+
24
+
25
+ logger = build_logger("controller", "controller.log")
26
+
27
+
28
class DispatchMethod(Enum):
    """Strategy used to choose a worker for an incoming model request."""
    LOTTERY = auto()
    SHORTEST_QUEUE = auto()

    @classmethod
    def from_str(cls, name):
        """Return the member matching ``name``.

        Accepted names are ``"lottery"`` and ``"shortest_queue"``.

        Raises:
            ValueError: if ``name`` is not one of the accepted names; the
                message includes the rejected value.
        """
        if name == "lottery":
            return cls.LOTTERY
        elif name == "shortest_queue":
            return cls.SHORTEST_QUEUE
        else:
            raise ValueError(f"Invalid dispatch method: {name}")
40
+
41
+
42
@dataclasses.dataclass
class WorkerInfo:
    """Controller-side record describing one registered worker."""
    model_names: List[str]
    speed: int
    queue_length: int
    check_heart_beat: bool
    # time.time() stamp of the latest registration / heart beat; it is
    # compared numerically against the expiration threshold, hence float.
    last_heart_beat: float
49
+
50
+
51
def heart_beat_controller(controller):
    """Run forever, periodically asking ``controller`` to drop expired workers.

    Never returns; it is meant to be the target of a background thread.
    """
    while True:
        # The sweep interval equals the expiration window itself.
        time.sleep(CONTROLLER_HEART_BEAT_EXPIRATION)
        controller.remove_stable_workers_by_expiration()
55
+
56
+
57
class Controller:
    """Registry of model workers plus the logic that dispatches to them.

    Workers are keyed by ``worker_name``, which is also used as the base URL
    for HTTP calls to the worker. A daemon thread started in ``__init__``
    periodically removes workers whose heart beat has expired.
    """

    def __init__(self, dispatch_method: str):
        # Dict[str -> WorkerInfo]
        self.worker_info = {}
        self.dispatch_method = DispatchMethod.from_str(dispatch_method)

        self.heart_beat_thread = threading.Thread(
            target=heart_beat_controller, args=(self,), daemon=True)
        self.heart_beat_thread.start()

        logger.info("Init controller")

    def register_worker(self, worker_name: str, check_heart_beat: bool,
                        worker_status: dict):
        """Add or refresh the entry for ``worker_name``.

        If ``worker_status`` is empty, the status is fetched from the worker.
        Returns False when no status could be obtained, True otherwise.
        """
        if worker_name not in self.worker_info:
            logger.info(f"Register a new worker: {worker_name}")
        else:
            logger.info(f"Register an existing worker: {worker_name}")

        if not worker_status:
            worker_status = self.get_worker_status(worker_name)
        if not worker_status:
            return False

        self.worker_info[worker_name] = WorkerInfo(
            worker_status["model_names"], worker_status["speed"], worker_status["queue_length"],
            check_heart_beat, time.time())

        logger.info(f"Register done: {worker_name}, {worker_status}")
        return True

    def get_worker_status(self, worker_name: str):
        """POST to the worker's status endpoint.

        Returns the decoded JSON body, or None on a request error or a
        non-200 response.
        """
        try:
            r = requests.post(worker_name + "/worker_get_status", timeout=5)
        except requests.exceptions.RequestException as e:
            logger.error(f"Get status fails: {worker_name}, {e}")
            return None

        if r.status_code != 200:
            logger.error(f"Get status fails: {worker_name}, {r}")
            return None

        return r.json()

    def remove_worker(self, worker_name: str):
        """Forget ``worker_name``; a no-op if it is not registered."""
        # pop() instead of del: the entry may already be gone (e.g. after
        # refresh_all_workers rebuilt the table), and a KeyError here would
        # terminate the heart-beat thread.
        self.worker_info.pop(worker_name, None)

    def refresh_all_workers(self):
        """Re-register every known worker, dropping those that do not respond."""
        old_info = dict(self.worker_info)
        self.worker_info = {}

        for w_name, w_info in old_info.items():
            if not self.register_worker(w_name, w_info.check_heart_beat, None):
                logger.info(f"Remove stale worker: {w_name}")

    def list_models(self):
        """Return the distinct model names served by registered workers."""
        model_names = set()

        for w_info in self.worker_info.values():
            model_names.update(w_info.model_names)

        return list(model_names)

    def get_worker_address(self, model_name: str):
        """Pick a worker serving ``model_name`` using the dispatch method.

        Returns the worker name, or "" when no suitable worker exists.

        Raises:
            ValueError: if ``self.dispatch_method`` is not a known method.
        """
        if self.dispatch_method == DispatchMethod.LOTTERY:
            worker_names = []
            worker_speeds = []
            for w_name, w_info in self.worker_info.items():
                if model_name in w_info.model_names:
                    worker_names.append(w_name)
                    worker_speeds.append(w_info.speed)
            worker_speeds = np.array(worker_speeds, dtype=np.float32)
            norm = np.sum(worker_speeds)
            if norm < 1e-4:
                return ""
            # Random draw weighted by each worker's share of the total speed.
            # The address is returned directly, without probing the worker.
            worker_speeds = worker_speeds / norm
            pt = np.random.choice(np.arange(len(worker_names)),
                                  p=worker_speeds)
            return worker_names[pt]
        elif self.dispatch_method == DispatchMethod.SHORTEST_QUEUE:
            worker_names = []
            worker_qlen = []
            for w_name, w_info in self.worker_info.items():
                if model_name in w_info.model_names:
                    worker_names.append(w_name)
                    worker_qlen.append(w_info.queue_length / w_info.speed)
            if len(worker_names) == 0:
                return ""
            min_index = np.argmin(worker_qlen)
            w_name = worker_names[min_index]
            # Count the request now; the next heart beat overwrites this
            # with the worker's own figure.
            self.worker_info[w_name].queue_length += 1
            logger.info(f"names: {worker_names}, queue_lens: {worker_qlen}, ret: {w_name}")
            return w_name
        else:
            raise ValueError(f"Invalid dispatch method: {self.dispatch_method}")

    def receive_heart_beat(self, worker_name: str, queue_length: int):
        """Record a heart beat; returns False if the worker is unknown."""
        if worker_name not in self.worker_info:
            logger.info(f"Receive unknown heart beat. {worker_name}")
            return False

        self.worker_info[worker_name].queue_length = queue_length
        self.worker_info[worker_name].last_heart_beat = time.time()
        logger.info(f"Receive heart beat. {worker_name}")
        return True

    def remove_stable_workers_by_expiration(self):
        """Remove heart-beat-checked workers whose last beat is too old."""
        expire = time.time() - CONTROLLER_HEART_BEAT_EXPIRATION
        # Iterate over a snapshot: this runs on the heart-beat thread while
        # other methods may add or replace entries in worker_info.
        to_delete = [
            worker_name
            for worker_name, w_info in list(self.worker_info.items())
            if w_info.check_heart_beat and w_info.last_heart_beat < expire
        ]

        for worker_name in to_delete:
            self.remove_worker(worker_name)

    def worker_api_generate_stream(self, params):
        """Proxy a streaming generation request to a worker.

        Yields NUL-terminated byte chunks. When no worker is available or the
        request to the worker fails, yields a single JSON error chunk.
        """
        worker_addr = self.get_worker_address(params["model"])
        if not worker_addr:
            logger.info(f"no worker: {params['model']}")
            ret = {
                "text": server_error_msg,
                "error_code": 2,
            }
            yield json.dumps(ret).encode() + b"\0"
            # Stop here: without this the request below would be attempted
            # against an empty address and emit a second error chunk.
            return

        try:
            response = requests.post(worker_addr + "/worker_generate_stream",
                                     json=params, stream=True, timeout=5)
            for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"):
                if chunk:
                    yield chunk + b"\0"
        except requests.exceptions.RequestException:
            logger.info(f"worker timeout: {worker_addr}")
            ret = {
                "text": server_error_msg,
                "error_code": 3,
            }
            yield json.dumps(ret).encode() + b"\0"

    # Let the controller act as a worker to achieve hierarchical
    # management. This can be used to connect isolated sub networks.
    def worker_api_get_status(self):
        """Aggregate the live status of all registered workers.

        Returns a dict with the union of model names and the summed speed and
        queue length of the workers that answered the status request.
        """
        model_names = set()
        speed = 0
        queue_length = 0

        # Snapshot the names: each status call is a blocking HTTP request,
        # during which the table may change.
        for w_name in list(self.worker_info):
            worker_status = self.get_worker_status(w_name)
            if worker_status is not None:
                model_names.update(worker_status["model_names"])
                speed += worker_status["speed"]
                queue_length += worker_status["queue_length"]

        return {
            "model_names": list(model_names),
            "speed": speed,
            "queue_length": queue_length,
        }
237
+
238
+
239
app = FastAPI()


@app.post("/register_worker")
async def register_worker(request: Request):
    """Register the worker described by the JSON request body."""
    data = await request.json()
    controller.register_worker(
        data["worker_name"], data["check_heart_beat"],
        data.get("worker_status", None))


@app.post("/refresh_all_workers")
async def refresh_all_workers():
    """Re-register all known workers; responds with an empty body."""
    # Controller.refresh_all_workers returns nothing, so there is no
    # result to bind or send back.
    controller.refresh_all_workers()


@app.post("/list_models")
async def list_models():
    """Return the model names currently served, as ``{"models": [...]}``."""
    models = controller.list_models()
    return {"models": models}


@app.post("/get_worker_address")
async def get_worker_address(request: Request):
    """Return ``{"address": ...}`` for the model named in the request body."""
    data = await request.json()
    addr = controller.get_worker_address(data["model"])
    return {"address": addr}


@app.post("/receive_heart_beat")
async def receive_heart_beat(request: Request):
    """Record a worker heart beat; ``exist`` tells whether it is registered."""
    data = await request.json()
    exist = controller.receive_heart_beat(
        data["worker_name"], data["queue_length"])
    return {"exist": exist}


@app.post("/worker_generate_stream")
async def worker_api_generate_stream(request: Request):
    """Stream generation output obtained through the controller."""
    params = await request.json()
    generator = controller.worker_api_generate_stream(params)
    return StreamingResponse(generator)


@app.post("/worker_get_status")
async def worker_api_get_status(request: Request):
    """Return the aggregated status of the controller's workers."""
    return controller.worker_api_get_status()
286
+
287
+
288
if __name__ == "__main__":
    # Script entry point: parse options, build the controller, serve the app.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--host", type=str, default="localhost")
    arg_parser.add_argument("--port", type=int, default=21001)
    arg_parser.add_argument(
        "--dispatch-method",
        type=str,
        choices=["lottery", "shortest_queue"],
        default="shortest_queue",
    )
    args = arg_parser.parse_args()
    logger.info(f"args: {args}")

    # The route handlers look this up as a module-level global.
    controller = Controller(args.dispatch_method)
    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
videollama2/serve/examples/1034346401.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:08b62a634fe49edc0a19fc53f6ea5cfb345d9b2a6a7047811344c16832dc42b2
3
+ size 1678095
videollama2/serve/examples/desert.jpg ADDED
videollama2/serve/examples/extreme_ironing.jpg ADDED
videollama2/serve/examples/sample_demo_1.mp4 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fc6562a172eb9cb3c760a3c9992349c1faa2c793c112b7b9e50bd5cb17c2164d
3
+ size 1549315
videollama2/serve/examples/sample_demo_3.mp4 ADDED
Binary file (464 kB). View file
 
videollama2/serve/examples/sample_demo_9.mp4 ADDED
Binary file (632 kB). View file
 
videollama2/serve/examples/waterview.jpg ADDED
videollama2/serve/gradio_web_server.py ADDED
@@ -0,0 +1,499 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import time
4
+ import hashlib
5
+ import requests
6
+ import argparse
7
+ import datetime
8
+
9
+ import numpy as np
10
+ import gradio as gr
11
+ from decord import VideoReader, cpu
12
+
13
+ from videollama2.constants import LOGDIR, NUM_FRAMES
14
+ from videollama2.conversation import (default_conversation, conv_templates,SeparatorStyle)
15
+ from videollama2.utils import (build_logger, server_error_msg, violates_moderation, moderation_msg)
16
+
17
+
18
+ logger = build_logger("gradio_web_server", "gradio_web_server.log")
19
+
20
+ headers = {"User-Agent": "Videollama2 Client"}
21
+
22
+ no_change_btn = gr.Button.update()
23
+ enable_btn = gr.Button.update(interactive=True)
24
+ disable_btn = gr.Button.update(interactive=False)
25
+
26
+ priority = {
27
+ "vicuna-13b": "aaaaaaa",
28
+ "koala-13b": "aaaaaab",
29
+ }
30
+
31
+
32
def get_conv_log_filename():
    """Return the path of today's conversation log file inside LOGDIR."""
    now = datetime.datetime.now()
    day_stamp = f"{now.year}-{now.month:02d}-{now.day:02d}"
    return os.path.join(LOGDIR, f"{day_stamp}-conv.json")
36
+
37
+
38
def get_model_list():
    """Refresh the controller's workers and return the sorted model names.

    Names found in ``priority`` sort by their priority key; all others sort
    by their own name.

    Raises:
        RuntimeError: if either controller call does not answer HTTP 200.
    """
    # Explicit checks rather than assert, which is stripped under -O.
    ret = requests.post(args.controller_url + "/refresh_all_workers")
    if ret.status_code != 200:
        raise RuntimeError(
            f"refresh_all_workers failed with HTTP {ret.status_code}")
    ret = requests.post(args.controller_url + "/list_models")
    if ret.status_code != 200:
        raise RuntimeError(
            f"list_models failed with HTTP {ret.status_code}")
    models = ret.json()["models"]
    models.sort(key=lambda x: priority.get(x, x))
    logger.info(f"Models: {models}")
    return models
46
+
47
+
48
+ get_window_url_params = """
49
+ function() {
50
+ const params = new URLSearchParams(window.location.search);
51
+ url_params = Object.fromEntries(params);
52
+ console.log(url_params);
53
+ return url_params;
54
+ }
55
+ """
56
+
57
+
58
def load_demo(url_params, request: gr.Request):
    """Build the initial state and model-dropdown update for a page load.

    A ``model`` URL parameter preselects that model when it is one of the
    known ``models``.
    """
    logger.info(f"load_demo. ip: {request.client.host}. params: {url_params}")

    dropdown_kwargs = {"visible": True}
    if "model" in url_params and url_params["model"] in models:
        dropdown_kwargs["value"] = url_params["model"]
    dropdown_update = gr.Dropdown.update(**dropdown_kwargs)

    return default_conversation.copy(), dropdown_update
70
+
71
+
72
def load_demo_refresh_model_list(request: gr.Request):
    """Page-load handler that re-queries the model list every time."""
    logger.info(f"load_demo. ip: {request.client.host}")
    available = get_model_list()
    fresh_state = default_conversation.copy()
    # Preselect the first model, or an empty value when there is none.
    preselected = available[0] if available else ""
    return fresh_state, gr.Dropdown.update(choices=available, value=preselected)
81
+
82
+
83
def vote_last_response(state, vote_type, model_selector, request: gr.Request):
    """Append one JSON line describing a vote to today's conversation log."""
    with open(get_conv_log_filename(), "a") as log_file:
        record = dict(
            tstamp=round(time.time(), 4),
            type=vote_type,
            model=model_selector,
            state=state.dict(),
            ip=request.client.host,
        )
        log_file.write(json.dumps(record) + "\n")
93
+
94
+
95
def upvote_last_response(state, model_selector, request: gr.Request):
    """Log an upvote, then clear the textbox and disable the vote buttons."""
    logger.info(f"upvote. ip: {request.client.host}")
    vote_last_response(state, "upvote", model_selector, request)
    return ("", disable_btn, disable_btn, disable_btn)


def downvote_last_response(state, model_selector, request: gr.Request):
    """Log a downvote, then clear the textbox and disable the vote buttons."""
    logger.info(f"downvote. ip: {request.client.host}")
    vote_last_response(state, "downvote", model_selector, request)
    return ("", disable_btn, disable_btn, disable_btn)


def flag_last_response(state, model_selector, request: gr.Request):
    """Log a flag, then clear the textbox and disable the vote buttons."""
    logger.info(f"flag. ip: {request.client.host}")
    vote_last_response(state, "flag", model_selector, request)
    return ("", disable_btn, disable_btn, disable_btn)
111
+
112
+
113
def regenerate(state, image_process_mode, request: gr.Request):
    """Clear the last answer so that it can be generated again.

    If the preceding human message carries media (a tuple/list payload), its
    third element is replaced with the current ``image_process_mode``.
    """
    logger.info(f"regenerate. ip: {request.client.host}")
    state.messages[-1][-1] = None
    prev_human_msg = state.messages[-2]
    # isinstance rather than an exact type() match, so tuple/list subclasses
    # are recognised as media payloads too.
    if isinstance(prev_human_msg[1], (tuple, list)):
        prev_human_msg[1] = (*prev_human_msg[1][:2], image_process_mode)
    state.skip_next = False
    # (state, chatbot, textbox, imagebox, videobox, upvote, downvote, flag, generate, clear)
    return (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn,) * 5
122
+
123
+
124
def clear_history(request: gr.Request):
    """Start over with a fresh conversation and emptied input widgets."""
    logger.info(f"clear_history. ip: {request.client.host}")
    fresh = default_conversation.copy()
    locked_buttons = (disable_btn,) * 5
    # (state, chatbot, textbox, imagebox, videobox, upvote, downvote, flag, generate, clear)
    return (fresh, fresh.to_gradio_chatbot(), "", None, None, *locked_buttons)
129
+
130
+
131
def add_text_ori(state, text, image, video, image_process_mode, request: gr.Request):
    """Earlier variant of ``add_text``: append a user turn to ``state``.

    Every return value follows the layout
    (state, chatbot, textbox, imagebox, videobox, upvote, downvote, flag,
    generate, clear), i.e. ten items.

    Raises:
        AssertionError: if both ``image`` and ``video`` are supplied.
    """
    # note: imagebox itself is PIL object while videobox is filepath
    logger.info(f"add_text. ip: {request.client.host}. len: {len(text)}")
    if len(text) <= 0 and image is None:
        state.skip_next = True
        # Ten items: the videobox slot was previously missing here.
        return (state, state.to_gradio_chatbot(), "", None, None) + (no_change_btn,) * 5
    if args.moderate:
        flagged = violates_moderation(text)
        if flagged:
            state.skip_next = True
            # Ten items: the videobox slot was previously missing here.
            return (state, state.to_gradio_chatbot(), moderation_msg, None, None) + (
                no_change_btn,) * 5
    assert image is None or video is None, "Please don't feed image and video inputs at the same time!!!"
    text = text[:1536]  # Hard cut-off
    if image is not None:
        # here image is the PIL object itself
        text = text[:1200]  # Hard cut-off for images
        if '<image>' not in text:
            text = text + '\n<image>'
        text = (text, image, image_process_mode)
        # A conversation that already holds an image is restarted.
        if len(state.get_images(return_pil=True)) > 0:
            state = default_conversation.copy()
        state.modality = "image"
    if video is not None:
        print("Video box:", video)
        # here video is the file path of video
        text = text[:1200]  # Hard cut-off for videos
        if '<video>' not in text:
            text = text + '\n<video>'
        text = (text, video, image_process_mode)
        # A conversation that already holds a video is restarted.
        if len(state.get_videos(return_pil=True)) > 0:
            state = default_conversation.copy()
        state.modality = "video"
        print("Set modality as video...")
    state.append_message(state.roles[0], text)
    state.append_message(state.roles[1], None)
    state.skip_next = False
    # (state, chatbot, textbox, imagebox, videobox, upvote, downvote, flag, generate, clear)
    return (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn,) * 5
172
+
173
+
174
def add_text(state, text, image, video, image_process_mode, request: gr.Request):
    """Append the user's turn (text plus optional image or video) to ``state``.

    A new image or video restarts the conversation. When both are supplied
    the video takes precedence. Every return value follows the layout
    (state, chatbot, textbox, imagebox, videobox, upvote, downvote, flag,
    generate, clear), i.e. ten items.
    """
    logger.info(f"add_text. ip: {request.client.host}. len: {len(text)}")

    # if input is new video or image, reset the state
    if image is not None or video is not None:
        state = default_conversation.copy()

    if not text and image is None and video is None:
        state.skip_next = True
        return (state, state.to_gradio_chatbot(), "", None, None) + (no_change_btn,) * 5

    if args.moderate:
        flagged = violates_moderation(text)
        if flagged:
            state.skip_next = True
            # Ten items: the videobox slot was previously missing here.
            return (state, state.to_gradio_chatbot(), moderation_msg, None, None) + (no_change_btn,) * 5

    if video is not None:
        # process the input video
        text = text[:1200]  # Hard cut-off when media is attached
        if '<video>' not in text:
            text = text + '\n<video>'
        text = (text, video, image_process_mode)
        state.modality = "video"
    elif image is not None:
        # process the input image
        text = text[:1200]  # Hard cut-off when media is attached
        if '<image>' not in text:
            text = text + '\n<image>'
        text = (text, image, image_process_mode)
        state.modality = "image"
    else:
        # Text-only turn; text is non-empty here because of the guard above.
        # The cut-off applies to every such turn, not only the first
        # follow-up after the media message.
        text = text[:1536]  # Hard cut-off
        if state.modality == "image":
            state.modality = "image_text"
        elif state.modality == "video":
            state.modality = "video_text"

    state.append_message(state.roles[0], text)
    state.append_message(state.roles[1], None)
    state.skip_next = False

    return (state, state.to_gradio_chatbot(), "", None, None) + (disable_btn,) * 5
217
+
218
+
219
def http_bot(state, model_selector, temperature, top_p, max_new_tokens, request: gr.Request):
    """Stream the model's answer for the pending turn in ``state``.

    Generator used as a Gradio callback: each yielded tuple is
    (state, chatbot, upvote, downvote, flag, regenerate, clear). It asks the
    controller for a worker, saves the input frames under LOGDIR, streams the
    worker output into the last message, and appends a record to the
    conversation log on success.
    """
    logger.info(f"http_bot. ip: {request.client.host}")
    start_tstamp = time.time()
    model_name = model_selector

    if state.skip_next:
        # This generate call is skipped due to invalid inputs
        yield (state, state.to_gradio_chatbot()) + (no_change_btn,) * 5
        return

    if len(state.messages) == state.offset + 2:
        # First round of conversation.
        # The template is fixed: the earlier model-name-based selection was
        # always overwritten with this value, so that dead chain is dropped.
        template_name = "llava_v1"
        new_state = conv_templates[template_name].copy()
        new_state.append_message(new_state.roles[0], state.messages[-2][1])
        new_state.append_message(new_state.roles[1], None)
        new_state.modality = state.modality
        state = new_state

    # Query worker address
    controller_url = args.controller_url
    ret = requests.post(controller_url + "/get_worker_address",
                        json={"model": model_name})
    worker_addr = ret.json()["address"]
    logger.info(f"model_name: {model_name}, worker_addr: {worker_addr}")

    # No available worker
    if worker_addr == "":
        state.messages[-1][-1] = server_error_msg
        yield (state, state.to_gradio_chatbot(), disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
        return

    # Construct prompt
    prompt = state.get_prompt()
    is_image = state.modality == "image" or state.modality == "image_text"
    is_video = state.modality == "video" or state.modality == "video_text"
    # Default to no frames so any other modality does not hit an unbound name.
    all_images = []
    if is_image:
        all_images = state.get_images(return_pil=True)  # return PIL.Image object
    elif is_video:
        all_images = state.get_videos(return_pil=True)  # return video frames where each frame is a PIL.Image object
    all_image_hash = [hashlib.md5(image.tobytes()).hexdigest() for image in all_images]
    for idx, (image, digest) in enumerate(zip(all_images, all_image_hash)):
        t = datetime.datetime.now()
        day_dir = f"{t.year}-{t.month:02d}-{t.day:02d}"
        if is_image:
            filename = os.path.join(LOGDIR, "serve_images", day_dir, f"{digest}.jpg")
        else:
            # Frames exist only for image or video modality, so this is video.
            filename = os.path.join(LOGDIR, "serve_videos", day_dir, f"{digest}_{idx}.jpg")
        if not os.path.isfile(filename):
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            image.save(filename)

    # Make requests
    pload = {
        "model": model_name,
        "prompt": prompt,
        "temperature": float(temperature),
        "top_p": float(top_p),
        "max_new_tokens": min(int(max_new_tokens), 1536),
        "stop": state.sep if state.sep_style in [SeparatorStyle.SINGLE] else state.sep2,
        # Short summary for the log line below; the real payload is set after logging.
        "images": f'List of {len(all_image_hash)} images: {all_image_hash}',
    }
    logger.info(f"==== request ====\n{pload}")

    if is_image:
        pload['images'] = state.get_images()
    elif is_video:
        pload['images'] = state.get_videos()

    state.messages[-1][-1] = "▌"
    yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5

    # Stays empty if the worker closes the stream without sending a chunk.
    output = ""
    try:
        # Stream output
        response = requests.post(worker_addr + "/worker_generate_stream",
                                 headers=headers, json=pload, stream=True, timeout=10)
        for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"):
            if chunk:
                data = json.loads(chunk.decode())
                if data["error_code"] == 0:
                    # Drop the leading prompt-length prefix from the returned text.
                    output = data["text"][len(prompt):].strip()
                    state.messages[-1][-1] = output + "▌"
                    yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
                else:
                    output = data["text"] + f" (error_code: {data['error_code']})"
                    state.messages[-1][-1] = output
                    yield (state, state.to_gradio_chatbot()) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
                    return
                time.sleep(0.03)
    except requests.exceptions.RequestException:
        state.messages[-1][-1] = server_error_msg
        yield (state, state.to_gradio_chatbot()) + (disable_btn, disable_btn, disable_btn, enable_btn, enable_btn)
        return

    # Remove the trailing cursor character.
    state.messages[-1][-1] = state.messages[-1][-1][:-1]
    yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 5

    finish_tstamp = time.time()
    logger.info(f"{output}")

    with open(get_conv_log_filename(), "a") as fout:
        data = {
            "tstamp": round(finish_tstamp, 4),
            "type": "chat",
            "model": model_name,
            "start": round(start_tstamp, 4),
            # Was logged as the start time, which made every duration zero.
            "finish": round(finish_tstamp, 4),
            "images": all_image_hash,
            "ip": request.client.host,
        }
        fout.write(json.dumps(data) + "\n")
350
+
351
# Page heading rendered at the top of the demo (hidden in embed mode).
title_markdown = ("""
# The public release of VideoLLaMA2
""")
354
+
355
+ tos_markdown = ("""
356
+ ### Terms of use
357
+ By using this service, users are required to agree to the following terms:
358
+ The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. The service may collect user dialogue data for future research.
359
+ Please click the "Flag" button if you get any inappropriate answer! We will collect those to keep improving our moderator.
360
+ For an optimal experience, please use desktop computers for this demo, as mobile devices may compromise its quality.
361
+ """)
362
+
363
+
364
+ learn_more_markdown = ("""
365
+ ### License
366
+ The service is a research preview intended for non-commercial use only, subject to the model [License](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) of LLaMA, [Terms of Use](https://openai.com/policies/terms-of-use) of the data generated by OpenAI, and [Privacy Practices](https://chrome.google.com/webstore/detail/sharegpt-share-your-chatg/daiacboceoaocpibfodeljbdfacokfjb) of ShareGPT. Please contact us if you find any potential violation.
367
+ """)
368
+
369
+ block_css = """
370
+
371
+ #buttons button {
372
+ min-width: min(120px,100%);
373
+ }
374
+
375
+ """
376
+
377
def build_demo(embed_mode):
    """Assemble the Gradio Blocks UI and wire its event listeners.

    Args:
        embed_mode: when true, the title, terms-of-use and license markdown
            sections are omitted.

    Returns:
        The constructed ``gr.Blocks`` app (not yet queued or launched).

    Raises:
        ValueError: if ``args.model_list_mode`` is neither "once" nor "reload".
    """
    # Created outside the layout so the example widgets can reference it;
    # it is placed on the page by textbox.render() below.
    textbox = gr.Textbox(show_label=False, placeholder="Enter text and press ENTER", container=False)
    with gr.Blocks(title="Video-Llama", theme=gr.themes.Default(), css=block_css) as demo:
        state = gr.State()

        if not embed_mode:
            gr.Markdown(title_markdown)

        with gr.Row():
            # Left column: model choice, media inputs, examples, parameters.
            with gr.Column(scale=3):
                with gr.Row(elem_id="model_selector_row"):
                    model_selector = gr.Dropdown(
                        choices=models,
                        value=models[0] if len(models) > 0 else "",
                        interactive=True,
                        show_label=False,
                        container=False)

                imagebox = gr.Image(type="pil")
                videobox = gr.Video()
                image_process_mode = gr.Radio(
                    ["Crop", "Resize", "Pad", "Default"],
                    value="Default",
                    label="Preprocess for non-square image", visible=False)

                cur_dir = os.path.dirname(os.path.abspath(__file__))
                gr.Examples(examples=[
                    [f"{cur_dir}/examples/extreme_ironing.jpg", "What is unusual about this image?"],
                    [f"{cur_dir}/examples/waterview.jpg", "What are the things I should be cautious about when I visit here?"],
                    [f"{cur_dir}/examples/desert.jpg", "If there are factual errors in the questions, point it out; if not, proceed answering the question. What’s happening in the desert?"],
                ], inputs=[imagebox, textbox], label="Image examples")

                # video example inputs
                gr.Examples(examples=[
                    [f"{cur_dir}/examples/sample_demo_1.mp4", "Why is this video funny?"],
                    [f"{cur_dir}/examples/sample_demo_3.mp4", "Can you identify any safety hazards in this video?"],
                    [f"{cur_dir}/examples/1034346401.mp4", "What is this young woman doing?"]
                ], inputs=[videobox, textbox], label="Video examples")
                #[f"{cur_dir}/examples/sample_demo_9.mp4", "Describe the video in detail and please do not generate repetitive content."]

                with gr.Accordion("Parameters", open=False) as parameter_row:
                    temperature = gr.Slider(minimum=0.0, maximum=1.0, value=0.2, step=0.1, interactive=True, label="Temperature",)
                    top_p = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, step=0.1, interactive=True, label="Top P",)
                    max_output_tokens = gr.Slider(minimum=0, maximum=1024, value=512, step=64, interactive=True, label="Max output tokens",)

            # Right column: chat transcript, text input and action buttons.
            with gr.Column(scale=8):
                chatbot = gr.Chatbot(elem_id="chatbot", label="Videollama2 Chatbot", height=550)
                with gr.Row():
                    with gr.Column(scale=8):
                        textbox.render()
                    with gr.Column(scale=1, min_width=50):
                        submit_btn = gr.Button(value="Send", variant="primary")
                with gr.Row(elem_id="buttons") as button_row:
                    upvote_btn = gr.Button(value="👍 Upvote", interactive=False)
                    downvote_btn = gr.Button(value="👎 Downvote", interactive=False)
                    flag_btn = gr.Button(value="⚠️ Flag", interactive=False)
                    #stop_btn = gr.Button(value="⏹️ Stop Generation", interactive=False)
                    regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False)
                    clear_btn = gr.Button(value="🗑️ Clear", interactive=False)

        if not embed_mode:
            gr.Markdown(tos_markdown)
            gr.Markdown(learn_more_markdown)
        url_params = gr.JSON(visible=False)

        # Register listeners
        # The order of btn_list fixes the order of the five button updates
        # that the callbacks return after (state, chatbot, ...).
        btn_list = [upvote_btn, downvote_btn, flag_btn, regenerate_btn, clear_btn]
        upvote_btn.click(upvote_last_response,
            [state, model_selector], [textbox, upvote_btn, downvote_btn, flag_btn])
        downvote_btn.click(downvote_last_response,
            [state, model_selector], [textbox, upvote_btn, downvote_btn, flag_btn])
        flag_btn.click(flag_last_response,
            [state, model_selector], [textbox, upvote_btn, downvote_btn, flag_btn])
        regenerate_btn.click(regenerate, [state, image_process_mode],
            [state, chatbot, textbox, imagebox, videobox] + btn_list).then(
            http_bot, [state, model_selector, temperature, top_p, max_output_tokens],
            [state, chatbot] + btn_list)
        clear_btn.click(clear_history, None, [state, chatbot, textbox, imagebox, videobox] + btn_list)

        # Enter in the textbox and the Send button do the same thing:
        # add_text records the turn, then http_bot streams the answer.
        textbox.submit(add_text, [state, textbox, imagebox, videobox, image_process_mode], [state, chatbot, textbox, imagebox, videobox] + btn_list
            ).then(http_bot, [state, model_selector, temperature, top_p, max_output_tokens],
                   [state, chatbot] + btn_list)
        submit_btn.click(add_text, [state, textbox, imagebox, videobox, image_process_mode], [state, chatbot, textbox, imagebox, videobox] + btn_list
            ).then(http_bot, [state, model_selector, temperature, top_p, max_output_tokens],
                   [state, chatbot] + btn_list)

        if args.model_list_mode == "once":
            # Use the model list fetched at start-up; URL params are read client-side.
            demo.load(load_demo, [url_params], [state, model_selector],
                _js=get_window_url_params)
        elif args.model_list_mode == "reload":
            # Re-query the controller for models on every page load.
            demo.load(load_demo_refresh_model_list, None, [state, model_selector])
        else:
            raise ValueError(f"Unknown model list mode: {args.model_list_mode}")

    return demo
472
+
473
+
474
if __name__ == "__main__":
    # Script entry point: parse options, fetch the model list, launch the UI.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--host", type=str, default="0.0.0.0")
    arg_parser.add_argument("--port", type=int)
    arg_parser.add_argument("--controller-url", type=str, default="http://localhost:21001")
    arg_parser.add_argument("--concurrency-count", type=int, default=10)
    arg_parser.add_argument(
        "--model-list-mode", type=str, default="once", choices=["once", "reload"])
    for switch in ("--share", "--moderate", "--embed"):
        arg_parser.add_argument(switch, action="store_true")
    # args and models are read as module-level globals by the functions above.
    args = arg_parser.parse_args()
    logger.info(f"args: {args}")

    models = get_model_list()

    logger.info(args)
    demo = build_demo(args.embed)
    queued_demo = demo.queue(
        concurrency_count=args.concurrency_count,
        api_open=False,
    )
    queued_demo.launch(
        server_name=args.host,
        server_port=args.port,
        share=args.share,
    )
videollama2/serve/gradio_web_server_adhoc.py ADDED
@@ -0,0 +1,312 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spaces
2
+
3
+ import os
4
+ import re
5
+
6
+ import torch
7
+ import gradio as gr
8
+
9
+ import sys
10
+ sys.path.append('./')
11
+ from videollama2 import model_init, mm_infer
12
+ from videollama2.utils import disable_torch_init
13
+
14
+
15
+ title_markdown = ("""
16
+ <div style="display: flex; justify-content: center; align-items: center; text-align: center;">
17
+ <a href="https://github.com/DAMO-NLP-SG/VideoLLaMA2" style="margin-right: 20px; text-decoration: none; display: flex; align-items: center;">
18
+ <img src="https://s2.loli.net/2024/06/03/D3NeXHWy5az9tmT.png" alt="VideoLLaMA 2 🔥🚀🔥" style="max-width: 120px; height: auto;">
19
+ </a>
20
+ <div>
21
+ <h1 >VideoLLaMA 2: Advancing Spatial-Temporal Modeling and Audio Understanding in Video-LLMs</h1>
22
+ <h5 style="margin: 0;">If this demo please you, please give us a star ⭐ on Github or 💖 on this space.</h5>
23
+ </div>
24
+ </div>
25
+
26
+
27
+ <div align="center">
28
+ <div style="display:flex; gap: 0.25rem; margin-top: 10px;" align="center">
29
+ <a href="https://github.com/DAMO-NLP-SG/VideoLLaMA2"><img src='https://img.shields.io/badge/Github-VideoLLaMA2-9C276A'></a>
30
+ <a href="https://arxiv.org/pdf/2406.07476.pdf"><img src="https://img.shields.io/badge/Arxiv-2406.07476-AD1C18"></a>
31
+ <a href="https://github.com/DAMO-NLP-SG/VideoLLaMA2/stargazers"><img src="https://img.shields.io/github/stars/DAMO-NLP-SG/VideoLLaMA2.svg?style=social"></a>
32
+ </div>
33
+ </div>
34
+ """)
35
+
36
+
37
+ block_css = """
38
+ #buttons button {
39
+ min-width: min(120px,100%);
40
+ color: #9C276A
41
+ }
42
+ """
43
+
44
+
45
+ tos_markdown = ("""
46
+ ### Terms of use
47
+ By using this service, users are required to agree to the following terms:
48
+ The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. The service may collect user dialogue data for future research.
49
+ Please click the "Flag" button if you get any inappropriate answer! We will collect those to keep improving our moderator.
50
+ For an optimal experience, please use desktop computers for this demo, as mobile devices may compromise its quality.
51
+ """)
52
+
53
+
54
+ learn_more_markdown = ("""
55
+ ### License
56
+ This project is released under the Apache 2.0 license as found in the LICENSE file. The service is a research preview intended for non-commercial use ONLY, subject to the model Licenses of LLaMA and Mistral, Terms of Use of the data generated by OpenAI, and Privacy Practices of ShareGPT. Please get in touch with us if you find any potential violations.
57
+ """)
58
+
59
+
60
+ plum_color = gr.themes.colors.Color(
61
+ name='plum',
62
+ c50='#F8E4EF',
63
+ c100='#E9D0DE',
64
+ c200='#DABCCD',
65
+ c300='#CBA8BC',
66
+ c400='#BC94AB',
67
+ c500='#AD809A',
68
+ c600='#9E6C89',
69
+ c700='#8F5878',
70
+ c800='#804467',
71
+ c900='#713056',
72
+ c950='#662647',
73
+ )
74
+
75
+
76
class Chat:
    """Hold a loaded VideoLLaMA 2 model and answer a single media item per call."""

    def __init__(self, model_path, load_8bit=False, load_4bit=False):
        """Load the model, processor and tokenizer for ``model_path`` via ``model_init``.

        ``load_8bit`` / ``load_4bit`` are forwarded unchanged to ``model_init``.
        """
        disable_torch_init()

        loaded = model_init(model_path, load_8bit=load_8bit, load_4bit=load_4bit)
        self.model, self.processor, self.tokenizer = loaded

    @spaces.GPU(duration=120)
    @torch.inference_mode()
    def generate(self, data: list, message, temperature, top_p, max_output_tokens):
        """Run ``mm_infer`` on the single ``(tensor, modal_tag)`` pair in ``data``.

        ``message`` is the conversation passed to ``mm_infer``; sampling is
        enabled only when ``temperature`` is greater than zero. Returns whatever
        ``mm_infer`` returns.
        """
        # TODO: support multiple turns of conversation.
        assert len(data) == 1

        media, modal_tag = data[0]
        sampling = dict(
            do_sample=bool(temperature > 0.0),
            temperature=temperature,
            top_p=top_p,
            max_new_tokens=max_output_tokens,
        )
        # The tag arrives as e.g. '<image>'; strip the angle brackets for mm_infer.
        return mm_infer(media, message, self.model, self.tokenizer,
                        modal=modal_tag.strip('<>'), **sampling)
97
+
98
+
99
@spaces.GPU(duration=120)
def generate(image, video, message, chatbot, textbox_in, temperature, top_p, max_output_tokens, dtype=torch.float16):
    """Answer the textbox prompt about the current image/video and update the chat.

    Args:
        image: File path of the input image, or None.
        video: File path of the input video, or None.
        message: List of ``{'role', 'content'}`` dicts (mutated in place).
        chatbot: List of ``[user_html, answer]`` turns shown in the UI (mutated in place).
        textbox_in: The user's prompt text.
        temperature, top_p, max_output_tokens: Sampling parameters forwarded
            to ``handler.generate``.
        dtype: Torch dtype the preprocessed media tensor is cast to.

    Returns:
        A 4-tuple ``(image_update, video_update, message, chatbot)`` matching
        the outputs the click handlers bind to. If preprocessing fails, both
        media inputs are reset to None and the histories are returned unchanged.
    """
    # ``traceback`` is not imported at module level, so import it here; the
    # original error path raised NameError instead of printing the traceback.
    import traceback

    data = []

    processor = handler.processor
    try:
        # Check the unsupported combination first: previously this branch was
        # unreachable and a request with both inputs silently used the image only.
        if image is not None and video is not None:
            raise NotImplementedError("Not support image and video at the same time")
        elif image is not None:
            data.append((processor['image'](image).to(handler.model.device, dtype=dtype), '<image>'))
        elif video is not None:
            data.append((processor['video'](video).to(handler.model.device, dtype=dtype), '<video>'))
        else:
            data.append((None, '<text>'))
    except Exception:
        traceback.print_exc()
        return gr.update(value=None, interactive=True), gr.update(value=None, interactive=True), message, chatbot

    assert len(message) % 2 == 0, "The message should be a pair of user and system message."

    # HTML snippet that embeds the uploaded media in the user's chat bubble.
    show_images = ""
    if image is not None:
        show_images += f'<img src="./file={image}" style="display: inline-block;width: 250px;max-height: 400px;">'
    if video is not None:
        show_images += f'<video controls playsinline width="500" style="display: inline-block;" src="./file={video}"></video>'

    one_turn_chat = [textbox_in, None]

    # 1. first run case
    if len(chatbot) == 0:
        one_turn_chat[0] += "\n" + show_images
    # 2. not first run case
    else:
        # Recover the media attached to the first turn of the current conversation.
        previous_image = re.findall(r'<img src="./file=(.+?)"', chatbot[0][0])
        previous_video = re.findall(r'<video controls playsinline width="500" style="display: inline-block;" src="./file=(.+?)"', chatbot[0][0])
        if len(previous_image) > 0:
            previous_image = previous_image[0]
            # 2.1 a different image starts a new conversation
            if image is not None and os.path.basename(previous_image) != os.path.basename(image):
                message.clear()
                one_turn_chat[0] += "\n" + show_images
        elif len(previous_video) > 0:
            previous_video = previous_video[0]
            # 2.2 a different video starts a new conversation
            if video is not None and os.path.basename(previous_video) != os.path.basename(video):
                message.clear()
                one_turn_chat[0] += "\n" + show_images

    message.append({'role': 'user', 'content': textbox_in})
    text_en_out = handler.generate(data, message, temperature=temperature, top_p=top_p, max_output_tokens=max_output_tokens)
    message.append({'role': 'assistant', 'content': text_en_out})

    one_turn_chat[1] = text_en_out
    chatbot.append(one_turn_chat)

    return gr.update(value=image, interactive=True), gr.update(value=video, interactive=True), message, chatbot
155
+
156
+
157
def regenerate(message, chatbot):
    """Drop the most recent exchange so that it can be generated again.

    Removes the trailing user/assistant pair from ``message`` and the last
    turn from ``chatbot``, both in place. Unlike the original, an empty (or
    one-entry) history is tolerated instead of raising ``IndexError``.

    Returns:
        The same ``(message, chatbot)`` objects after trimming.
    """
    # Slice deletion removes up to two trailing entries and is a no-op on [].
    del message[-2:]
    if chatbot:
        chatbot.pop()
    return message, chatbot
161
+
162
+
163
def clear_history(message, chatbot):
    """Empty both histories in place and reset the media and text inputs.

    Returns a 5-tuple: two cleared component updates, the emptied ``message``
    and ``chatbot`` lists, and a third cleared component update.
    """
    for history in (message, chatbot):
        history.clear()

    def _blank():
        # A fresh update object that clears a component and keeps it editable.
        return gr.update(value=None, interactive=True)

    first = _blank()
    second = _blank()
    last = _blank()
    return (first, second, message, chatbot, last)
169
+
170
+
171
+ # BUG of Zero Environment
172
+ # 1. The environment is fixed to torch>=2.0,<=2.2, gradio>=4.x.x
173
+ # 2. The operation or tensor which requires cuda are limited in those functions wrapped via spaces.GPU
174
+ # 3. The function can't return tensor or other cuda objects.
175
+
176
+ model_path = 'DAMO-NLP-SG/VideoLLaMA2-7B-16F'
177
+
178
+ handler = Chat(model_path, load_8bit=False, load_4bit=True)
179
+
180
+ textbox = gr.Textbox(show_label=False, placeholder="Enter text and press ENTER", container=False)
181
+
182
+ theme = gr.themes.Default(primary_hue=plum_color)
183
+ # theme.update_color("primary", plum_color.c500)
184
+ theme.set(slider_color="#9C276A")
185
+ theme.set(block_title_text_color="#9C276A")
186
+ theme.set(block_label_text_color="#9C276A")
187
+ theme.set(button_primary_text_color="#9C276A")
188
+ # theme.set(button_secondary_text_color="*neutral_800")
189
+
190
+
191
+ with gr.Blocks(title='VideoLLaMA 2 🔥🚀🔥', theme=theme, css=block_css) as demo:
192
+ gr.Markdown(title_markdown)
193
+ message = gr.State([])
194
+
195
+ with gr.Row():
196
+ with gr.Column(scale=3):
197
+ image = gr.Image(label="Input Image", type="filepath")
198
+ video = gr.Video(label="Input Video")
199
+
200
+ with gr.Accordion("Parameters", open=True) as parameter_row:
201
+ # num_beams = gr.Slider(
202
+ # minimum=1,
203
+ # maximum=10,
204
+ # value=1,
205
+ # step=1,
206
+ # interactive=True,
207
+ # label="beam search numbers",
208
+ # )
209
+
210
+ temperature = gr.Slider(
211
+ minimum=0.1,
212
+ maximum=1.0,
213
+ value=0.2,
214
+ step=0.1,
215
+ interactive=True,
216
+ label="Temperature",
217
+ )
218
+
219
+ top_p = gr.Slider(
220
+ minimum=0.0,
221
+ maximum=1.0,
222
+ value=0.7,
223
+ step=0.1,
224
+ interactive=True,
225
+ label="Top P",
226
+ )
227
+
228
+ max_output_tokens = gr.Slider(
229
+ minimum=64,
230
+ maximum=1024,
231
+ value=512,
232
+ step=64,
233
+ interactive=True,
234
+ label="Max output tokens",
235
+ )
236
+
237
+ with gr.Column(scale=7):
238
+ chatbot = gr.Chatbot(label="VideoLLaMA 2", bubble_full_width=True, height=750)
239
+ with gr.Row():
240
+ with gr.Column(scale=8):
241
+ textbox.render()
242
+ with gr.Column(scale=1, min_width=50):
243
+ submit_btn = gr.Button(value="Send", variant="primary", interactive=True)
244
+ with gr.Row(elem_id="buttons") as button_row:
245
+ upvote_btn = gr.Button(value="👍 Upvote", interactive=True)
246
+ downvote_btn = gr.Button(value="👎 Downvote", interactive=True)
247
+ # flag_btn = gr.Button(value="⚠️ Flag", interactive=True)
248
+ # stop_btn = gr.Button(value="⏹️ Stop Generation", interactive=False)
249
+ regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=True)
250
+ clear_btn = gr.Button(value="🗑️ Clear history", interactive=True)
251
+
252
+ with gr.Row():
253
+ with gr.Column():
254
+ cur_dir = os.path.dirname(os.path.abspath(__file__))
255
+ gr.Examples(
256
+ examples=[
257
+ [
258
+ f"{cur_dir}/examples/extreme_ironing.jpg",
259
+ "What happens in this image?",
260
+ ],
261
+ [
262
+ f"{cur_dir}/examples/waterview.jpg",
263
+ "What are the things I should be cautious about when I visit here?",
264
+ ],
265
+ [
266
+ f"{cur_dir}/examples/desert.jpg",
267
+ "If there are factual errors in the questions, point it out; if not, proceed answering the question. What’s happening in the desert?",
268
+ ],
269
+ ],
270
+ inputs=[image, textbox],
271
+ )
272
+ with gr.Column():
273
+ gr.Examples(
274
+ examples=[
275
+ [
276
+ f"{cur_dir}/../../assets/cat_and_chicken.mp4",
277
+ "What happens in this video?",
278
+ ],
279
+ [
280
+ f"{cur_dir}/../../assets/sora.mp4",
281
+ "Please describe this video.",
282
+ ],
283
+ [
284
+ f"{cur_dir}/examples/sample_demo_1.mp4",
285
+ "What does the baby do?",
286
+ ],
287
+ ],
288
+ inputs=[video, textbox],
289
+ )
290
+
291
+ gr.Markdown(tos_markdown)
292
+ gr.Markdown(learn_more_markdown)
293
+
294
+ submit_btn.click(
295
+ generate,
296
+ [image, video, message, chatbot, textbox, temperature, top_p, max_output_tokens],
297
+ [image, video, message, chatbot])
298
+
299
+ regenerate_btn.click(
300
+ regenerate,
301
+ [message, chatbot],
302
+ [message, chatbot]).then(
303
+ generate,
304
+ [image, video, message, chatbot, textbox, temperature, top_p, max_output_tokens],
305
+ [image, video, message, chatbot])
306
+
307
+ clear_btn.click(
308
+ clear_history,
309
+ [message, chatbot],
310
+ [image, video, message, chatbot, textbox])
311
+
312
+ demo.launch()
videollama2/serve/model_worker.py ADDED
@@ -0,0 +1,397 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A model worker executes the model.
3
+ """
4
+ import os
5
+ import json
6
+ import time
7
+ import uuid
8
+ import asyncio
9
+ import requests
10
+ import argparse
11
+ import threading
12
+ from threading import Thread
13
+ from functools import partial
14
+ from typing import Iterator, List, Optional, Tuple
15
+
16
+ import uvicorn
17
+ from fastapi import FastAPI, Request, BackgroundTasks
18
+ from fastapi.responses import StreamingResponse
19
+
20
+ import torch
21
+ import decord
22
+ import numpy as np
23
+ from PIL import Image
24
+ from decord import VideoReader, cpu
25
+ from transformers import TextIteratorStreamer
26
+
27
+ from videollama2.constants import WORKER_HEART_BEAT_INTERVAL
28
+ from videollama2.utils import (build_logger, server_error_msg, pretty_print_semaphore)
29
+ from videollama2.model.builder import load_pretrained_model
30
+ from videollama2.mm_utils import process_images, process_videos, load_image_from_base64, tokenizer_image_token, KeywordsStoppingCriteria, tokenizer_MMODAL_token
31
+ from videollama2.mm_utils import chunk_list, frame_expansion
32
+ from videollama2.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_VIDEO_TOKEN, NUM_FRAMES, MMODAL_TOKEN_INDEX
33
+
34
+
35
+ GB = 1 << 30
36
+
37
+ worker_id = str(uuid.uuid4())[:6]
38
+ logger = build_logger("model_worker", f"model_worker_{worker_id}.log")
39
+ global_counter = 0
40
+
41
+ model_semaphore = None
42
+
43
+
44
+ # variable_content = os.getenv('MY_VARIABLE', '')
45
+ # KEYWORDS_LIST = set(variable_content.split('\n'))
46
def load_keywords(path):
    """Read blocked keywords from ``path``, one per line.

    Lines are stripped and lower-cased (the safety checks match against
    ``text.lower()``), and blank lines are skipped: an empty keyword is a
    substring of every text and would block every request.

    Returns:
        The list of keywords, or an empty list when ``path`` does not exist.
    """
    if not os.path.exists(path):
        return []
    with open(path, 'r', encoding='utf-8') as file:
        cleaned = (line.strip().lower() for line in file)
        return [keyword for keyword in cleaned if keyword]


path = 'assets/keywords.txt'
KEYWORDS_LIST = load_keywords(path)
55
+
56
+
57
+ KEYWORD_BLOCK_MESSAGE2 = "The output contains political, erotic and other unsafe content that violates local laws. Please re-enter your question."
58
+ KEYWORD_BLOCK_MESSAGE1 = "Your input question contains political, erotic and other unsafe content that violates local laws. Please re-enter your question."
59
+ STREAM_CHECK_MULTIPLE = 20
60
+
61
+
62
def heart_beat_worker(controller):
    """Call ``controller.send_heart_beat()`` forever, sleeping
    ``WORKER_HEART_BEAT_INTERVAL`` seconds before each call.

    Never returns; intended to be run as a thread target.
    """

    while True:
        time.sleep(WORKER_HEART_BEAT_INTERVAL)
        controller.send_heart_beat()
67
+
68
+
69
def safety_check(text, history=None, ) -> Optional[str]:
    """Check generated ``text`` against the blocked-keyword list.

    Returns ``KEYWORD_BLOCK_MESSAGE2`` when any entry of ``KEYWORDS_LIST``
    occurs in the lower-cased text, otherwise ``None``. ``history`` is
    accepted but not used.
    """
    if not KEYWORDS_LIST:
        return None

    lowered = text.lower()
    for keyword in KEYWORDS_LIST:
        if keyword in lowered:
            print('############')
            return KEYWORD_BLOCK_MESSAGE2

    return None
76
+
77
+
78
def input_safety_check(text) -> Optional[str]:
    """Check the user's input ``text`` against the blocked-keyword list.

    Returns ``KEYWORD_BLOCK_MESSAGE1`` when any entry of ``KEYWORDS_LIST``
    occurs in the lower-cased text, otherwise ``None``.
    """
    if not KEYWORDS_LIST:
        return None

    lowered = text.lower()
    for keyword in KEYWORDS_LIST:
        if keyword in lowered:
            print('######## Input keyword alarm triggered:', text)
            return KEYWORD_BLOCK_MESSAGE1
    return None
83
+
84
+
85
class ModelWorker:
    """Load one model and serve streaming generation requests for it.

    The worker optionally registers itself with a controller and keeps
    sending heart beats from a background thread.
    """

    def __init__(self, controller_addr, worker_addr,
                 worker_id, no_register,
                 model_path, model_base, model_name,
                 load_8bit, load_4bit, device):
        """Load the model and, unless ``no_register`` is set, register with the controller.

        When ``model_name`` is None it is derived from the last component of
        ``model_path`` (prefixed with the parent directory name for
        ``checkpoint-*`` directories).
        """
        self.controller_addr = controller_addr
        self.worker_addr = worker_addr
        self.worker_id = worker_id
        self.model_path = model_path
        if model_path.endswith("/"):
            model_path = model_path[:-1]
        if model_name is None:
            model_paths = model_path.split("/")
            if model_paths[-1].startswith('checkpoint-'):
                self.model_name = model_paths[-2] + "_" + model_paths[-1]
            else:
                self.model_name = model_paths[-1]
        else:
            self.model_name = model_name

        self.device = device
        logger.info(f"Loading the model {self.model_name} on worker {worker_id} ...")
        self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model(
            model_path, model_base, self.model_name, load_8bit, load_4bit, device=self.device)
        # Multimodal inputs are only processed when the model name says so.
        self.is_multimodal = 'videollama2' in self.model_name.lower() or 'vlb' in self.model_name.lower()

        if not no_register:
            self.register_to_controller()
            # Daemon thread (as in the sglang worker): the endless heart-beat
            # loop must not keep the process alive on shutdown.
            self.heart_beat_thread = threading.Thread(
                target=heart_beat_worker, args=(self,), daemon=True)
            self.heart_beat_thread.start()

    def register_to_controller(self):
        """POST this worker's address and status to the controller's ``/register_worker``."""
        logger.info("Register to controller")

        url = self.controller_addr + "/register_worker"
        data = {
            "worker_name": self.worker_addr,
            "check_heart_beat": True,
            "worker_status": self.get_status()
        }
        r = requests.post(url, json=data)
        assert r.status_code == 200

    def send_heart_beat(self):
        """Report the queue length to the controller, retrying until a request succeeds.

        Re-registers the worker when the controller answers that it does not
        know this worker.
        """
        logger.info(f"Send heart beat. Models: {[self.model_name]}. "
                    f"Semaphore: {pretty_print_semaphore(model_semaphore)}. "
                    f"global_counter: {global_counter}")

        url = self.controller_addr + "/receive_heart_beat"

        while True:
            try:
                ret = requests.post(url, json={
                    "worker_name": self.worker_addr,
                    "queue_length": self.get_queue_length()}, timeout=5)
                exist = ret.json()["exist"]
                break
            except requests.exceptions.RequestException as e:
                logger.error(f"heart beat error: {e}")
            time.sleep(5)

        if not exist:
            self.register_to_controller()

    def get_queue_length(self):
        """Return the number of running plus waiting requests (0 before the first request)."""
        if model_semaphore is None:
            return 0
        else:
            # Slots in use = limit - free slots; waiters are requests still queued.
            return args.limit_model_concurrency - model_semaphore._value + (len(
                model_semaphore._waiters) if model_semaphore._waiters is not None else 0)

    def get_status(self):
        """Return the status dictionary reported to the controller."""
        return {
            "model_names": [self.model_name],
            "speed": 1,
            "queue_length": self.get_queue_length(),
        }

    @torch.inference_mode()
    def generate_stream(self, params):
        """Generate text for ``params`` and yield it as a stream of JSON chunks.

        ``params`` keys read here: ``prompt`` (required), ``images``,
        ``temperature``, ``top_p``, ``max_new_tokens`` and ``stop``.

        Yields:
            ``bytes`` chunks, each a JSON object with ``text`` and
            ``error_code`` followed by a ``b"\\0"`` delimiter.

        Raises:
            ValueError: If the number of images/videos does not match the
                number of image or video tokens in the prompt.
        """
        tokenizer, model, image_processor = self.tokenizer, self.model, self.image_processor

        prompt = params["prompt"]
        ori_prompt = prompt
        images_or_videos = params.get("images", None)
        num_image_tokens = 0
        modal_list = []
        # Text-only requests never enter the multimodal branch below; without a
        # default the tokenizer call failed with UnboundLocalError.
        # NOTE(review): assumes tokenizer_MMODAL_token leaves a prompt without
        # the modal token untouched — confirm against mm_utils.
        modal_token_index = MMODAL_TOKEN_INDEX["IMAGE"]
        if images_or_videos is not None and len(images_or_videos) and self.is_multimodal:
            if len(images_or_videos) != prompt.count(DEFAULT_IMAGE_TOKEN) and len(images_or_videos) != (prompt.count(DEFAULT_VIDEO_TOKEN)):
                raise ValueError("Number of images/videos does not match number of <image>/<video> tokens in prompt")

            try:
                print("Load image...")
                images_or_videos = [load_image_from_base64(image) for image in images_or_videos]
                images_or_videos = process_images(images_or_videos, image_processor, model.config)

                modal_list = ["image"]
                replace_token = DEFAULT_IMAGE_TOKEN
                modal_token_index = MMODAL_TOKEN_INDEX["IMAGE"]
            except Exception:
                # Anything that cannot be handled as images is retried as a
                # video (was a bare ``except:`` that also swallowed
                # KeyboardInterrupt/SystemExit).
                print("Load video instead...")
                decord_vr = VideoReader(uri=images_or_videos[0], ctx=cpu(0))
                duration = len(decord_vr)
                if "use_taug" not in self.model_path:
                    # Sample 8 evenly spaced frames over the whole video.
                    frame_id_list = np.linspace(0, duration-1, 8, dtype=int)
                    video_frames = decord_vr.get_batch(frame_id_list).asnumpy()
                    images_or_videos = process_videos(video_frames, image_processor, model.config)
                else:
                    print("Temporal augmentation activated!!!")
                    frame_id_list = np.linspace(0, duration-1, 8 * 2 * 2, dtype=int)
                    video_data = decord_vr.get_batch(frame_id_list)
                    video_frames = [Image.fromarray(f) for f in video_data.asnumpy()]
                    chunked_video_frames = chunk_list(video_frames, 2*2)
                    expanded_video_frames = [frame_expansion(frame_list, 2) for frame_list in chunked_video_frames]
                    images_or_videos = process_videos(expanded_video_frames, image_processor, model.config)

                modal_list = ["video"]
                replace_token = DEFAULT_VIDEO_TOKEN
                modal_token_index = MMODAL_TOKEN_INDEX["VIDEO"]

            if isinstance(images_or_videos, list):
                # A list has no ``.shape``; move its items to the device one by one.
                images_or_videos = [image.to(self.model.device, dtype=torch.float16) for image in images_or_videos]
            else:
                images_or_videos = images_or_videos.to(self.model.device, dtype=torch.float16)
                if modal_list[0] == "video":
                    print("Video:", images_or_videos.shape)
                    images_or_videos = [images_or_videos]
                else:
                    print("Image:", images_or_videos.shape)

            if getattr(self.model.config, 'mm_use_im_start_end', False):
                replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
            prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)

            num_image_tokens = prompt.count(replace_token) * model.get_vision_tower().num_patches
            image_args = {"images_or_videos": images_or_videos, "modal_list": modal_list}
        else:
            image_args = {}
        print("image_args:", image_args)
        temperature = float(params.get("temperature", 1.0))
        top_p = float(params.get("top_p", 1.0))
        max_context_length = getattr(model.config, 'max_position_embeddings', 2048)
        max_new_tokens = min(int(params.get("max_new_tokens", 256)), 1024)
        stop_str = params.get("stop", None)
        do_sample = temperature > 0.001

        # tokenizer for our video-llama beta
        input_ids = tokenizer_MMODAL_token(prompt, tokenizer, modal_token_index, return_tensors='pt').unsqueeze(0).to(self.device)
        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=15)

        # Leave room for the prompt and the visual tokens inside the context window.
        max_new_tokens = min(max_new_tokens, max_context_length - input_ids.shape[-1] - num_image_tokens)

        if max_new_tokens < 1:
            yield json.dumps({"text": ori_prompt + "Exceeds max token length. Please start a new conversation, thanks.", "error_code": 0}).encode() + b"\0"
            return

        generate_kwargs = dict(
            inputs=input_ids,
            do_sample=do_sample,
            temperature=temperature,
            top_p=top_p,
            max_new_tokens=max_new_tokens,
            streamer=streamer,
            use_cache=True,
            **image_args
        )
        # Only install the keyword stopping criterion when a stop string was
        # actually supplied; ``[None]`` is not a usable keyword list.
        if stop_str:
            keywords = [stop_str]
            stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
            generate_kwargs["stopping_criteria"] = [stopping_criteria]

        # Generation runs in a separate thread; this generator consumes the streamer.
        thread = Thread(target=model.generate, kwargs=generate_kwargs)
        thread.start()

        generated_text = ori_prompt
        token_count = 0
        for new_text in streamer:
            generated_text += new_text
            token_count += len(tokenizer.encode(new_text))
            # Run the keyword check roughly every STREAM_CHECK_MULTIPLE tokens.
            if token_count >= STREAM_CHECK_MULTIPLE:
                safety_message = safety_check(generated_text)
                if safety_message:
                    print('####### Keyword alarm triggered:', generated_text)
                    yield json.dumps({"text": safety_message, "error_code": 1}).encode() + b"\0"
                    return
                token_count = 0

            # Guard against a missing/empty stop string: ``endswith(None)``
            # raises and ``text[:-0]`` would wipe the whole text.
            if stop_str and generated_text.endswith(stop_str):
                generated_text = generated_text[:-len(stop_str)]
            yield json.dumps({"text": generated_text, "error_code": 0}).encode() + b"\0"

    def generate_stream_gate(self, params):
        """Wrap ``generate_stream`` with the input keyword check and error handling.

        Yields the chunks of ``generate_stream``; on a blocked input or on any
        exception a single chunk with ``error_code`` 1 is yielded instead.
        """
        try:
            input_text = params.get("prompt", "")
            safety_message = input_safety_check(input_text)
            if safety_message:
                yield json.dumps({"text": safety_message, "error_code": 1}).encode() + b"\0"
                return

            for x in self.generate_stream(params):
                yield x
        except ValueError as e:
            print("Caught ValueError:", e)
            ret = {
                "text": server_error_msg,
                "error_code": 1,
            }
            yield json.dumps(ret).encode() + b"\0"
        except torch.cuda.CudaError as e:
            print("Caught torch.cuda.CudaError:", e)
            ret = {
                "text": server_error_msg,
                "error_code": 1,
            }
            yield json.dumps(ret).encode() + b"\0"
        except Exception as e:
            print("Caught Unknown Error", e)
            ret = {
                "text": server_error_msg,
                "error_code": 1,
            }
            yield json.dumps(ret).encode() + b"\0"
333
+
334
+
335
+ app = FastAPI()
336
+
337
+
338
def release_model_semaphore(fn=None):
    """Release one slot of the global ``model_semaphore``, then call ``fn`` if one was given."""
    model_semaphore.release()
    if fn is None:
        return
    fn()
342
+
343
+
344
@app.post("/worker_generate_stream")
async def generate_stream(request: Request):
    """Handle a generation request and stream the worker's output back.

    Counts the request, waits for a free slot of the global semaphore and
    returns a ``StreamingResponse`` over ``worker.generate_stream_gate``.
    """
    global model_semaphore, global_counter
    global_counter += 1
    params = await request.json()

    # The semaphore is created lazily on the first request.
    if model_semaphore is None:
        model_semaphore = asyncio.Semaphore(args.limit_model_concurrency)
    await model_semaphore.acquire()
    worker.send_heart_beat()
    generator = worker.generate_stream_gate(params)
    # The slot is released (and another heart beat sent) by a background task
    # attached to the response.
    background_tasks = BackgroundTasks()
    background_tasks.add_task(partial(release_model_semaphore, fn=worker.send_heart_beat))
    return StreamingResponse(generator, background=background_tasks)
358
+
359
+
360
@app.post("/worker_get_status")
async def get_status(request: Request):
    """Return the status dictionary produced by ``worker.get_status()``."""
    return worker.get_status()
363
+
364
+
365
if __name__ == "__main__":
    # Command-line entry point of the model worker: parse the options, build
    # the module-level ``worker`` and serve the FastAPI app with uvicorn.
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=21002)
    parser.add_argument("--worker-address", type=str, default="http://localhost:21002")
    parser.add_argument("--controller-address", type=str, default="http://localhost:21001")
    parser.add_argument("--model-path", type=str, default="facebook/opt-350m")
    parser.add_argument("--model-base", type=str, default=None)
    parser.add_argument("--model-name", type=str)
    parser.add_argument("--device", type=str, default="cuda")
    # The message names `videollama2` because that is what ModelWorker checks
    # for in the model name (the original text still said `llava`).
    parser.add_argument("--multi-modal", action="store_true", help="Multimodal mode is automatically detected with model name, please make sure `videollama2` is included in the model path.")
    parser.add_argument("--limit-model-concurrency", type=int, default=5)
    parser.add_argument("--stream-interval", type=int, default=1)
    parser.add_argument("--no-register", action="store_true")
    parser.add_argument("--load-8bit", action="store_true")
    parser.add_argument("--load-4bit", action="store_true")
    args = parser.parse_args()
    logger.info(f"args: {args}")

    if args.multi_modal:
        logger.warning("Multimodal mode is automatically detected with model name, please make sure `videollama2` is included in the model path.")

    # ``worker`` and ``args`` are module-level names read by the request handlers.
    worker = ModelWorker(args.controller_address,
                         args.worker_address,
                         worker_id,
                         args.no_register,
                         args.model_path,
                         args.model_base,
                         args.model_name,
                         args.load_8bit,
                         args.load_4bit,
                         args.device)
    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
videollama2/serve/register_worker.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""
Manually register workers.

Usage:
python3 -m videollama2.serve.register_worker --controller-address http://localhost:21001 --worker-name http://localhost:21002
"""

import argparse

import requests

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Both addresses are required: without them the script used to fail with
    # a TypeError when concatenating ``None`` and the endpoint path.
    parser.add_argument("--controller-address", type=str, required=True)
    parser.add_argument("--worker-name", type=str, required=True)
    parser.add_argument("--check-heart-beat", action="store_true")
    args = parser.parse_args()

    url = args.controller_address + "/register_worker"
    data = {
        "worker_name": args.worker_name,
        "check_heart_beat": args.check_heart_beat,
        "worker_status": None,
    }
    r = requests.post(url, json=data)
    assert r.status_code == 200
videollama2/serve/sglang_worker.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A model worker executes the model.
3
+ """
4
+ import argparse
5
+ import asyncio
6
+ from concurrent.futures import ThreadPoolExecutor
7
+ import json
8
+ import time
9
+ import threading
10
+ import uuid
11
+
12
+ from fastapi import FastAPI, Request, BackgroundTasks
13
+ from fastapi.responses import StreamingResponse
14
+ import requests
15
+ import re
16
+ import uvicorn
17
+ from functools import partial
18
+
19
+ from llava.constants import WORKER_HEART_BEAT_INTERVAL
20
+ from llava.utils import (build_logger, server_error_msg,
21
+ pretty_print_semaphore)
22
+ from llava.mm_utils import process_images, load_image_from_base64, tokenizer_image_token, expand2square
23
+ from llava.constants import DEFAULT_IMAGE_TOKEN
24
+
25
+ import sglang as sgl
26
+ from sglang.backend.runtime_endpoint import RuntimeEndpoint
27
+
28
+
29
+ GB = 1 << 30
30
+
31
+ worker_id = str(uuid.uuid4())[:6]
32
+ logger = build_logger("model_worker", f"model_worker_{worker_id}.log")
33
+ global_counter = 0
34
+
35
+ model_semaphore = None
36
+
37
+
38
def heart_beat_worker(controller):
    """Call ``controller.send_heart_beat()`` forever, sleeping
    ``WORKER_HEART_BEAT_INTERVAL`` seconds before each call.

    Never returns; intended to be run as a thread target.
    """
    while True:
        time.sleep(WORKER_HEART_BEAT_INTERVAL)
        controller.send_heart_beat()
42
+
43
+
44
@sgl.function
def pipeline(s, prompt, max_tokens):
    """Append every prompt part to the sglang state ``s`` and request a completion.

    String parts are appended as text; every other part is wrapped with
    ``sgl.image``. The completion is stored under the name ``"response"`` and
    limited to ``max_tokens`` tokens.
    """
    for p in prompt:
        # isinstance (rather than an exact type comparison) also accepts str subclasses.
        if isinstance(p, str):
            s += p
        else:
            s += sgl.image(p)
    s += sgl.gen("response", max_tokens=max_tokens)
52
+
53
+
54
class ModelWorker:
    """Worker that forwards generation requests to an SGLang runtime endpoint.

    On construction it connects to the endpoint, derives the served model
    name and, unless ``no_register`` is set, registers itself with the
    controller and starts a daemon heart-beat thread.
    """

    def __init__(self, controller_addr, worker_addr, sgl_endpoint,
                 worker_id, no_register, model_name):
        """
        :param controller_addr: Base URL of the controller.
        :param worker_addr: URL under which this worker is reachable.
        :param sgl_endpoint: URL of the SGLang runtime endpoint.
        :param worker_id: Identifier used in log messages.
        :param no_register: When true, skip registration and heart beats.
        :param model_name: Explicit model name; derived from the endpoint's
            ``model_path`` when ``None``.
        """
        self.controller_addr = controller_addr
        self.worker_addr = worker_addr
        self.worker_id = worker_id

        # Select backend
        backend = RuntimeEndpoint(sgl_endpoint)
        sgl.set_default_backend(backend)
        model_path = backend.model_info["model_path"]

        if model_path.endswith("/"):
            model_path = model_path[:-1]
        if model_name is None:
            model_paths = model_path.split("/")
            # ".../<run>/checkpoint-N" is named "<run>_checkpoint-N".
            if model_paths[-1].startswith('checkpoint-'):
                self.model_name = model_paths[-2] + "_" + model_paths[-1]
            else:
                self.model_name = model_paths[-1]
        else:
            self.model_name = model_name

        logger.info(f"Loading the SGLANG model {self.model_name} on worker {worker_id} ...")

        if not no_register:
            self.register_to_controller()
            self.heart_beat_thread = threading.Thread(
                target=heart_beat_worker, args=(self,), daemon=True)
            self.heart_beat_thread.start()

    def register_to_controller(self):
        """POST this worker's address and status to the controller.

        :raises RuntimeError: if the controller does not answer with HTTP 200.
        """
        logger.info("Register to controller")

        url = self.controller_addr + "/register_worker"
        data = {
            "worker_name": self.worker_addr,
            "check_heart_beat": True,
            "worker_status": self.get_status()
        }
        r = requests.post(url, json=data)
        # Explicit check instead of `assert`, which disappears under `python -O`.
        if r.status_code != 200:
            raise RuntimeError(
                f"Controller registration failed with HTTP status {r.status_code}")

    def send_heart_beat(self):
        """Report the queue length to the controller, retrying until it answers.

        Re-registers when the controller reports that it no longer knows
        this worker.
        """
        logger.info(f"Send heart beat. Models: {[self.model_name]}. "
                    f"Semaphore: {pretty_print_semaphore(model_semaphore)}. "
                    f"global_counter: {global_counter}")

        url = self.controller_addr + "/receive_heart_beat"

        while True:
            try:
                ret = requests.post(url, json={
                    "worker_name": self.worker_addr,
                    "queue_length": self.get_queue_length()}, timeout=5)
                exist = ret.json()["exist"]
                break
            except requests.exceptions.RequestException as e:
                logger.error(f"heart beat error: {e}")
            time.sleep(5)

        if not exist:
            self.register_to_controller()

    def get_queue_length(self):
        """Return the number of running plus waiting requests (0 before the first one)."""
        if model_semaphore is None:
            return 0
        # Reads the semaphore's private counters: slots in use + tasks waiting.
        waiters = model_semaphore._waiters
        waiting = len(waiters) if waiters is not None else 0
        return args.limit_model_concurrency - model_semaphore._value + waiting

    def get_status(self):
        """Return the status dict sent to the controller."""
        return {
            "model_names": [self.model_name],
            "speed": 1,
            "queue_length": self.get_queue_length(),
        }

    async def generate_stream(self, params):
        """Stream generation results for one request.

        :param params: Request dict. Reads ``prompt`` (required) and the
            optional ``images``, ``temperature``, ``top_p``,
            ``max_new_tokens`` and ``stop`` keys.
        :raises ValueError: if the number of images differs from the number
            of ``DEFAULT_IMAGE_TOKEN`` occurrences in the prompt.

        Yields NUL-terminated JSON chunks whose ``text`` is the original
        prompt followed by everything generated so far.
        """
        ori_prompt = prompt = params["prompt"]
        images = params.get("images", None)
        if images is not None and len(images) > 0:
            if len(images) != prompt.count(DEFAULT_IMAGE_TOKEN):
                raise ValueError("Number of images does not match number of <image> tokens in prompt")

            images = [load_image_from_base64(image) for image in images]

            # FIXME: for image-start/end token
            # replace_token = DEFAULT_IMAGE_TOKEN
            # if getattr(self.model.config, 'mm_use_im_start_end', False):
            #     replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
            # prompt = prompt.replace(DEFAULT_IMAGE_TOKEN, replace_token)
            prompt = prompt.replace(' ' + DEFAULT_IMAGE_TOKEN + '\n', DEFAULT_IMAGE_TOKEN)
            # Interleave the text pieces with the decoded images.
            text_pieces = prompt.split(DEFAULT_IMAGE_TOKEN)
            prompt = []
            for idx, piece in enumerate(text_pieces):
                prompt.append(piece)
                if idx < len(images):
                    prompt.append(images[idx])
        else:
            prompt = [prompt]

        temperature = float(params.get("temperature", 1.0))
        top_p = float(params.get("top_p", 1.0))
        # max_context_length = getattr(model.config, 'max_position_embeddings', 2048)
        max_new_tokens = min(int(params.get("max_new_tokens", 256)), 1024)
        # NOTE(review): `stop_str` is computed but never forwarded to the
        # pipeline, so the request's "stop" value currently has no effect.
        stop_str = params.get("stop", None)
        stop_str = [stop_str] if stop_str is not None else None

        print({'prompt': prompt, 'max_new_tokens': max_new_tokens, 'temperature': temperature, 'top_p': top_p})
        state = pipeline.run(prompt, max_new_tokens, temperature=temperature, top_p=top_p, stream=True)

        generated_text = ori_prompt
        async for text_outputs in state.text_async_iter(var_name="response"):
            generated_text += text_outputs
            yield json.dumps({"text": generated_text, "error_code": 0}).encode() + b"\0"

    @staticmethod
    def _error_chunk():
        """Return the NUL-terminated JSON chunk reported to clients on failure."""
        ret = {
            "text": server_error_msg,
            "error_code": 1,
        }
        return json.dumps(ret).encode() + b"\0"

    async def generate_stream_gate(self, params):
        """Wrap ``generate_stream`` so that any failure becomes an error chunk."""
        try:
            async for x in self.generate_stream(params):
                yield x
        except ValueError as e:
            print("Caught ValueError:", e)
            yield self._error_chunk()
        except Exception as e:
            # Top-level boundary: the stream must end with a well-formed chunk.
            print("Caught Unknown Error", e)
            yield self._error_chunk()
190
+
191
+
192
+ app = FastAPI()
193
+
194
+
195
def release_model_semaphore(fn=None):
    """Release one slot of the global semaphore, then run ``fn`` if given."""
    model_semaphore.release()
    if fn is None:
        return
    fn()
199
+
200
+
201
@app.post("/worker_generate_stream")
async def generate_stream(request: Request):
    """Handle one streaming generation request.

    Acquires a slot of the concurrency semaphore (created on first use),
    sends a heart beat, and returns the worker's stream. The slot is
    released, followed by another heart beat, in a background task
    attached to the response.
    """
    global model_semaphore, global_counter
    global_counter += 1
    params = await request.json()

    if model_semaphore is None:
        model_semaphore = asyncio.Semaphore(args.limit_model_concurrency)
    await model_semaphore.acquire()
    worker.send_heart_beat()

    cleanup = BackgroundTasks()
    cleanup.add_task(partial(release_model_semaphore, fn=worker.send_heart_beat))
    return StreamingResponse(worker.generate_stream_gate(params), background=cleanup)
215
+
216
+
217
@app.post("/worker_get_status")
async def get_status(request: Request):
    """Return the worker's current status dict; the request body is not read."""
    status = worker.get_status()
    return status
220
+
221
+
222
if __name__ == "__main__":
    # Command-line options of the worker process.
    parser = argparse.ArgumentParser()
    parser.add_argument("--host", type=str, default="localhost")
    parser.add_argument("--port", type=int, default=21002)
    parser.add_argument("--worker-address", type=str,
                        default="http://localhost:21002")
    parser.add_argument("--controller-address", type=str,
                        default="http://localhost:21001")
    parser.add_argument("--model-name", type=str)
    parser.add_argument("--sgl-endpoint", type=str)
    parser.add_argument("--limit-model-concurrency", type=int, default=5)
    parser.add_argument("--stream-interval", type=int, default=1)
    parser.add_argument("--no-register", action="store_true")
    # `args` and `worker` are module globals read by the request handlers.
    args = parser.parse_args()
    logger.info(f"args: {args}")

    worker = ModelWorker(
        args.controller_address,
        args.worker_address,
        args.sgl_endpoint,
        worker_id,
        args.no_register,
        args.model_name,
    )
    uvicorn.run(app, host=args.host, port=args.port, log_level="info")
videollama2/serve/test_message.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+
4
+ import requests
5
+
6
+ from llava.conversation import default_conversation
7
+
8
+
9
def main():
    """Send ``args.message`` to a worker and print the streamed reply.

    Uses ``args.worker_address`` when given; otherwise asks the controller
    to refresh its workers, list the models, and return the address of a
    worker serving ``args.model_name``. Returns silently when the address
    is empty.
    """
    worker_addr = args.worker_address
    if not worker_addr:
        controller_addr = args.controller_address
        requests.post(controller_addr + "/refresh_all_workers")
        ret = requests.post(controller_addr + "/list_models")
        models = ret.json()["models"]
        models.sort()
        print(f"Models: {models}")

        ret = requests.post(controller_addr + "/get_worker_address",
                            json={"model": args.model_name})
        worker_addr = ret.json()["address"]
        print(f"worker_addr: {worker_addr}")

    if worker_addr == "":
        return

    conv = default_conversation.copy()
    conv.append_message(conv.roles[0], args.message)
    prompt = conv.get_prompt()

    pload = {
        "model": args.model_name,
        "prompt": prompt,
        "max_new_tokens": args.max_new_tokens,
        "temperature": 0.7,
        "stop": conv.sep,
    }
    response = requests.post(
        worker_addr + "/worker_generate_stream",
        headers={"User-Agent": "LLaVA Client"},
        json=pload,
        stream=True,
    )

    print(prompt.replace(conv.sep, "\n"), end="")
    # The worker terminates every JSON chunk with a NUL byte.
    for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"):
        if not chunk:
            continue
        data = json.loads(chunk.decode("utf-8"))
        output = data["text"].split(conv.sep)[-1]
        print(output, end="\r")
    print("")
50
+
51
+
52
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--controller-address", type=str, default="http://localhost:21001")
    parser.add_argument("--worker-address", type=str)
    parser.add_argument("--model-name", type=str, default="facebook/opt-350m")
    parser.add_argument("--max-new-tokens", type=int, default=32)
    parser.add_argument(
        "--message",
        type=str,
        default="Tell me a story with more than 1000 words.",
    )
    # `args` is a module global read by main().
    args = parser.parse_args()

    main()
+ main()
videollama2/train.py ADDED
@@ -0,0 +1,585 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Adopted from https://github.com/haotian-liu/LLaVA. Below is the original copyright:
2
+ # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
3
+ # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
4
+ # Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
+ import re
19
+ import os
20
+ import copy
21
+ import json
22
+ import random
23
+ import pathlib
24
+ import traceback
25
+ from dataclasses import dataclass, field
26
+ from typing import Dict, Optional, Sequence, List
27
+
28
+ # torch-related packages
29
+ # NOTE: torch must be imported before transformers. Otherwise, `Segmentation fault (core dumped)` will occur.
30
+ import torch
31
+ from torch.utils.data import Dataset
32
+
33
+ import transformers
34
+ from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
35
+
36
+ import sys
37
+ sys.path.append('./')
38
+ from videollama2.model import *
39
+ from videollama2.constants import NUM_FRAMES, IGNORE_INDEX, MODAL_INDEX_MAP
40
+ from videollama2.mm_utils import tokenizer_multimodal_token, process_video, process_image
41
+ from videollama2.videollama2_trainer import (VideoLLaMA2Trainer,
42
+ get_peft_state_maybe_zero_3, get_peft_state_non_lora_maybe_zero_3,
43
+ find_all_linear_names, safe_save_model_for_hf_trainer
44
+ )
45
+
46
+ # NOTE: fast tokenizer warning issue: https://github.com/huggingface/transformers/issues/5486
47
+ os.environ["TOKENIZERS_PARALLELISM"] = "true"
48
+
49
+ local_rank = None
50
+
51
+
52
def rank0_print(*args):
    """Print ``args`` only when the module-level ``local_rank`` equals 0."""
    if local_rank != 0:
        return
    print(*args)
55
+
56
+
57
def set_seed(seed=42):
    """
    Set the random seed for reproducible results.

    Seeds Python's ``random`` module and PyTorch (CPU and all CUDA devices),
    and switches cuDNN to deterministic mode with benchmarking disabled.

    :param seed: An integer value to be used as the random seed.
    """
    # The dataset picks fallback samples with `random.randint`, so the
    # stdlib generator has to be seeded as well.
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # for multi-GPU setups
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
68
+
69
+
70
@dataclass
class ModelArguments:
    """Command-line options describing the model to build."""

    # --- LLM ---
    model_type: Optional[str] = field(
        default="videollama2",
        metadata={"help": "Model type selected in the list: " + ", ".join(VLLMs.keys())},
    )
    model_path: Optional[str] = field(default="lmsys/vicuna-7b-v1.5")
    version: Optional[str] = field(
        default="v1",
        metadata={"help": "Version of the conversation template."},
    )
    freeze_backbone: bool = field(
        default=False,
        metadata={"help": "Whether to freeze the LLM backbone."},
    )
    # --- Connector ---
    mm_projector_type: Optional[str] = field(default='linear')
    tune_mm_mlp_adapter: bool = field(default=False)
    pretrain_mm_mlp_adapter: Optional[str] = field(default=None)
    # --- Vision tower ---
    vision_tower: Optional[str] = field(default=None)
    mm_vision_select_layer: Optional[int] = field(default=-1)
    mm_vision_select_feature: Optional[str] = field(default="patch")
85
+
86
+
87
@dataclass
class DataArguments:
    """Command-line options describing the training data."""

    # --- Paths ---
    data_path: str = field(
        default=None,
        metadata={"help": "Path to the training data."},
    )
    # A single folder is joined with both image and video file names.
    data_folder: Optional[str] = field(default=None)
    # --- Loading ---
    is_multimodal: bool = False
    lazy_preprocess: bool = False
    num_frames: Optional[int] = field(default=None)
    # --- Preprocessing ---
    image_aspect_ratio: str = 'square'
100
+
101
+
102
@dataclass
class TrainingArguments(transformers.TrainingArguments):
    """Hugging Face training arguments extended with project-specific options."""

    optim: str = field(default="adamw_torch")
    mm_projector_lr: Optional[float] = None
    freeze_mm_mlp_adapter: bool = field(default=False)
    remove_unused_columns: bool = field(default=False)
    cache_dir: Optional[str] = field(default=None)
    # --- Training data ---
    group_by_modality_length: bool = field(default=False)
    model_max_length: int = field(
        default=512,
        metadata={
            "help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )
    # --- LoRA / quantization ---
    double_quant: bool = field(
        default=True,
        metadata={"help": "Compress the quantization statistics through double quantization."},
    )
    quant_type: str = field(
        default="nf4",
        metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."},
    )
    bits: int = field(
        default=16,
        metadata={"help": "How many bits to use."},
    )
    lora_enable: bool = False
    lora_r: int = 64
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    lora_weight_path: str = ""
    lora_bias: str = "none"
137
+
138
+
139
def preprocess_plain(
    sources: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
    modal_token: str = None,
) -> Dict:
    """Tokenize two-turn samples for pretraining.

    Each source must hold exactly two turns, and the first turn must
    contain ``modal_token``. The user turn is replaced by the bare
    ``modal_token``; the second turn's ``value`` is the assistant reply.

    :param sources: Conversations, each a two-element list of dicts with a
        ``value`` key.
    :param tokenizer: Tokenizer providing ``apply_chat_template``.
    :param modal_token: Modality placeholder passed to
        ``tokenizer_multimodal_token``.
    :return: Dict with ``input_ids`` and ``labels`` lists; in the labels the
        instruction prefix is set to ``IGNORE_INDEX``.
    """
    input_ids = []
    targets = []
    for source in sources:
        # 1. apply chat template for input conversation
        assert len(source) == 2
        assert modal_token in source[0]['value']
        message = [
            {'role': 'user', 'content': modal_token},
            {'role': 'assistant', 'content': source[1]['value']}
        ]
        conversation = tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=False)
        # 2. tokenize conversations
        input_ids.append(tokenizer_multimodal_token(conversation, tokenizer, modal_token, return_tensors='pt'))
        # 3. make targets: mask everything up to and including the generation prompt
        targets.append(copy.deepcopy(input_ids[-1]))
        instruction = tokenizer.apply_chat_template(message[:1], tokenize=False, add_generation_prompt=True)
        instruction_len = len(tokenizer_multimodal_token(instruction, tokenizer, modal_token, return_tensors='pt'))
        targets[-1][:instruction_len] = IGNORE_INDEX

    return dict(input_ids=input_ids, labels=targets)
175
+
176
+
177
def preprocess(
    sources: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
    modal_token: str = None,
) -> Dict:
    """Tokenize multi-turn conversations and mask the non-assistant tokens.

    :param sources: Conversations; each is a list of dicts with ``from``
        (``"human"`` or ``"gpt"``) and ``value`` keys. A leading turn that is
        not from the human is dropped.
    :param tokenizer: Tokenizer providing ``apply_chat_template``.
    :param modal_token: Modality placeholder passed to
        ``tokenizer_multimodal_token``.
    :return: Dict with ``input_ids`` and ``labels`` lists; label positions
        belonging to instructions are set to ``IGNORE_INDEX``.
    """
    roles = {"human": "user", "gpt": "assistant"}

    input_ids = []
    targets = []
    for source in sources:
        if roles[source[0]["from"]] != "user":
            # Skip the first one if it is not from human
            source = source[1:]

        message = [{'role': roles[sentence['from']], 'content': sentence['value']} for sentence in source]
        conversation = tokenizer.apply_chat_template(message, tokenize=False, add_generation_prompt=False)
        input_ids.append(tokenizer_multimodal_token(conversation, tokenizer, modal_token, return_tensors='pt'))
        targets.append(copy.deepcopy(input_ids[-1]))

        assert len(source) % 2 == 0, f"Invalid conversation length {len(source)}."

        # Walk the (user, assistant) rounds. For each round, mask the span
        # from the end of the previous round up to the end of this round's
        # instruction (including the generation prompt).
        cur = 0
        message = []
        for user_turn, assistant_turn in zip(source[0::2], source[1::2]):
            tmp_message = [
                {'role': roles[user_turn['from']], 'content': user_turn['value']},
                {'role': roles[assistant_turn['from']], 'content': assistant_turn['value']}
            ]

            instruction = tokenizer.apply_chat_template(message + tmp_message[:1], tokenize=False, add_generation_prompt=True)
            conversation = tokenizer.apply_chat_template(message + tmp_message, tokenize=False, add_generation_prompt=False)

            instruction_len = len(tokenizer_multimodal_token(instruction, tokenizer, modal_token, return_tensors='pt'))
            conversation_len = len(tokenizer_multimodal_token(conversation, tokenizer, modal_token, return_tensors='pt'))

            targets[-1][cur:instruction_len] = IGNORE_INDEX

            cur = conversation_len
            message += tmp_message

    return dict(input_ids=input_ids, labels=targets)
221
+
222
+
223
def preprocess_multimodal(
    sources: Sequence[str],
    data_args: DataArguments,
    modal_token: str = None,
) -> Sequence:
    """Move the modality token to the head of every sentence that contains it.

    Sentences are modified in place. When ``data_args.is_multimodal`` is
    false, ``sources`` is returned untouched.

    :param sources: Conversations; each is a list of dicts with a ``value`` key.
    :param data_args: Only ``is_multimodal`` is read.
    :param modal_token: Modality placeholder; must be a key of
        ``MODAL_INDEX_MAP`` when the data is multimodal.
    :return: The same ``sources`` object.
    """
    if not data_args.is_multimodal:
        return sources

    assert modal_token in MODAL_INDEX_MAP, f"Unsupported modal token {modal_token}."

    for source in sources:
        for sentence in source:
            if modal_token in sentence['value']:
                # Remove every occurrence, then put a single token on its own
                # first line.
                text = sentence['value'].replace(modal_token, '').strip()
                sentence['value'] = (modal_token + '\n' + text).strip()
            # TODO: fix this for multimedia, e.g., <video>, <audio>, etc.
            # (The token is currently kept as-is; no replacement token is applied.)

    return sources
245
+
246
+
247
class LazySupervisedDataset(Dataset):
    """Dataset for supervised fine-tuning.

    The JSON annotation file is loaded eagerly; images and videos are read
    and tokenization is performed only when a sample is requested.
    """

    def __init__(self, data_path: str,
                 tokenizer: transformers.PreTrainedTokenizer,
                 data_args: DataArguments):
        """
        :param data_path: Path of the JSON file holding the list of samples.
        :param tokenizer: Tokenizer handed to the preprocess functions.
        :param data_args: Data options; ``__getitem__`` additionally reads
            ``image_processor``, ``video_processor``, ``is_pretraining`` and
            ``image_size`` from it.
        """
        super(LazySupervisedDataset, self).__init__()
        # Context manager so the file handle is closed deterministically.
        with open(data_path, "r") as f:
            list_data_dict = json.load(f)

        rank0_print("Formatting inputs...Skip in lazy mode")
        self.tokenizer = tokenizer
        self.list_data_dict = list_data_dict
        self.data_args = data_args

    def __len__(self):
        return len(self.list_data_dict)

    @property
    def lengths(self):
        """Approximate length per sample: word count, plus 576 for image samples."""
        length_list = []
        for sample in self.list_data_dict:
            img_tokens = 576 if 'image' in sample else 0
            length_list.append(sum(len(conv['value'].split()) for conv in sample['conversations']) + img_tokens)
        return length_list

    @property
    def modality_lengths(self):
        """Word count per sample, negated for samples without an ``image`` key."""
        # NOTE(review): samples with only a 'video' key are negated like
        # text-only samples — confirm this is intended.
        length_list = []
        for sample in self.list_data_dict:
            cur_len = sum(len(conv['value'].split()) for conv in sample['conversations'])
            cur_len = cur_len if 'image' in sample else -cur_len
            length_list.append(cur_len)
        return length_list

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        """Load, preprocess and tokenize sample ``i``.

        When the image or video cannot be read, the traceback is printed and
        a randomly chosen sample is returned instead.
        """
        sources = self.list_data_dict[i]
        if isinstance(i, int):
            sources = [sources]
        assert len(sources) == 1, "Don't know why it is wrapped to a list"  # FIXME

        image_processor = self.data_args.image_processor
        video_processor = self.data_args.video_processor

        num_frames = NUM_FRAMES if self.data_args.num_frames is None else self.data_args.num_frames

        if 'image' in sources[0]:
            image_file = self.list_data_dict[i]['image']
            image_folder = self.data_args.data_folder
            image_file = os.path.join(image_folder, image_file)

            try:
                image = process_image(image_file, image_processor, aspect_ratio=self.data_args.image_aspect_ratio)
            except Exception:
                # `Exception` rather than a bare except, so KeyboardInterrupt
                # and SystemExit are not swallowed by the fallback.
                traceback.print_exc()
                backup_idx = random.randint(0, len(self.list_data_dict) - 1)
                print(f"Encounted error when reading image {image_file}, use {backup_idx}-th example instead!!!")
                return self.__getitem__(backup_idx)

            # place <image> tag to question head.
            modal_token = "<image>"
            sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args, modal_token)
        elif 'video' in sources[0]:
            video_file = self.list_data_dict[i]['video']
            video_folder = self.data_args.data_folder
            video_file = os.path.join(video_folder, video_file)

            try:
                video = process_video(video_file, video_processor, aspect_ratio=self.data_args.image_aspect_ratio, num_frames=num_frames)
            except Exception:
                traceback.print_exc()
                backup_idx = random.randint(0, len(self.list_data_dict) - 1)
                print(f"Encounted error when reading video {video_file}, use {backup_idx}-th example instead!!!")
                return self.__getitem__(backup_idx)

            # place <video> tag to question head.
            modal_token = "<video>"
            sources = preprocess_multimodal(copy.deepcopy([e["conversations"] for e in sources]), self.data_args, modal_token)
        else:
            modal_token = None
            sources = copy.deepcopy([e["conversations"] for e in sources])

        if self.data_args.is_pretraining:
            data_dict = preprocess_plain(sources, self.tokenizer, modal_token=modal_token)
        else:
            data_dict = preprocess(sources, self.tokenizer, modal_token=modal_token)

        if isinstance(i, int):
            data_dict = dict(input_ids=data_dict["input_ids"][0], labels=data_dict["labels"][0])

        # image exist in the data
        if 'image' in self.list_data_dict[i]:
            data_dict['image'] = image
        elif 'video' in self.list_data_dict[i]:
            data_dict['video'] = video
        elif self.data_args.is_multimodal:
            # image does not exist in the data, but the model is multimodal
            data_dict['image'] = torch.zeros(3, self.data_args.image_size, self.data_args.image_size)
        return data_dict
345
+
346
+
347
@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning."""

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """Pad and truncate a list of samples into one batch.

        :param instances: Samples with ``input_ids`` and ``labels`` and,
            optionally, one entry per modality name (e.g. ``image``).
        :return: Dict with padded ``input_ids``, ``labels``, the matching
            ``attention_mask`` and ``images``, a list of
            ``(data, modal_name)`` tuples.
        """
        input_ids, labels = tuple([instance[key] for instance in instances]
                                  for key in ("input_ids", "labels"))
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids,
            batch_first=True,
            padding_value=self.tokenizer.pad_token_id)
        labels = torch.nn.utils.rnn.pad_sequence(labels,
                                                 batch_first=True,
                                                 padding_value=IGNORE_INDEX)
        input_ids = input_ids[:, :self.tokenizer.model_max_length]
        labels = labels[:, :self.tokenizer.model_max_length]
        batch = dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
        )

        # The modality names do not depend on the instance, so extract them
        # once. MODAL_TOKEN shape like: <image>, <video>, ...
        modal_names = []
        for modal_token in MODAL_INDEX_MAP.keys():
            found = re.findall(r'[<](.*)[>]', modal_token.lower())
            assert len(found) == 1
            modal_names.append(found[0])

        # work for 'images' argument in `prepare_inputs_labels_for_multimodal` of LlavaMetaForCausalLM in llava_arch.py
        batch['images'] = []
        for instance in instances:
            for modal_name in modal_names:
                if modal_name in instance:
                    batch['images'].append((instance[modal_name], modal_name))

        return batch
384
+
385
+
386
def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer,
                                data_args) -> Dict:
    """Make dataset and collator for supervised fine-tuning.

    :return: Keyword arguments for the trainer: ``train_dataset``,
        ``eval_dataset`` (always ``None``) and ``data_collator``.
    """
    dataset = LazySupervisedDataset(
        data_path=data_args.data_path,
        tokenizer=tokenizer,
        data_args=data_args,
    )
    collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
    return {
        "train_dataset": dataset,
        "eval_dataset": None,
        "data_collator": collator,
    }
398
+
399
+
400
def train(attn_implementation=None):
    """Run one training job configured from the command line.

    Parses the model, data and training arguments, builds the (optionally
    quantized and LoRA-wrapped) model and its tokenizer, initializes the
    vision modules, trains with ``VideoLLaMA2Trainer`` (resuming when the
    output directory already holds a ``checkpoint-*``), and saves the result.

    :param attn_implementation: Attention implementation stored on the model
        config; ignored for gemma2 model types, which always use ``'eager'``.
    """
    global local_rank
    set_seed(42)

    parser = transformers.HfArgumentParser((ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    local_rank = training_args.local_rank
    if training_args.fp16:
        compute_dtype = torch.float16
    elif training_args.bf16:
        compute_dtype = torch.bfloat16
    else:
        compute_dtype = torch.float32
    quantized = training_args.bits in [4, 8]

    bnb_model_from_pretrained_args = {}
    if quantized:
        from transformers import BitsAndBytesConfig
        # The load_in_4bit/load_in_8bit flags go into the quantization config
        # only: recent transformers versions raise a ValueError when they are
        # also passed as from_pretrained keyword arguments.
        bnb_model_from_pretrained_args["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=training_args.bits == 4,
            load_in_8bit=training_args.bits == 8,
            llm_int8_skip_modules=["mm_projector"],
            llm_int8_threshold=6.0,
            llm_int8_has_fp16_weight=False,
            bnb_4bit_compute_dtype=compute_dtype,
            bnb_4bit_use_double_quant=training_args.double_quant,
            bnb_4bit_quant_type=training_args.quant_type,  # {'fp4', 'nf4'}
            bnb_4bit_quant_storage=compute_dtype,
        )

    config = VLLMConfigs[model_args.model_type].from_pretrained(model_args.model_path, trust_remote_code=True)
    if 'gemma2' in model_args.model_type:
        config._attn_implementation = 'eager'
    else:
        config._attn_implementation = attn_implementation

    load_dtype = torch.bfloat16 if training_args.bf16 else None
    if model_args.vision_tower is not None:
        model = VLLMs[model_args.model_type].from_pretrained(
            model_args.model_path,
            config=config,
            cache_dir=training_args.cache_dir,
            torch_dtype=load_dtype,
            do_sample=True,
            **bnb_model_from_pretrained_args
        )
        if 'mixtral' in model_args.model_type:
            import deepspeed
            deepspeed.utils.set_z3_leaf_modules(model, [MixtralSparseMoeBlock])
    else:
        model = transformers.LlamaForCausalLM.from_pretrained(
            model_args.model_path,
            config=config,
            cache_dir=training_args.cache_dir,
            torch_dtype=load_dtype,
            do_sample=True,
            **bnb_model_from_pretrained_args
        )
    model.config.use_cache = False

    if model_args.freeze_backbone:
        model.model.requires_grad_(False)

    if quantized:
        from peft import prepare_model_for_kbit_training
        model.config.torch_dtype = (torch.float32 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
        model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing)

    if training_args.gradient_checkpointing:
        if hasattr(model, "enable_input_require_grads"):
            model.enable_input_require_grads()
        else:
            # Fallback: mark the embedding output as requiring grad via a hook.
            def make_inputs_require_grad(module, input, output):
                output.requires_grad_(True)
            model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)

    if training_args.lora_enable:
        from peft import LoraConfig, get_peft_model
        lora_config = LoraConfig(
            r=training_args.lora_r,
            lora_alpha=training_args.lora_alpha,
            target_modules=find_all_linear_names(model),
            lora_dropout=training_args.lora_dropout,
            bias=training_args.lora_bias,
            task_type="CAUSAL_LM",
        )
        if training_args.bits == 16:
            if training_args.bf16:
                model.to(torch.bfloat16)
            if training_args.fp16:
                model.to(torch.float16)
        rank0_print("Adding LoRA adapters...")
        model = get_peft_model(model, lora_config)

    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_args.model_path,
        cache_dir=training_args.cache_dir,
        model_max_length=training_args.model_max_length,
        padding_side="right",
        use_fast=True,
    )

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.unk_token

    if model_args.vision_tower is not None:
        # initialize vision encoder + multi-modal projector
        model.get_model().initialize_vision_modules(model_args=model_args, fsdp=training_args.fsdp)

        vision_tower = model.get_vision_tower()
        vision_tower.to(dtype=torch.bfloat16 if training_args.bf16 else torch.float16, device=training_args.device)

        # Hand the vision tower's processors to the dataset via data_args.
        data_args.image_size = vision_tower.image_size

        data_args.image_processor = vision_tower.image_processor
        data_args.video_processor = vision_tower.video_processor if hasattr(vision_tower, "video_processor") else vision_tower.image_processor

        data_args.is_multimodal = True

        model.config.image_aspect_ratio = data_args.image_aspect_ratio
        model.config.tokenizer_padding_side = tokenizer.padding_side
        model.config.tokenizer_model_max_length = tokenizer.model_max_length

        model.config.tune_mm_mlp_adapter = training_args.tune_mm_mlp_adapter = model_args.tune_mm_mlp_adapter
        if model_args.tune_mm_mlp_adapter:
            # Train the projector only.
            model.requires_grad_(False)
            for p in model.get_model().mm_projector.parameters():
                p.requires_grad = True

        # Tuning only the adapter selects the plain (pretraining) preprocessing.
        data_args.is_pretraining = True if model_args.tune_mm_mlp_adapter else False

        model.config.freeze_mm_mlp_adapter = training_args.freeze_mm_mlp_adapter
        if training_args.freeze_mm_mlp_adapter:
            for p in model.get_model().mm_projector.parameters():
                p.requires_grad = False

        if quantized:
            model.get_model().mm_projector.to(dtype=compute_dtype, device=training_args.device)

        model.config.mm_projector_lr = training_args.mm_projector_lr
        model.config.num_frames = NUM_FRAMES if data_args.num_frames is None else data_args.num_frames

    if quantized:
        from peft.tuners.lora import LoraLayer
        for name, module in model.named_modules():
            if isinstance(module, LoraLayer):
                if training_args.bf16:
                    module = module.to(torch.bfloat16)
            if 'norm' in name:
                module = module.to(torch.float32)
            if 'lm_head' in name or 'embed_tokens' in name:
                if hasattr(module, 'weight'):
                    if training_args.bf16 and module.weight.dtype == torch.float32:
                        module = module.to(torch.bfloat16)

    print("Current model:", model)
    data_module = make_supervised_data_module(tokenizer=tokenizer, data_args=data_args)
    # select a Trainer
    trainer = VideoLLaMA2Trainer(model=model, tokenizer=tokenizer, args=training_args, **data_module)

    # Resume when the output directory already contains a checkpoint.
    has_checkpoint = list(pathlib.Path(training_args.output_dir).glob("checkpoint-*"))
    if has_checkpoint:
        trainer.train(resume_from_checkpoint=True)
    else:
        trainer.train()
    trainer.save_state()

    model.config.use_cache = True

    if training_args.lora_enable:
        state_dict = get_peft_state_maybe_zero_3(model.named_parameters(), training_args.lora_bias)
        non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3(model.named_parameters())
        if training_args.local_rank in (0, -1):
            model.config.save_pretrained(training_args.output_dir)
            model.save_pretrained(training_args.output_dir, state_dict=state_dict)
            torch.save(non_lora_state_dict, os.path.join(training_args.output_dir, 'non_lora_trainables.bin'))
    else:
        safe_save_model_for_hf_trainer(trainer=trainer, output_dir=training_args.output_dir)
582
+
583
+
584
# Script entry point: run training with the default attention implementation.
if __name__ == "__main__":
    train()