Video-Text-to-Text
Transformers
Safetensors
English
llava
text-generation
multimodal
Eval Results
Inference Endpoints
mfarre HF staff committed on
Commit
e0b7f7a
·
verified ·
1 Parent(s): c0a6bab

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +3 -3
README.md CHANGED
@@ -170,7 +170,7 @@ import warnings
170
  from decord import VideoReader, cpu
171
  import numpy as np
172
  warnings.filterwarnings("ignore")
173
- def load_video(self, video_path, max_frames_num,fps=1,force_sample=False):
174
  if max_frames_num == 0:
175
  return np.zeros((1, 336, 336, 3))
176
  vr = VideoReader(video_path, ctx=cpu(0),num_threads=1)
@@ -195,9 +195,9 @@ device_map = "auto"
195
  tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, torch_dtype="bfloat16", device_map=device_map) # Add any other thing you want to pass in llava_model_args
196
  model.eval()
197
  video_path = "XXXX"
198
- max_frames_num = "64"
199
  video,frame_time,video_time = load_video(video_path, max_frames_num, 1, force_sample=True)
200
- video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda().bfloat16()
201
  video = [video]
202
  conv_template = "qwen_1_5" # Make sure you use correct chat template for different models
203
  time_instruciton = f"The video lasts for {video_time:.2f} seconds, and {len(video[0])} frames are uniformly sampled from it. These frames are located at {frame_time}.Please answer the following questions related to this video."
 
170
  from decord import VideoReader, cpu
171
  import numpy as np
172
  warnings.filterwarnings("ignore")
173
+ def load_video(video_path, max_frames_num,fps=1,force_sample=False):
174
  if max_frames_num == 0:
175
  return np.zeros((1, 336, 336, 3))
176
  vr = VideoReader(video_path, ctx=cpu(0),num_threads=1)
 
195
  tokenizer, model, image_processor, max_length = load_pretrained_model(pretrained, None, model_name, torch_dtype="bfloat16", device_map=device_map) # Add any other thing you want to pass in llava_model_args
196
  model.eval()
197
  video_path = "XXXX"
198
+ max_frames_num = 64
199
  video,frame_time,video_time = load_video(video_path, max_frames_num, 1, force_sample=True)
200
+ video = image_processor.preprocess(video, return_tensors="pt")["pixel_values"].cuda().half()
201
  video = [video]
202
  conv_template = "qwen_1_5" # Make sure you use correct chat template for different models
203
  time_instruciton = f"The video lasts for {video_time:.2f} seconds, and {len(video[0])} frames are uniformly sampled from it. These frames are located at {frame_time}.Please answer the following questions related to this video."