"""Caption a single image with a locally stored LLaVA-1.5 checkpoint.

The script makes the vendored LLaVA checkout importable, loads the
pretrained model once, builds the argument object expected by
``eval_model``, and prints the generated description of ``image_file``.
"""

import sys
from types import SimpleNamespace

# The llava package is imported from a local checkout, so its directory has
# to be on sys.path before the `llava.*` imports below run.
# Detailed model can be viewed at https://github.com/haotian-liu/LLaVA
sys.path.append("./Llava1.5/LLaVA")

from llava.model.builder import load_pretrained_model
from llava.mm_utils import get_model_name_from_path
from llava.eval.run_llava import eval_model

# Local checkpoint directory; the original author also noted the identifier
# "liuhaotian/llava-v1.5-7b" as an alternative value.
model_path = "./pretrained_model/llava-v1.5-7b"
prompt = "Please describe the people in the image, including their gender, age, clothing, facial expressions, and any other distinguishing features."
image_file = "./demo.png"

# Derive the model name once so the loader and the eval arguments cannot
# drift apart.
model_name = get_model_name_from_path(model_path)

# Optional loader keywords noted by the original author (left disabled):
# load_4bit=True, device="cuda".
tokenizer, model, image_processor, context_len = load_pretrained_model(
    model_path=model_path,
    model_base=None,
    model_name=model_name,
)

# Plain attribute container handed to eval_model; the field names are the
# ones the original script supplied.
args = SimpleNamespace(
    model_path=model_path,
    model_base=None,
    model_name=model_name,
    query=prompt,
    conv_mode=None,
    image_file=image_file,
    sep=",",
    temperature=0,
    top_p=None,
    num_beams=1,
    max_new_tokens=512,
)

# NOTE(review): this call passes the preloaded tokenizer/model/image_processor
# and uses the return value as the caption. That assumes the eval_model in the
# local ./Llava1.5/LLaVA checkout accepts these arguments and returns the
# generated text -- confirm against that checkout.
outputs = eval_model(args, tokenizer, model, image_processor)
print(f"The caption is: {outputs}")