import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    CLIPModel,
    CLIPProcessor,
)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hugging Face identifier shared by the tokenizer and the base language model.
model_name = "microsoft/Phi-3.5-mini-instruct"
# NOTE(review): trust_remote_code=True lets transformers execute Python code
# shipped in the model repository; keep it only for repositories you trust.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Load the CLIP model and its processor.
# NOTE(review): clipmodel is not moved to `device` here -- confirm whether it
# is moved elsewhere or intentionally kept on the CPU.
clipmodel = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clipprocessor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Local checkpoint directory; not used in this part of the file
# (presumably consumed by later loading/saving code -- verify).
model_location = "./checkpoints"

# Base Phi model, loaded with bfloat16 weights and placed on `device`.
base_phi_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
).to(device)

# Whisper model configuration
whisper_model_name = "openai/whisper-small"