from typing import Any, Dict, List

from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel


class EndpointHandler:
    def __init__(self, path: str = ""):
        # `path` is the local repository path that Inference Endpoints passes
        # in; it is unused here because both the base model and the adapter
        # are pulled directly from the Hub.

        # Load the tokenizer for the base model
        self.tokenizer = AutoTokenizer.from_pretrained(
            "meta-llama/Llama-3.2-1B-Instruct",
            trust_remote_code=True,
        )

        # Load the base model, letting accelerate place weights across devices
        base_model = AutoModelForCausalLM.from_pretrained(
            "meta-llama/Llama-3.2-1B-Instruct",
            device_map="auto",
            trust_remote_code=True,
        )

        # Apply the LoRA adapter on top of the base model
        self.model = PeftModel.from_pretrained(
            base_model,
            "jplhughes2/llama-3.2-1b-af-lora-adapters",
            device_map="auto",
        )

        # Create the text-generation pipeline used at inference time
        self.generator = pipeline(
            "text-generation",
            model=self.model,
            tokenizer=self.tokenizer,
        )

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        # Parse the request payload, falling back to sensible defaults
        prompt = data.get("inputs", "")
        max_new_tokens = data.get("max_new_tokens", 128)
        temperature = data.get("temperature", 0.7)
        top_p = data.get("top_p", 0.9)

        # Generate; the pipeline returns a list of
        # {"generated_text": ...} dicts because return_full_text=False strips
        # the prompt from the output
        outputs = self.generator(
            prompt,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            return_full_text=False,
        )
        return outputs
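
# A minimal local smoke test, a sketch only: it assumes you have access to
# the gated meta-llama base model and the adapter repo above, and enough
# memory to load the 1B model. The payload keys ("inputs", "max_new_tokens",
# "temperature") mirror the ones parsed in __call__; the example prompt is
# purely illustrative.
if __name__ == "__main__":
    handler = EndpointHandler()
    result = handler({
        "inputs": "Explain LoRA adapters in one sentence.",
        "max_new_tokens": 64,
        "temperature": 0.7,
    })
    # Expected shape: [{"generated_text": "..."}]
    print(result)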