import gc

import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, TextStreamer

model_path = "Crystalcareai/Quiet-Star-Custom"

# Load the model configuration, capping the context window and disabling the KV cache
config = AutoConfig.from_pretrained(
    model_path,
    max_position_embeddings=2048,
    use_cache=False,
    trust_remote_code=True,
)

# Load the model in bfloat16 and let accelerate place it across available devices
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    config=config,
    device_map="auto",
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model.tokenizer = tokenizer  # the custom modeling code expects a tokenizer on the model instance

# Stream tokens to stdout as they are generated
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)

# Build the prompt in the model's instruction format and convert it to tokens
prompt_template = "[INST] {prompt} [/INST]"
prompt = (
    "You're standing on the surface of the Earth. "
    "You walk one mile south, one mile west and one mile north. "
    "You end up exactly where you started. Where are you?"
)
input_ids = tokenizer(
    prompt_template.format(prompt=prompt),
    return_tensors="pt",
).input_ids.to(model.device)  # place inputs on the same device as the model

# Generate a single sampled completion
generation_output = model.generate(
    input_ids,
    max_length=1024,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    num_return_sequences=1,
    streamer=streamer,
)

# Decode the full output (prompt plus completion)
generated_text = tokenizer.batch_decode(generation_output, skip_special_tokens=True)[0]
print(generated_text)
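
# Optional cleanup, a minimal sketch: after generation, dropping references to the
# model and outputs, running the garbage collector, and clearing the CUDA allocator
# cache frees GPU memory for subsequent runs. This assumes no further use of the
# model in the same process; the empty_cache() call is guarded for CPU-only setups.
del generation_output
del model
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()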