import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer

# Load the pre-trained model and tokenizer
model_name = "Crystalcareai/Quiet-Star-Custom"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    ignore_mismatched_sizes=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Attach the tokenizer to the model (used by this model's custom generation code)
model.tokenizer = tokenizer

# Move the model to GPU if one is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Wrap the prompt in the Mistral-style instruction template
prompt_template = "[INST] {prompt} [/INST]"
prompt = "This is a reasoning problem. You're standing on the surface of the Earth. " \
         "You walk one mile south, one mile west and one mile north. " \
         "You end up exactly where you started. Where EXACTLY on Earth are you?"

# Tokenize the templated prompt and build the matching attention mask
inputs = tokenizer(prompt_template.format(prompt=prompt), return_tensors="pt").to(device)
input_ids = inputs.input_ids
attention_mask = inputs.attention_mask

# Stream tokens to stdout as they are generated
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# Generate the output using the generate method
with torch.no_grad():
    generated_outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=1024,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        use_cache=True,
        do_sample=True,  # sampling must be enabled for temperature to take effect
        temperature=0.2,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        return_dict_in_generate=True,
        streamer=streamer,
    )

# Decode the generated output
generated_text = tokenizer.decode(generated_outputs.sequences[0], skip_special_tokens=True)

# Print the generated output
print("Generated output:")
print(generated_text)