import gc

import torch
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer, TextStreamer

model_path = "Crystalcareai/Quiet-Star-Custom"

# Load the model configuration, capping the context window and disabling the KV cache
config = AutoConfig.from_pretrained(
    model_path,
    max_position_embeddings=2048,
    use_cache=False,
    trust_remote_code=True,
)

# Load the model in bfloat16 and let accelerate place it across available devices
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    config=config,
    device_map="auto",
    low_cpu_mem_usage=True,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model.tokenizer = tokenizer  # the custom modeling code expects a tokenizer on the model instance

# Stream tokens to stdout as they are generated
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)

# Build the prompt in the model's instruction format and convert it to tokens
prompt_template = "[INST] {prompt} [/INST]"
prompt = (
    "You're standing on the surface of the Earth. "
    "You walk one mile south, one mile west and one mile north. "
    "You end up exactly where you started. Where are you?"
)
input_ids = tokenizer(
    prompt_template.format(prompt=prompt),
    return_tensors="pt",
).input_ids.to(model.device)  # place inputs on the same device as the model

# Generate a single sampled completion
generation_output = model.generate(
    input_ids,
    max_length=1024,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    num_return_sequences=1,
    streamer=streamer,
)

# Decode the full output (prompt plus completion)
generated_text = tokenizer.batch_decode(generation_output, skip_special_tokens=True)[0]
print(generated_text)
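
# Optional cleanup, a minimal sketch: after generation, dropping references to the
# model and outputs, running the garbage collector, and clearing the CUDA allocator
# cache frees GPU memory for subsequent runs. This assumes no further use of the
# model in the same process; the empty_cache() call is guarded for CPU-only setups.
del generation_output
del model
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()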