from transformers import AutoModelForCausalLM, AutoTokenizer, StopStringCriteria, StoppingCriteriaList
import torch

# Load the tokenizer and model
repo_name = "nvidia/Hymba-1.5B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(repo_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_name, trust_remote_code=True)
model = model.cuda().to(torch.bfloat16)

# Chat with Hymba
prompt = input()

messages = [
    {"role": "system", "content": "You are a helpful assistant."}
]
messages.append({"role": "user", "content": prompt})

# Apply chat template
tokenized_chat = tokenizer.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
).to("cuda")

# Stop generation once the end-of-turn string is emitted
stopping_criteria = StoppingCriteriaList([StopStringCriteria(tokenizer=tokenizer, stop_strings="</s>")])

outputs = model.generate(
    tokenized_chat,
    max_new_tokens=256,
    do_sample=False,  # greedy decoding
    use_cache=True,
    stopping_criteria=stopping_criteria,
)

# Decode only the newly generated tokens
input_length = tokenized_chat.shape[1]
response = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)

print(f"Model response: {response}")
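
The snippet above covers a single exchange. For an interactive, multi-turn chat, one option is to append the assistant's reply to `messages`, then repeat the template/generate/decode steps in a loop. The sketch below is not part of the model card; it simply reuses `model`, `tokenizer`, `messages`, `stopping_criteria`, and `response` from the code above, and the exit-on-empty-input behavior is an illustrative assumption.

# Minimal multi-turn sketch (assumption: continues directly after the snippet above).
# Store the first reply, then keep alternating user/assistant turns.
messages.append({"role": "assistant", "content": response})

while True:
    prompt = input("You: ")
    if not prompt:
        break  # empty input ends the chat (illustrative convention)
    messages.append({"role": "user", "content": prompt})

    # Re-apply the chat template over the full conversation history each turn
    tokenized_chat = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    ).to("cuda")

    outputs = model.generate(
        tokenized_chat,
        max_new_tokens=256,
        do_sample=False,
        use_cache=True,
        stopping_criteria=stopping_criteria,
    )

    # Decode only the new tokens and add them to the history for the next turn
    response = tokenizer.decode(
        outputs[0][tokenized_chat.shape[1]:], skip_special_tokens=True
    )
    messages.append({"role": "assistant", "content": response})
    print(f"Model response: {response}")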