marks committed on
Commit 1a61dc5 · 1 Parent(s): bb9a4a3

Added Docker file and chat

Files changed (2)
  1. Dockerfile +4 -0
  2. chat.py +33 -0
Dockerfile ADDED
@@ -0,0 +1,4 @@
+ FROM nvidia/cuda:12.6.2-cudnn-devel-ubuntu22.04
+ RUN wget --header="Authorization: Bearer YOUR_HF_TOKEN" https://huggingface.co/nvidia/Hymba-1.5B-Base/resolve/main/setup.sh
+ RUN bash setup.sh
+ CMD [ "python", "chat.py" ]
chat.py ADDED
@@ -0,0 +1,33 @@
+ from transformers import AutoModelForCausalLM, AutoTokenizer, StopStringCriteria, StoppingCriteriaList
+ import torch
+
+ # Load the tokenizer and model
+ repo_name = "nvidia/Hymba-1.5B-Instruct"
+
+ tokenizer = AutoTokenizer.from_pretrained(repo_name, trust_remote_code=True)
+ model = AutoModelForCausalLM.from_pretrained(repo_name, trust_remote_code=True)
+ model = model.cuda().to(torch.bfloat16)
+
+ # Chat with Hymba
+ prompt = input()
+
+ messages = [
+     {"role": "system", "content": "You are a helpful assistant."}
+ ]
+ messages.append({"role": "user", "content": prompt})
+
+ # Apply chat template
+ tokenized_chat = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to('cuda')
+ stopping_criteria = StoppingCriteriaList([StopStringCriteria(tokenizer=tokenizer, stop_strings="</s>")])
+ outputs = model.generate(
+     tokenized_chat,
+     max_new_tokens=256,
+     do_sample=False,
+     temperature=0.7,
+     use_cache=True,
+     stopping_criteria=stopping_criteria
+ )
+ input_length = tokenized_chat.shape[1]
+ response = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)
+
+ print(f"Model response: {response}")
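The committed chat.py reads one prompt from stdin, generates a single reply, and exits, which matches the Dockerfile's CMD. For readers who want an interactive session, the sketch below shows one way it could be extended into a multi-turn loop; it is not part of this commit. The repo name, bfloat16 dtype, generation settings, and "</s>" stopping string are copied from chat.py above, while the loop structure, the "You: " input prompt, and the "quit" sentinel are illustrative assumptions.

import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    StoppingCriteriaList,
    StopStringCriteria,
)

# Same model and setup as the committed chat.py
repo_name = "nvidia/Hymba-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(repo_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(repo_name, trust_remote_code=True)
model = model.cuda().to(torch.bfloat16)

stopping_criteria = StoppingCriteriaList(
    [StopStringCriteria(tokenizer=tokenizer, stop_strings="</s>")]
)
messages = [{"role": "system", "content": "You are a helpful assistant."}]

while True:
    prompt = input("You: ")
    if prompt.strip().lower() == "quit":  # hypothetical exit sentinel, not in the commit
        break
    messages.append({"role": "user", "content": prompt})

    # Re-apply the chat template over the full conversation history each turn
    tokenized_chat = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    ).to("cuda")
    outputs = model.generate(
        tokenized_chat,
        max_new_tokens=256,
        do_sample=False,  # greedy decoding, as in chat.py; temperature has no effect without sampling
        use_cache=True,
        stopping_criteria=stopping_criteria,
    )
    # Decode only the newly generated tokens, as chat.py does
    response = tokenizer.decode(
        outputs[0][tokenized_chat.shape[1]:], skip_special_tokens=True
    )
    print(f"Model response: {response}")

    # Keep the assistant reply in the history so later turns have context
    messages.append({"role": "assistant", "content": response})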