Text Generation · Transformers · Safetensors · llama · text-generation-inference · Inference Endpoints
mfromm committed (verified)
Commit 78ae529 · 1 Parent(s): 6fb58c2

Update README.md

Files changed (1)
  1. README.md +29 -7
README.md CHANGED
@@ -89,14 +89,36 @@ prompt = f"System: {system_messages[lang_code]}\nUser: {user}\nAssistant:<s>"
  import torch
  from transformers import AutoModelForCausalLM, AutoTokenizer

- model_name = "openGPT-X/Teuken-7B-instruct-v0.4"
+
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
- model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.bfloat16).to(device)
- inputs = tokenizer(prompt, return_tensors="pt")
- inputs = {k: v.to(device) for k, v in inputs.items()}  # Move inputs to the same device as the model
- output = model.generate(input_ids=inputs['input_ids'], max_new_tokens=1000, do_sample=True)
- result = tokenizer.decode(output.tolist())
+
+ model_name = "openGPT-X/Teuken-7B-instruct-v0.4"
+ model = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     trust_remote_code=True,
+     torch_dtype=torch.bfloat16,
+     attn_implementation="flash_attention_2",
+ )
+ model = model.to(device).eval()
+ tokenizer = AutoTokenizer.from_pretrained(
+     model_name,
+     use_fast=False,
+     trust_remote_code=True,
+ )
+
+ messages = [{"role": "User", "content": "Wer bist du?"}]
+ prompt_ids = tokenizer.apply_chat_template(messages, chat_template="DE", tokenize=True, add_generation_prompt=True, return_tensors="pt")
+ prediction = model.generate(
+     prompt_ids.to(model.device),
+     max_length=512,
+     do_sample=True,
+     top_k=50,
+     top_p=0.95,
+     temperature=0.7,
+     num_return_sequences=1,
+ )
+ prediction_text = tokenizer.decode(prediction[0])
+ print(prediction_text)
  ```

  This example demonstrates how to load the model and tokenizer, prepare input, generate text, and print the result.
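
For reference, here is the added example assembled into a standalone script (a sketch for convenience, not part of the commit itself). It assumes the flash-attn package is installed for `attn_implementation="flash_attention_2"`; if it is not available, that argument can simply be dropped.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "openGPT-X/Teuken-7B-instruct-v0.4"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model in bfloat16. flash_attention_2 assumes the flash-attn package
# and a supported GPU; omit the argument to use the default attention backend.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
).to(device).eval()

# The slow tokenizer is used, matching the updated README snippet.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=False,
    trust_remote_code=True,
)

# Build the prompt with the German ("DE") chat template shipped with the tokenizer.
messages = [{"role": "User", "content": "Wer bist du?"}]
prompt_ids = tokenizer.apply_chat_template(
    messages,
    chat_template="DE",
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
)

# Sample one response and decode it.
prediction = model.generate(
    prompt_ids.to(model.device),
    max_length=512,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.7,
    num_return_sequences=1,
)
print(tokenizer.decode(prediction[0]))
```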