Text Generation · Transformers · Safetensors · llama · text-generation-inference · Inference Endpoints
mfromm committed (verified)
Commit 78ae529 · 1 Parent(s): 6fb58c2

Update README.md

Files changed (1)
  1. README.md +29 -7
README.md CHANGED
@@ -89,14 +89,36 @@ prompt = f"System: {system_messages[lang_code]}\nUser: {user}\nAssistant:<s>"
  import torch
  from transformers import AutoModelForCausalLM, AutoTokenizer

- model_name = "openGPT-X/Teuken-7B-instruct-v0.4"
+
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
- tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
- model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.bfloat16).to(device)
- inputs = tokenizer(prompt, return_tensors="pt")
- inputs = {k: v.to(device) for k, v in inputs.items()}  # Move inputs to the same device as the model
- output = model.generate(input_ids=inputs['input_ids'], max_new_tokens=1000, do_sample=True)
- result = tokenizer.decode(output.tolist())
+
+ model_name = "openGPT-X/Teuken-7B-instruct-v0.4"
+ model = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     trust_remote_code=True,
+     torch_dtype=torch.bfloat16,
+     attn_implementation="flash_attention_2",
+ )
+ model = model.to(device).eval()
+ tokenizer = AutoTokenizer.from_pretrained(
+     model_name,
+     use_fast=False,
+     trust_remote_code=True,
+ )
+
+ messages = [{"role": "User", "content": "Wer bist du?"}]
+ prompt_ids = tokenizer.apply_chat_template(messages, chat_template="DE", tokenize=True, add_generation_prompt=True, return_tensors="pt")
+ prediction = model.generate(
+     prompt_ids.to(model.device),
+     max_length=512,
+     do_sample=True,
+     top_k=50,
+     top_p=0.95,
+     temperature=0.7,
+     num_return_sequences=1,
+ )
+ prediction_text = tokenizer.decode(prediction[0])
+ print(prediction_text)
  ```

  This example demonstrates how to load the model and tokenizer, prepare input, generate text, and print the result.
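
For reference, here is the added example assembled into a standalone script (a sketch for convenience, not part of the commit itself). It assumes the flash-attn package is installed for `attn_implementation="flash_attention_2"`; if it is not available, that argument can simply be dropped.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "openGPT-X/Teuken-7B-instruct-v0.4"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the model in bfloat16. flash_attention_2 assumes the flash-attn package
# and a supported GPU; omit the argument to use the default attention backend.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
).to(device).eval()

# The slow tokenizer is used, matching the updated README snippet.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    use_fast=False,
    trust_remote_code=True,
)

# Build the prompt with the German ("DE") chat template shipped with the tokenizer.
messages = [{"role": "User", "content": "Wer bist du?"}]
prompt_ids = tokenizer.apply_chat_template(
    messages,
    chat_template="DE",
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
)

# Sample one response and decode it.
prediction = model.generate(
    prompt_ids.to(model.device),
    max_length=512,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.7,
    num_return_sequences=1,
)
print(tokenizer.decode(prediction[0]))
```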