nisten committed · verified
Commit 32720ee · 1 Parent(s): 565c0e9

Update app.py

Files changed (1):
  1. app.py +21 -14
app.py CHANGED
@@ -10,7 +10,7 @@ subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENT
 
 from transformers import OlmoeForCausalLM, AutoTokenizer
 
-model_name = "allenai/OLMoE-1B-7B-0924"
+model_name = "allenai/OLMoE-1B-7B-0924-Instruct"
 
 # Wrap model loading in a try-except block to handle potential errors
 try:
@@ -22,7 +22,7 @@ try:
         low_cpu_mem_usage=True,
         device_map="auto",
         _attn_implementation="flash_attention_2"  # Enable Flash Attention 2
-    )
+    ).to(DEVICE)
     model.gradient_checkpointing_enable()  # Enable gradient checkpointing
     tokenizer = AutoTokenizer.from_pretrained(model_name)
 except Exception as e:
@@ -35,26 +35,33 @@ system_prompt = ("Adopt the persona of hilariously pissed off Andrej Karpathy "
                  "while always answering questions in full first principles analysis type of thinking "
                  "without using any analogies and always showing full working code or output in his answers.")
 
-chat_template = "<|system|>{system_message}<|end|><|user|>{user_message}<|end|><|assistant|>"
-
 @spaces.GPU
 def generate_response(message, history, temperature, max_new_tokens):
     if model is None or tokenizer is None:
         yield "Model or tokenizer not loaded properly. Please check the logs."
         return
 
-    full_prompt = chat_template.format(system_message=system_prompt, user_message=message)
-    inputs = tokenizer(full_prompt, return_tensors="pt").to(DEVICE)
+    messages = [{"role": "system", "content": system_prompt}]
+    for user_msg, assistant_msg in history:
+        messages.append({"role": "user", "content": user_msg})
+        if assistant_msg:
+            messages.append({"role": "assistant", "content": assistant_msg})
+    messages.append({"role": "user", "content": message})
+
+    inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(DEVICE)
 
     try:
         with torch.no_grad():
-            streamer = tokenizer.stream(inputs.input_ids, model, temperature=temperature, max_new_tokens=max_new_tokens)
-
-            collected_tokens = []
-            for token in streamer:
-                collected_tokens.append(token)
-                partial_text = tokenizer.decode(collected_tokens, skip_special_tokens=True)
-                yield partial_text.strip()
+            generated_ids = model.generate(
+                inputs,
+                max_new_tokens=max_new_tokens,
+                do_sample=True,
+                temperature=temperature,
+                eos_token_id=tokenizer.eos_token_id,
+            )
+
+            generated_text = tokenizer.decode(generated_ids[0, inputs.shape[1]:], skip_special_tokens=True)
+            yield generated_text.strip()
     except RuntimeError as e:
         if "CUDA out of memory" in str(e):
             yield "GPU memory exceeded. Try reducing the max tokens or using a smaller model."
@@ -97,5 +104,5 @@ with gr.Blocks(css=css) as demo:
     clear.click(lambda: None, None, chatbot, queue=False)
 
 if __name__ == "__main__":
-    demo.queue(api_open=False, max_size=10)  # Limiting queue size
+    demo.queue(api_open=True, max_size=10)  # Limiting queue size
     demo.launch(debug=True, show_api=True, share=False)  # Disabled sharing for security
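
Note on the generation change: the removed code called tokenizer.stream(...), which is not a real transformers tokenizer method, and the replacement uses a single blocking model.generate() call, so the response is yielded only once generation finishes. If incremental output is wanted back, the sketch below shows one way to do it with the real TextIteratorStreamer API. It is an assumption layered on top of this commit, not part of it: the function name generate_response_streaming is hypothetical, and model, tokenizer, DEVICE, and system_prompt are reused exactly as defined in app.py.

from threading import Thread
from transformers import TextIteratorStreamer

def generate_response_streaming(message, history, temperature, max_new_tokens):
    # Hypothetical streaming variant of generate_response(); relies on the
    # model, tokenizer, DEVICE, and system_prompt already set up in app.py.
    messages = [{"role": "system", "content": system_prompt}]
    for user_msg, assistant_msg in history:
        messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    inputs = tokenizer.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
    ).to(DEVICE)

    # skip_prompt=True makes the streamer emit only newly generated text,
    # not the prompt tokens.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generation_kwargs = dict(
        inputs=inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=temperature,
        eos_token_id=tokenizer.eos_token_id,
        streamer=streamer,
    )

    # model.generate() blocks until it finishes, so run it in a background
    # thread and read decoded text chunks from the streamer in this one.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text.strip()
    thread.join()

The Gradio wiring should not need to change for this variant, since it yields progressively longer strings the same way generate_response does.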