allenai-OLMoE-1B-7B-0924

Runtime error

App Files Files Community

nisten committed on Sep 4, 2024

Commit

e9acdad

verified ·

1 Parent(s): e203e91

Update app.py

Browse files

Files changed (1) hide show

app.py +24 -14

app.py CHANGED Viewed

@@ -5,8 +5,8 @@ import subprocess
 import sys
 # Install required packages
-subprocess.check_call([sys.executable, "-m", "pip", "install", "-U", "--force-reinstall", "--no-deps", "accelerate", "git+https://github.com/Muennighoff/transformers.git@olmoe"])
-subprocess.run('pip install flash-attn --no-build-isolation --no-deps', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 from transformers import OlmoeForCausalLM, AutoTokenizer
@@ -18,11 +18,12 @@ try:
     model = OlmoeForCausalLM.from_pretrained(
         model_name,
         trust_remote_code=True,
-        torch_dtype=torch.bfloat16 if DEVICE == "cuda" else torch.float32,
         low_cpu_mem_usage=True,
         device_map="auto",
         _attn_implementation="flash_attention_2"  # Enable Flash Attention 2
     )
     tokenizer = AutoTokenizer.from_pretrained(model_name)
 except Exception as e:
     print(f"Error loading model: {e}")
@@ -45,13 +46,22 @@ def generate_response(message, history, temperature, max_new_tokens):
     full_prompt = chat_template.format(system_message=system_prompt, user_message=message)
     inputs = tokenizer(full_prompt, return_tensors="pt").to(DEVICE)
-    streamer = tokenizer.stream(inputs.input_ids, model, temperature=temperature, max_new_tokens=max_new_tokens)
-    collected_tokens = []
-    for token in streamer:
-        collected_tokens.append(token)
-        partial_text = tokenizer.decode(collected_tokens, skip_special_tokens=True)
-        yield partial_text.strip()
 css = """
   #output {
@@ -76,7 +86,7 @@ with gr.Blocks(css=css) as demo:
     def bot(history, temp, max_tokens):
         user_message = history[-1][0]
         bot_message = ""
-        for token in generate_response(user_message, history, temp, max_tokens):
             bot_message = token
             history[-1][1] = bot_message
             yield history
@@ -84,8 +94,8 @@ with gr.Blocks(css=css) as demo:
     msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
         bot, [chatbot, temperature, max_new_tokens], chatbot
     )
-    clear.click(lambda: None, None, chatbot, queue=True)
 if __name__ == "__main__":
-    demo.queue(api_open=True)
-    demo.launch(debug=True, show_api=True)

 import sys
 # Install required packages
+subprocess.check_call([sys.executable, "-m", "pip", "install", "-U", "--force-reinstall", "--no-deps", "einops" "accelerate", "git+https://github.com/Muennighoff/transformers.git@olmoe"])
+subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 from transformers import OlmoeForCausalLM, AutoTokenizer
     model = OlmoeForCausalLM.from_pretrained(
         model_name,
         trust_remote_code=True,
+        torch_dtype=torch.float16,  # Using float16 for lower precision
         low_cpu_mem_usage=True,
         device_map="auto",
         _attn_implementation="flash_attention_2"  # Enable Flash Attention 2
     )
+    model.gradient_checkpointing_enable()  # Enable gradient checkpointing
     tokenizer = AutoTokenizer.from_pretrained(model_name)
 except Exception as e:
     print(f"Error loading model: {e}")
     full_prompt = chat_template.format(system_message=system_prompt, user_message=message)
     inputs = tokenizer(full_prompt, return_tensors="pt").to(DEVICE)
+    try:
+        with torch.no_grad():
+            streamer = tokenizer.stream(inputs.input_ids, model, temperature=temperature, max_new_tokens=max_new_tokens)
+            collected_tokens = []
+            for token in streamer:
+                collected_tokens.append(token)
+                partial_text = tokenizer.decode(collected_tokens, skip_special_tokens=True)
+                yield partial_text.strip()
+    except RuntimeError as e:
+        if "CUDA out of memory" in str(e):
+            yield "GPU memory exceeded. Try reducing the max tokens or using a smaller model."
+        else:
+            yield f"An error occurred: {str(e)}"
+    except Exception as e:
+        yield f"An unexpected error occurred: {str(e)}"
 css = """
   #output {
     def bot(history, temp, max_tokens):
         user_message = history[-1][0]
         bot_message = ""
+        for token in generate_response(user_message, history[:-1], temp, max_tokens):
             bot_message = token
             history[-1][1] = bot_message
             yield history
     msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
         bot, [chatbot, temperature, max_new_tokens], chatbot
     )
+    clear.click(lambda: None, None, chatbot, queue=False)
 if __name__ == "__main__":
+    demo.queue(api_open=False, max_size=10)  # Limiting queue size
+    demo.launch(debug=True, show_api=True, share=False)  # Disabled sharing for security