Update app.py
app.py
CHANGED
@@ -1,130 +1,53 @@
 import gradio as gr
-from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
-import gc

-#
-model = None
-tokenizer = None

-
-def load_model():
-    global model, tokenizer
-    model_name = "Spestly/Athena-1-0.5B"  # Replace with a smaller or quantized model for better performance on CPU
-
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name,
-        low_cpu_mem_usage=True,
-        torch_dtype=torch.float32,  # Keep float32 for CPU usage
-        device_map="cpu"
-    )
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    model.eval()

-def generate_response(input_text, chat_history):
-
-    if model is None or tokenizer is None:
-        load_model()
-
-    try:
-        # Alpaca Chat Template
-        instruction = (
-            "You are an LLM called Athena. You are finetuned by Aayan Mishra (Spestly). Below is an instruction that describes a task. "
-            "Write a response that appropriately completes the request.\n\n"
-            f"### Instruction:\n{input_text}\n\n### Response:"
-        )
-        chat_history.append({"role": "user", "content": input_text})  # Add user input to chat history
-
-        # Tokenization
-        inputs = tokenizer(
-            instruction,
-            return_tensors="pt",
-            truncation=True,
-            max_length=256  # Limit input length for CPU performance
-        )
-
-        # Generate response
-        with torch.no_grad():
-            outputs = model.generate(
-                input_ids=inputs["input_ids"],
-                attention_mask=inputs["attention_mask"],
-                max_new_tokens=100,  # Limit the number of tokens generated
-                do_sample=True,
-                top_k=40,  # Adjust top_k for faster response
-                top_p=0.85,  # Adjust top_p for faster sampling
-                temperature=0.7,
-                pad_token_id=tokenizer.pad_token_id,
-                eos_token_id=tokenizer.eos_token_id,
-                repetition_penalty=1.2,
-                num_beams=1  # Use single beam for faster processing
-            )
-
-        # Decode response
-        response = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()
-        response = response.split("### Response:")[-1].strip()  # Extract model's response
-
-        # Update chat history
-        chat_history.append({"role": "assistant", "content": response})
-
-        # Manual garbage collection for CPU usage
-        gc.collect()
-        torch.cuda.empty_cache()
-
-        return chat_history
-
-    except Exception as e:
-        return chat_history + [{"role": "error", "content": f"Error: {str(e)}"}]
-
-# Gradio UI
-def render_chat(chat_history):
-    """
-    Render the chat history into a format that mimics the ChatGPT UI.
-    """
-    chat_ui = ""
-    for entry in chat_history:
-        if entry["role"] == "user":
-            chat_ui += f'<div class="user-message"><b>User:</b> {entry["content"]}</div>'
-        elif entry["role"] == "assistant":
-            chat_ui += f'<div class="assistant-message"><b>Athena:</b> {entry["content"]}</div>'
-        elif entry["role"] == "error":
-            chat_ui += f'<div class="error-message"><b>Error:</b> {entry["content"]}</div>'
-    return chat_ui
-
-with gr.Blocks(css="""
-body { background-color: #202123; color: white; font-family: 'Arial', sans-serif; margin: 0; padding: 0; }
-.chat-container { background-color: #333; border-radius: 10px; padding: 15px; max-height: 500px; overflow-y: auto; }
-.user-message { text-align: left; margin: 10px 0; padding: 10px; background-color: #444; border-radius: 10px; }
-.assistant-message { text-align: left; margin: 10px 0; padding: 10px; background-color: #555; border-radius: 10px; }
-.error-message { text-align: center; color: red; margin: 10px 0; padding: 10px; border: 1px solid red; border-radius: 10px; }
-.input-container { position: fixed; bottom: 0; width: 100%; background-color: #202123; padding: 15px; border-top: 1px solid #444; }
-.input-box { width: calc(100% - 30px); padding: 10px; border-radius: 10px; border: 1px solid #444; background-color: #333; color: white; }
-.submit-button { background-color: #10a37f; color: white; border: none; padding: 10px; border-radius: 10px; cursor: pointer; }
-.submit-button:hover { background-color: #0e8d69; }
-""") as demo:
-    gr.Markdown("<h1 style='text-align: center;'>Athena-1 1.5B</h1>")

-    ...

-    user_input = gr.Textbox(placeholder="Type your message here...", elem_id="input-box")
-    submit_button = gr.Button("Submit", elem_id="submit-button")

-    ...

-
-demo.launch()
 import gradio as gr
+from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch

+# Load model and tokenizer
+model_name = "Spestly/Athena-1-0.5B"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float32, low_cpu_mem_usage=True)

+# Set to evaluation mode
+model.eval()

+def generate_response(message, history):
+    instruction = (
+        "You are an LLM called Athena. You are finetuned by Aayan Mishra. You are NOT trained by Anthropic. "
+        "You are a Qwen 2.5 fine-tune. Your purpose is to help the user accomplish their request to the best of your abilities. "
+        "Below is an instruction that describes a task. Answer it clearly and concisely.\n\n"
+        f"### Instruction:\n{message}\n\n### Response:"
+    )

+    inputs = tokenizer(instruction, return_tensors="pt")

+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=100,
+            num_return_sequences=1,
+            temperature=0.7,
+            top_p=0.9,
+            do_sample=True
+        )

+    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
+    response = response.split("### Response:")[-1].strip()

+    return response
+
+iface = gr.ChatInterface(
+    generate_response,
+    chatbot=gr.Chatbot(height=600, type="messages"),
+    textbox=gr.Textbox(placeholder="Type your message here...", container=False, scale=7),
+    title="Athena-1",
+    description="Chat with Athena-1 0.5B",
+    theme="soft",
+    examples=[
+        "Can you give me a good salsa recipe?",
+        "What are Neural Networks?",
+        "What is the capital of Australia?",
+    ],
+    type="messages"
+)

+iface.launch()
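One detail worth flagging in the rewrite: iface.launch() runs at module import time. That is fine on Spaces, which executes app.py directly, but it also means any import of the module (tests, a notebook) starts the server. A common guard, shown here as a suggestion rather than as part of the commit:

# Optional: only launch when run as a script.
# Spaces runs app.py as __main__, so behavior there is unchanged.
if __name__ == "__main__":
    iface.launch()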
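Also note that with type="messages" on both the Chatbot and the ChatInterface, Gradio passes history to generate_response as a list of {"role": ..., "content": ...} dicts. The new code ignores it, so each reply sees only the latest message. If multi-turn context were wanted, a sketch along these lines could fold prior turns into the prompt (build_prompt is a hypothetical helper; how earlier turns map onto the Alpaca-style template is an assumption, not something the diff specifies):

def build_prompt(message, history):
    # history entries look like {"role": "user" | "assistant", "content": "..."}
    turns = ""
    for entry in history:
        label = "Instruction" if entry["role"] == "user" else "Response"
        turns += f"### {label}:\n{entry['content']}\n\n"
    return turns + f"### Instruction:\n{message}\n\n### Response:"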