Spaces: Running on Zero

macadeliccc committed
Commit · f316cfc
1 Parent(s): 86efe77
change to starling-LM-7B

app.py CHANGED
@@ -1,117 +1,55 @@
import spaces
import gradio as gr
import torch
-import subprocess
-import aiohttp
from gradio import State
-import
-import json
-import asyncio
-import threading
-import time
+from transformers import AutoTokenizer, AutoModelForCausalLM

-# Function to start the ochat server
-@spaces.GPU
-def start_ochat_server():
-    print(f"Is CUDA available: {torch.cuda.is_available()}")
-    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
-
-    command = [
-        "python", "-m", "ochat.serving.openai_api_server",
-        "--model", "openchat/openchat_3.5"
-    ]
-
-    # Start the server in a separate process
-    try:
-        subprocess.Popen(command)
-        return "ochat server started successfully"
-    except Exception as e:
-        return f"Failed to start ochat server: {e}"
-
-start_ochat_server()
-
-# async def is_server_running():
-#     async with aiohttp.ClientSession() as session:
-#         try:
-#             async with session.get("http://localhost:18888/v1/chat/completions") as response:
-#                 return response.status == 200 or response.status == 400 or response.status == 422
-#         except aiohttp.ClientError:
-#             return False
-
-# async def monitor_server():
-#     # Wait for 5 minutes before starting to monitor
-#     await asyncio.sleep(300)
-
-#     while True:
-#         if not await is_server_running():
-#             print("Server is not running. Attempting to restart...")
-#             start_ochat_server()
-
-#         await asyncio.sleep(60)
-
-# def run_async_monitor():
-#     time.sleep(120)
-#     loop = asyncio.new_event_loop()
-#     asyncio.set_event_loop(loop)
-#     loop.run_until_complete(monitor_server())
-#     loop.close()

-#
-
-

-#
-
-    base_url = "http://localhost:18888"
-    chat_url = f"{base_url}/v1/chat/completions"
-    headers = {"Content-Type": "application/json"}
-    data = {
-        "model": "openchat_3.5",
-        "messages": [{"role": "user", "content": message}]
-    }

-
-
-
-                if response.status == 200:
-                    response_data = await response.json()
-                    return response_data['choices'][0]['message']['content']
-                else:
-                    return f"Error: Server responded with status code {response.status}"
-    except aiohttp.ClientError as e:
-        return f"Error: {e}"
-
-# Create a Gradio Blocks interface with session state
-with gr.Blocks(theme=gr.themes.Soft()) as app:
-    gr.Markdown("## vLLM OpenChat-3.5 Interface")
-    gr.Markdown("### the vLLM server cannot handle concurrent users in spaces. If you get an error, run it on docker.")
-    gr.Markdown("This will run better on your own machine: ```docker run -it -p 7860:7860 --platform=linux/amd64 --gpus all \
-    registry.hf.space/macadeliccc-openchat-3-5-chatbot:latest python app.py```")

-
-
-
-
-
-
-
-    async def user(message, history):
-        return "", history + [[message, None]]
-
-
-    async def bot(history):
-        if history and history[-1] and history[-1][0]:
-            user_message = history[-1][0]
-            bot_response = await chat_with_ochat(user_message)
-            history[-1][1] = bot_response  # Update the last entry with the bot's response
-        return history
-
-    message.submit(user, [message, chatbot], [message, chatbot], queue=True).then(
-        bot, chatbot, chatbot
-    )
-    clear.click(lambda: None, None, chatbot, queue=False)



-app.queue()
+# Load the tokenizer and model
+tokenizer = AutoTokenizer.from_pretrained("berkeley-nest/Starling-LM-7B-alpha")
+model = AutoModelForCausalLM.from_pretrained("berkeley-nest/Starling-LM-7B-alpha")
+
+# Ensure the model is in evaluation mode
+model.eval()
+
+# Move model to GPU if available
+if torch.cuda.is_available():
+    model = model.to("cuda")
+
+@spaces.GPU
+def generate_response(user_input, chat_history):
+    prompt = "GPT4 Correct User: " + user_input + "GPT4 Correct Assistant: "
+    if chat_history:
+        prompt = chat_history + prompt
+    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
+
+    # Move tensors to the same device as model
+    inputs = {k: v.to(model.device) for k, v in inputs.items()}
+
+    with torch.no_grad():
+        output = model.generate(**inputs, max_length=1024, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
+    response = tokenizer.decode(output[0], skip_special_tokens=True)
+
+    # Update chat history
+    new_history = prompt + response
+    return response, new_history
+
+# Gradio Interface
+def clear_chat():
+    return "", ""
+
+with gr.Blocks(gr.themes.Soft()) as app:
+    with gr.Row():
+        with gr.Column():
+            user_input = gr.Textbox(label="Your Message", placeholder="Type your message here...")
+            send = gr.Button("Send")
+            clear = gr.Button("Clear")
+        with gr.Column():
+            chatbot = gr.Chatbot()
+
+    chat_history = gr.State()  # Holds the chat history
+
+    send.click(generate_response, inputs=[user_input, chat_history], outputs=[chatbot, chat_history])
+    clear.click(clear_chat, outputs=[chatbot, chat_history])

app.launch()
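
For context, a minimal sketch of how the new generate_response helper chains turns when called outside the Gradio UI. This is illustrative only and not part of the commit; the example questions and the empty starting history are assumptions.

# Hypothetical usage sketch (not in the commit): chain two turns manually,
# assuming the tokenizer and model above loaded successfully.
history = ""
reply, history = generate_response("What is the capital of France?", history)
print(reply)

# The accumulated history string is prepended to the next prompt inside generate_response.
reply, history = generate_response("Name one museum there.", history)
print(reply)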