import spaces
import gradio as gr
import torch
import subprocess
import requests

# Function to start the ochat server
@spaces.GPU
def start_ochat_server():
    print(f"Is CUDA available: {torch.cuda.is_available()}")
    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")

    command = [
        "python", "-m", "ochat.serving.openai_api_server", 
        "--model", "openchat/openchat_3.5"
    ]

    # Start the server in a separate process
    try:
        subprocess.Popen(command)
        return "ochat server started successfully"
    except Exception as e:
        return f"Failed to start ochat server: {e}"

start_ochat_server()


# Function to check if the server is up
def is_server_up(url):
    try:
        response = requests.get(url, timeout=5)
        return response.status_code == 200
    except requests.RequestException:
        return False
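
# Illustrative addition (not part of the original app): subprocess.Popen returns
# immediately, so the API server may still be loading the model when the first
# request arrives. A small polling sketch like this could block until
# is_server_up() succeeds or a deadline passes; the names here are assumptions.
def wait_for_server(url, timeout_seconds=300, poll_interval=5):
    import time
    deadline = time.time() + timeout_seconds
    while time.time() < deadline:
        if is_server_up(url):
            return True
        time.sleep(poll_interval)  # Not ready yet; retry after a short pause
    return False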

# Function to send a message to the ochat server and get a response.
# This is a plain (synchronous) function: requests is blocking, and bot()
# below calls it directly, so an async def would return an un-awaited coroutine.
def chat_with_ochat(message):
    base_url = "http://localhost:18888"
    chat_url = f"{base_url}/v1/chat/completions"
    headers = {"Content-Type": "application/json"}
    data = {
        "model": "openchat_3.5",
        "messages": [{"role": "user", "content": message}]
    }

    # Check if server is up
    if not is_server_up(base_url):
        return "Error: oChat server is not running."

    try:
        response = requests.post(chat_url, json=data, headers=headers, timeout=120)
        if response.status_code == 200:
            return response.json()['choices'][0]['message']['content']
        else:
            return f"Error: Server responded with status code {response.status_code}"
    except requests.RequestException as e:
        return f"Error: {e}"

# Create a Gradio Blocks interface
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("## vLLM OpenChat-3.5 Interface")
    gr.Markdown("### The vLLM server cannot handle concurrent users in Spaces. If you get an error, run it with Docker instead.")
    gr.Markdown(
        "This will run better on your own machine:\n"
        "```\n"
        "docker run -it -p 7860:7860 --platform=linux/amd64 --gpus all "
        "registry.hf.space/macadeliccc-openchat-3-5-chatbot:latest python app.py\n"
        "```"
    )

    message = gr.Textbox(label="Your Message", placeholder="Type your message here")
    chatbot = gr.Chatbot()  # The Chatbot component itself holds the conversation history
    clear = gr.Button("Clear")

    # Append the user's message to the history and clear the input box
    def user(message, history):
        return "", history + [[message, None]]

    # Fill in the bot's reply for the most recent user message
    def bot(history):
        if history and history[-1] and history[-1][0]:
            user_message = history[-1][0]
            bot_response = chat_with_ochat(user_message)
            history[-1][1] = bot_response  # Update the last entry with the bot's response
        return history

    # On submit: append the user turn, then run bot() to fill in the reply
    message.submit(user, [message, chatbot], [message, chatbot], queue=False).then(
        bot, chatbot, chatbot
    )
    clear.click(lambda: None, None, chatbot, queue=False)  # Reset the chat window
app.launch()