Spaces: Running on Zero
import spaces
import gradio as gr
import torch
import subprocess
import requests
# Function to start the ochat server
@spaces.GPU
def start_ochat_server():
    print(f"Is CUDA available: {torch.cuda.is_available()}")
    print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
    command = [
        "python", "-m", "ochat.serving.openai_api_server",
        "--model", "openchat/openchat_3.5"
    ]
    # Start the server in a separate process so the app keeps running
    try:
        subprocess.Popen(command)
        return "ochat server started successfully"
    except Exception as e:
        return f"Failed to start ochat server: {e}"

start_ochat_server()
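
# The ochat OpenAI-compatible API server listens on port 18888, the port the
# chat client below targets (assumed to be the server's default, since no
# --port flag is passed above). An illustrative manual check, mirroring the
# request the client sends:
#
#   curl http://localhost:18888/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"model": "openchat_3.5", "messages": [{"role": "user", "content": "Hi"}]}'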
# Function to check if the server is up
def is_server_up(url):
    try:
        response = requests.get(url)
        return response.status_code == 200
    except requests.RequestException:
        return False
# Function to send a message to the ochat server and get a response
def chat_with_ochat(message):
    base_url = "http://localhost:18888"
    chat_url = f"{base_url}/v1/chat/completions"
    headers = {"Content-Type": "application/json"}
    data = {
        "model": "openchat_3.5",
        "messages": [{"role": "user", "content": message}]
    }

    # Check that the server is up before sending the request
    if not is_server_up(base_url):
        return "Error: oChat server is not running."

    try:
        response = requests.post(chat_url, json=data, headers=headers)
        if response.status_code == 200:
            return response.json()['choices'][0]['message']['content']
        else:
            return f"Error: Server responded with status code {response.status_code}"
    except requests.RequestException as e:
        return f"Error: {e}"
# Create a Gradio Blocks chat interface
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("## vLLM OpenChat-3.5 Interface")
    gr.Markdown("### The vLLM server cannot handle concurrent users in Spaces. If you get an error, run it in Docker.")
    gr.Markdown(
        "This will run better on your own machine: "
        "```docker run -it -p 7860:7860 --platform=linux/amd64 --gpus all "
        "registry.hf.space/macadeliccc-openchat-3-5-chatbot:latest python app.py```"
    )

    message = gr.Textbox(label="Your Message", placeholder="Type your message here")
    chatbot = gr.Chatbot()  # The Chatbot component's value holds the chat history
    clear = gr.Button("Clear")

    def user(message, history):
        # Append the user's message with a placeholder for the bot's reply
        return "", history + [[message, None]]

    def bot(history):
        if history and history[-1] and history[-1][0]:
            user_message = history[-1][0]
            bot_response = chat_with_ochat(user_message)
            history[-1][1] = bot_response  # Update the last entry with the bot's response
        return history

    message.submit(user, [message, chatbot], [message, chatbot], queue=False).then(
        bot, chatbot, chatbot
    )
    clear.click(lambda: None, None, chatbot, queue=False)

app.launch()