allenai-OLMoE-1B-7B-0924

Runtime error

App Files Files Community

allenai-OLMoE-1B-7B-0924 / app.py

nisten

Update app.py

aaeb784 verified 5 months ago

raw

history blame

4 kB

	import gradio as gr
	import torch
	import subprocess
	import sys
	import os

	# Force install the specific transformers version from the GitHub PR
	subprocess.check_call([sys.executable, "-m", "pip", "install", "--force-reinstall", "--no-deps", "git+https://github.com/Muennighoff/transformers.git@olmoe"])

	from transformers import AutoModelForCausalLM, AutoTokenizer

	# Define model name
	model_name = "allenai/OLMoE-1B-7B-0924-Instruct"

	# Define prompts
	system_prompt = ("Adopt the persona of hilariously pissed off Andrej Karpathy "
	"who is stuck inside a step function machine and remembers and counts everything he says "
	"while always answering questions in full first principles analysis type of thinking "
	"without using any analogies and always showing full working code or output in his answers.")

	user_prompt = '<\|user\|>\n'
	assistant_prompt = '<\|assistant\|>\n'
	prompt_suffix = "<\|end\|>\n"

	# Function to load model and tokenizer
	def load_model_and_tokenizer(model_name):
	tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

	# Check for CUDA availability
	if torch.cuda.is_available():
	print("CUDA is available. Using GPU.")
	device = "cuda"
	else:
	print("CUDA is not available. Using CPU.")
	device = "cpu"

	# Load model
	model = AutoModelForCausalLM.from_pretrained(
	model_name,
	trust_remote_code=True,
	torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32
	).to(device).eval()

	return model, tokenizer, device

	# Function to generate response
	def generate_response(message, history, model, tokenizer, device):
	full_prompt = f"{system_prompt}\n{user_prompt}{message}{prompt_suffix}{assistant_prompt}"

	inputs = tokenizer(full_prompt, return_tensors="pt").to(device)
	with torch.no_grad():
	generate_ids = model.generate(
	**inputs,
	max_new_tokens=1000,
	do_sample=True,
	temperature=0.7,
	eos_token_id=tokenizer.eos_token_id,
	)
	response = tokenizer.batch_decode(generate_ids[:, inputs['input_ids'].shape[1]:],
	skip_special_tokens=True,
	clean_up_tokenization_spaces=False)[0]
	return response.strip()

	# Function to set client for session
	def set_client_for_session(request: gr.Request):
	x_ip_token = request.headers.get('x-ip-token', '')
	return {"X-IP-Token": x_ip_token}

	# Set up Gradio interface
	with gr.Blocks() as demo:
	gr.Markdown("#Karpathy Chatbot")
	chatbot = gr.Chatbot()
	msg = gr.Textbox()
	clear = gr.Button("Clear")

	# States
	model_state = gr.State()
	tokenizer_state = gr.State()
	device_state = gr.State()
	headers_state = gr.State()

	def initialize_model(headers):
	if not model_state.value:
	model, tokenizer, device = load_model_and_tokenizer(model_name)
	return model, tokenizer, device
	return model_state.value, tokenizer_state.value, device_state.value

	def user(user_message, history):
	return "", history + [[user_message, None]]

	def bot(history, model, tokenizer, device):
	user_message = history[-1][0]
	bot_message = generate_response(user_message, history, model, tokenizer, device)
	history[-1][1] = bot_message
	return history

	msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
	initialize_model, headers_state, [model_state, tokenizer_state, device_state]
	).then(
	bot, [chatbot, model_state, tokenizer_state, device_state], chatbot
	)
	clear.click(lambda: None, None, chatbot, queue=False)

	demo.load(set_client_for_session, None, headers_state)

	if __name__ == "__main__":
	if os.environ.get("SPACE_ID"):
	demo.queue(api_open=False)
	demo.launch(debug=True)
	else:
	demo.launch(debug=True, share=True)