Spaces:
Sleeping
Sleeping
import gradio as gr | |
import spaces | |
from transformers import AutoTokenizer, AutoModelForCausalLM | |
import torch | |
from huggingface_hub import login | |
import os | |
# Hugging Face credentials are read from the environment (a Space secret).
access_token = os.getenv("HF_TOKEN")
if access_token:
    login(access_token)
else:
    # Calling login() with no token falls back to an interactive prompt,
    # which cannot be answered on a headless server. Skip it and let any
    # later download fail with its own authentication error instead.
    print("HF_TOKEN is not set; skipping Hugging Face login")

# Checkpoint served by this app.
model_id = "google/gemma-2-9b-it"

# Lazily-populated cache shared by load_model() and ask().
tokenizer = None
model = None
model_loaded = False  # Flag to check if the model is loaded
def load_model():
    """Return the shared ``(model, tokenizer)`` pair, loading it on first use.

    The first call builds the tokenizer and the causal LM for ``model_id``
    (bfloat16 weights, ``device_map="auto"``), stores both in the
    module-level ``tokenizer`` / ``model`` globals and sets ``model_loaded``.
    Later calls return the cached objects without loading again.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    global tokenizer, model, model_loaded

    # Fast path: an earlier call already filled the module-level cache.
    if model_loaded:
        print("Model is already loaded")
        return model, tokenizer

    print("Model loading started")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id, device_map="auto", torch_dtype=torch.bfloat16
    )
    # Only flip the flag once both objects were created successfully.
    model_loaded = True
    print("Model loading completed. Device of the model:", model.device)
    return model, tokenizer
def ask(prompt):
    """Generate a chat reply to ``prompt`` with the lazily-loaded model.

    Args:
        prompt: User text from the UI. Falsy values (``""``, ``None``) are
            rejected without touching the model.

    Returns:
        The generated answer as a stripped string, or
        ``{"error": "Prompt is missing"}`` when no prompt was given.
    """
    if not prompt:
        return {"error": "Prompt is missing"}

    # Always fetch the pair through load_model(). Assigning `model` /
    # `tokenizer` inside a conditional made them function-locals, so every
    # call after the first one raised UnboundLocalError.
    llm, tok = load_model()
    print("Device of the model:", llm.device)

    messages = [
        {"role": "user", "content": f"{prompt}"},
    ]
    print("Messages:", messages)

    print("Tokenizer process started")
    encoded = tok.apply_chat_template(
        messages,
        add_generation_prompt=True,  # cue the model to start its own turn
        return_tensors="pt",
        return_dict=True,
    ).to(llm.device)  # follow the model's device instead of assuming "cuda"
    print("Tokenizer process completed")

    print("Model process started")
    outputs = llm.generate(**encoded, max_new_tokens=256)

    print("Tokenizer decode process started")
    # Decode only the newly generated tokens. Splitting the full text on
    # "<end_of_turn>" breaks when the prompt itself contains that marker and
    # can leave turn markers in the answer.
    prompt_length = encoded["input_ids"].shape[-1]
    answer = tok.decode(
        outputs[0][prompt_length:], skip_special_tokens=True
    ).strip()
    return answer
# Minimal text-in / text-out UI: a two-line prompt box wired to ask().
demo = gr.Interface(
    fn=ask,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs=gr.Textbox(),
)

# Start serving the interface.
demo.launch()