eneSadi's picture
load model change
225f228 unverified
raw
history blame
1.88 kB
import gradio as gr
import spaces
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from huggingface_hub import login
import os
# Hugging Face access token, read from the HF_TOKEN environment variable.
access_token = os.getenv('HF_TOKEN')
if access_token:
    login(access_token)
else:
    # login(None) falls back to an interactive token prompt, which a headless
    # app process cannot answer; skip it and report the situation instead.
    print("HF_TOKEN is not set; skipping Hugging Face login")

# Checkpoint that load_model() fetches.
model_id = "google/gemma-2-9b-it"

# Filled in by load_model() on first use.
tokenizer = None
model = None
model_loaded = False  # Flag to check if the model is loaded
@spaces.GPU
def load_model():
    """Return the ``(model, tokenizer)`` pair, loading it on the first call.

    The first call fetches the tokenizer and the causal LM for ``model_id``
    (bfloat16 weights, ``device_map="auto"``), stores both in the module
    globals and sets ``model_loaded``. Later calls return the stored objects
    without loading anything.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    global tokenizer, model, model_loaded

    # Fast path: everything is already stored in the module globals.
    if model_loaded:
        print("Model is already loaded")
        return model, tokenizer

    print("Model loading started")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    load_options = {"device_map": "auto", "torch_dtype": torch.bfloat16}
    model = AutoModelForCausalLM.from_pretrained(model_id, **load_options)
    model_loaded = True
    print("Model loading completed. Device of the model:", model.device)
    return model, tokenizer
@spaces.GPU
def ask(prompt):
    """Generate the model's reply to a single user prompt.

    Args:
        prompt: Text entered by the user.

    Returns:
        The generated reply as a stripped string, or the dict
        ``{"error": "Prompt is missing"}`` when ``prompt`` is empty.
    """
    if not prompt:
        return {"error": "Prompt is missing"}

    # load_model() returns the cached pair once loading has happened, so it is
    # called unconditionally. Binding its result only inside
    # ``if not model_loaded:`` made ``model``/``tokenizer`` function locals
    # that stayed unassigned whenever the model was already loaded, which
    # raised UnboundLocalError on the next line.
    llm, tok = load_model()
    print("Device of the model:", llm.device)

    messages = [
        {"role": "user", "content": f"{prompt}"},
    ]
    print("Messages:", messages)

    print("Tokenizer process started")
    encoded = tok.apply_chat_template(
        messages,
        add_generation_prompt=True,  # end the prompt with the model-turn header
        return_tensors="pt",
        return_dict=True,
    ).to(llm.device)  # follow the model's device instead of assuming "cuda"
    print("Tokenizer process completed")

    print("Model process started")
    outputs = llm.generate(**encoded, max_new_tokens=256)

    print("Tokenizer decode process started")
    # Decode only the tokens produced after the prompt. Splitting the full
    # text on "<end_of_turn>" breaks when the prompt itself contains that
    # marker and can leave turn markers in the returned answer.
    prompt_length = encoded["input_ids"].shape[-1]
    generated_ids = outputs[0][prompt_length:]
    answer = tok.decode(generated_ids, skip_special_tokens=True).strip()
    return answer
# Build the UI: a two-line prompt box wired to ask(), with a plain text box
# for the result, then start the app.
prompt_box = gr.Textbox(lines=2, placeholder="Enter your prompt here...")
answer_box = gr.Textbox()
demo = gr.Interface(fn=ask, inputs=prompt_box, outputs=answer_box)
demo.launch()