import os

import gradio as gr
import spaces
import torch
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM

# Authenticate with the Hugging Face Hub (Gemma is a gated model).
access_token = os.getenv("HF_TOKEN")
if access_token:
    login(access_token)

model_id = "google/gemma-2-9b-it"

tokenizer = None
model = None
model_loaded = False  # Flag to check whether the model is already loaded


@spaces.GPU
def load_model():
    """Load the tokenizer and model once and cache them in module globals."""
    global tokenizer, model, model_loaded
    if not model_loaded:  # Load the model only if it is not already loaded
        print("Model loading started")
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.bfloat16,
        )
        model_loaded = True
        print("Model loading completed. Device of the model:", model.device)
    else:
        print("Model is already loaded")
    return model, tokenizer


@spaces.GPU
def ask(prompt):
    if not prompt:
        return "Error: prompt is missing."

    # Always go through load_model(): it returns the cached objects when the
    # model is already loaded and avoids referencing unbound local names.
    model, tokenizer = load_model()
    print("Device of the model:", model.device)

    messages = [
        {"role": "user", "content": prompt},
    ]
    print("Messages:", messages)

    print("Tokenizer process started")
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,  # append the assistant turn marker the chat template expects
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)
    print("Tokenizer process completed")

    print("Model generation started")
    outputs = model.generate(**inputs, max_new_tokens=256)

    print("Tokenizer decode process started")
    # Decode only the newly generated tokens, dropping the prompt and special tokens.
    generated_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    answer = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    return answer


demo = gr.Interface(
    fn=ask,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs=gr.Textbox(),
)
demo.launch()