import os

import gradio as gr
import spaces
import torch
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM

# Authenticate with the Hugging Face Hub (Gemma is a gated model).
access_token = os.getenv("HF_TOKEN")
if access_token:
    login(access_token)

model_id = "google/gemma-2-9b-it"

tokenizer = None
model = None
model_loaded = False  # Flag to check whether the model is already loaded


@spaces.GPU
def load_model():
    """Load the tokenizer and model once and cache them in module globals."""
    global tokenizer, model, model_loaded
    if not model_loaded:  # Load the model only if it is not already loaded
        print("Model loading started")
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.bfloat16,
        )
        model_loaded = True
        print("Model loading completed. Device of the model:", model.device)
    else:
        print("Model is already loaded")
    return model, tokenizer


@spaces.GPU
def ask(prompt):
    if not prompt:
        return "Error: prompt is missing."

    # Always go through load_model(): it returns the cached objects when the
    # model is already loaded and avoids referencing unbound local names.
    model, tokenizer = load_model()
    print("Device of the model:", model.device)

    messages = [
        {"role": "user", "content": prompt},
    ]
    print("Messages:", messages)

    print("Tokenizer process started")
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,  # append the assistant turn marker the chat template expects
        return_tensors="pt",
        return_dict=True,
    ).to(model.device)
    print("Tokenizer process completed")

    print("Model generation started")
    outputs = model.generate(**inputs, max_new_tokens=256)

    print("Tokenizer decode process started")
    # Decode only the newly generated tokens, dropping the prompt and special tokens.
    generated_tokens = outputs[0][inputs["input_ids"].shape[-1]:]
    answer = tokenizer.decode(generated_tokens, skip_special_tokens=True).strip()
    return answer


demo = gr.Interface(
    fn=ask,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs=gr.Textbox(),
)
demo.launch()