import gradio as gr
import spaces
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the model and tokenizer
model_name = "akjindal53244/Llama-3.1-Storm-8B"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)


@spaces.GPU(duration=120)
def generate_text(prompt, max_length, temperature):
    # Wrap the user prompt in the model's chat template
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    formatted_prompt = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=False
    )

    # apply_chat_template already inserts the BOS token, so skip adding
    # special tokens here to avoid a duplicated BOS at the start of the input
    inputs = tokenizer(
        formatted_prompt, return_tensors="pt", add_special_tokens=False
    ).to(model.device)

    outputs = model.generate(
        **inputs,
        max_new_tokens=max_length,
        do_sample=True,
        temperature=temperature,
        top_k=100,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id,  # Llama has no pad token; reuse EOS
    )

    # Decode only the newly generated tokens, dropping the prompt
    return tokenizer.decode(
        outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
    )


iface = gr.Interface(
    fn=generate_text,
    inputs=[
        gr.Textbox(lines=5, label="Prompt"),
        gr.Slider(minimum=1, maximum=500, value=128, step=1, label="Max New Tokens"),
        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
    ],
    outputs=gr.Textbox(lines=10, label="Generated Text"),
    title="Llama-3.1-Storm-8B Text Generation",
    description="Enter a prompt to generate text using the Llama-3.1-Storm-8B model.",
)

iface.launch()