import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


def load_model():
    # NOTE: the original "TheBloke/Llama-2-13B-chat-GGUF" repo contains GGUF files for
    # llama.cpp and cannot be loaded with AutoModelForCausalLM + bitsandbytes.
    # Switched to a publicly available HF-format Llama 2 chat checkpoint instead.
    model_name = "NousResearch/Llama-2-13b-chat-hf"

    # 4-bit quantization config (NF4, double quantization, fp16 compute)
    quantization_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
    )

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map="auto",
        quantization_config=quantization_config,
    )
    return model, tokenizer


def generate_response(prompt, max_length=100):
    try:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(
            **inputs,
            max_new_tokens=int(max_length),  # Gradio sliders may pass floats
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.2,
            do_sample=True,
        )
        # Decoded text includes the prompt followed by the generated continuation
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        return response
    except Exception as e:
        return f"Error: {str(e)}"


print("Loading model...")
model, tokenizer = load_model()
print("Model loaded!")

iface = gr.Interface(
    fn=generate_response,
    inputs=[
        gr.Textbox(label="Prompt", lines=5),
        gr.Slider(minimum=1, maximum=500, value=100, step=1, label="Max Length"),
    ],
    outputs=gr.Textbox(label="Response", lines=5),
    title="Llama 2 Chat Bot",
    description="RPG bot based on Llama 2",
)

iface.launch()