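# Gradio demo comparing the base SmolLM2-135M model against the "bootleg"
# instruction-tuned checkpoint: the same prompt is sent to both models and
# their outputs are shown side by side.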
import torch
from transformers import AutoTokenizer, LlamaForCausalLM
import gradio as gr
# Model IDs from Hugging Face Hub
base_model_id = "HuggingFaceTB/SmolLM2-135M"
instruct_model_id = "MaxBlumenfeld/smollm2-135m-bootleg-instruct-01"
# Load tokenizer
base_tokenizer = AutoTokenizer.from_pretrained(base_model_id)
# Load models with explicit LLaMA architecture
base_model = LlamaForCausalLM.from_pretrained(base_model_id)
instruct_model = LlamaForCausalLM.from_pretrained(instruct_model_id)
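# Both models are served with base_tokenizer below, so only one tokenizer is loaded.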
def generate_response(model, tokenizer, message, temperature=0.5, max_length=200, system_prompt="", is_instruct=False):
    # Prepare input based on model type
    if is_instruct:
        if system_prompt:
            full_prompt = f"{system_prompt}\n\nHuman: {message}\nAssistant:"
        else:
            full_prompt = f"Human: {message}\nAssistant:"
    else:
        # For the base model, use the raw message without a chat-style template
        full_prompt = message

    inputs = tokenizer(full_prompt, return_tensors="pt")

    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,  # avoid the missing-attention-mask warning
            max_length=max_length,
            do_sample=True,
            temperature=temperature,
            top_k=50,
            top_p=0.95,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id  # model has no dedicated padding token
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    if is_instruct:
        # Keep only the text after the final "Assistant:" marker
        response = response.split("Assistant:")[-1].strip()
    else:
        # Strip the echoed prompt so only the continuation is returned
        response = response[len(full_prompt):].strip()

    return response
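# chat() fans the same user message out to both models and returns the two
# responses in the order expected by the output textboxes below.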
def chat(message, temperature, max_length, system_prompt):
    # Generate responses from both models
    base_response = generate_response(
        base_model,
        base_tokenizer,
        message,
        temperature,
        max_length,
        system_prompt,
        is_instruct=False
    )

    instruct_response = generate_response(
        instruct_model,
        base_tokenizer,
        message,
        temperature,
        max_length,
        system_prompt,
        is_instruct=True
    )

    return base_response, instruct_response
# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# SmolLM2-135M Comparison Demo")
    gr.Markdown("Compare responses between base and fine-tuned versions of SmolLM2-135M")

    with gr.Row():
        with gr.Column():
            message_input = gr.Textbox(label="Input Message")
            system_prompt = gr.Textbox(
                label="System Prompt (Optional)",
                placeholder="Set context or personality for the model",
                lines=3
            )
        with gr.Column():
            temperature = gr.Slider(
                minimum=0.1,
                maximum=2.0,
                value=0.5,
                label="Temperature"
            )
            max_length = gr.Slider(
                minimum=50,
                maximum=500,
                value=200,
                step=10,
                label="Max Length"
            )

    with gr.Row():
        with gr.Column():
            gr.Markdown("### Base Model Response")
            base_output = gr.Textbox(label="Base Model (SmolLM2-135M)", lines=5)
        with gr.Column():
            gr.Markdown("### Bootleg Instruct Model Response")
            instruct_output = gr.Textbox(label="Fine-tuned Model", lines=5)

    submit_btn = gr.Button("Generate Responses")
    submit_btn.click(
        fn=chat,
        inputs=[message_input, temperature, max_length, system_prompt],
        outputs=[base_output, instruct_output]
    )
if __name__ == "__main__":
    demo.launch()