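"""Gradio demo comparing responses from the SmolLM2-135M base model and a
fine-tuned ("bootleg instruct") variant side by side."""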
import torch
from transformers import AutoTokenizer, LlamaForCausalLM
import gradio as gr


# Model IDs from Hugging Face Hub
base_model_id = "HuggingFaceTB/SmolLM2-135M"
instruct_model_id = "MaxBlumenfeld/smollm2-135m-bootleg-instruct-01"

# Load tokenizer (shared by both models; the fine-tune uses the base vocabulary)
base_tokenizer = AutoTokenizer.from_pretrained(base_model_id)

# Load models with explicit LLaMA architecture
base_model = LlamaForCausalLM.from_pretrained(base_model_id)
instruct_model = LlamaForCausalLM.from_pretrained(instruct_model_id)

def generate_response(model, tokenizer, message, temperature=0.5, max_length=200, system_prompt="", is_instruct=False):
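    """Generate a completion from the given model.

    The instruct model gets a Human/Assistant prompt template (with an optional
    system prompt prepended); the base model gets the raw message as a plain
    continuation prompt.
    """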
    # Prepare input based on model type
    if is_instruct:
        if system_prompt:
            full_prompt = f"{system_prompt}\n\nHuman: {message}\nAssistant:"
        else:
            full_prompt = f"Human: {message}\nAssistant:"
    else:
        # For base model, use simpler prompt format
        full_prompt = message
    
    inputs = tokenizer(full_prompt, return_tensors="pt")
    
    with torch.no_grad():
        outputs = model.generate(
            inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=max_length,
            do_sample=True,
            temperature=temperature,
            top_k=50,
            top_p=0.95,
            num_return_sequences=1,
            pad_token_id=tokenizer.eos_token_id  # Tokenizer has no dedicated pad token, so reuse EOS
        )
        
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    if is_instruct:
        # Keep only the text after the final "Assistant:" marker
        response = response.split("Assistant:")[-1].strip()
    else:
        # Strip the echoed prompt so only the continuation is returned
        response = response[len(full_prompt):].strip()
        
    return response

def chat(message, temperature, max_length, system_prompt):
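    """Run the same message through both models and return both responses.

    Note: the system prompt only affects the instruct model; the base model
    receives the raw message.
    """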
    # Generate responses from both models
    base_response = generate_response(
        base_model, 
        base_tokenizer, 
        message, 
        temperature, 
        max_length, 
        system_prompt,
        is_instruct=False
    )
    
    instruct_response = generate_response(
        instruct_model, 
        base_tokenizer, 
        message, 
        temperature, 
        max_length, 
        system_prompt,
        is_instruct=True
    )
    
    return base_response, instruct_response

# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("# SmolLM2-135M Comparison Demo")
    gr.Markdown("Compare responses between base and fine-tuned versions of SmolLM2-135M")
    
    with gr.Row():
        with gr.Column():
            message_input = gr.Textbox(label="Input Message")
            system_prompt = gr.Textbox(
                label="System Prompt (Optional)",
                placeholder="Set context or personality for the model",
                lines=3
            )
            
        with gr.Column():
            temperature = gr.Slider(
                minimum=0.1, 
                maximum=2.0, 
                value=0.5, 
                label="Temperature"
            )
            max_length = gr.Slider(
                minimum=50, 
                maximum=500, 
                value=200, 
                step=10, 
                label="Max Length"
            )
    
    with gr.Row():
        with gr.Column():
            gr.Markdown("### Base Model Response")
            base_output = gr.Textbox(label="Base Model (SmolLM2-135M)", lines=5)
            
        with gr.Column():
            gr.Markdown("### Bootleg Instruct Model Response") 
            instruct_output = gr.Textbox(label="Fine-tuned Model", lines=5)
    
    submit_btn = gr.Button("Generate Responses")
    submit_btn.click(
        fn=chat,
        inputs=[message_input, temperature, max_length, system_prompt],
        outputs=[base_output, instruct_output]
    )

if __name__ == "__main__":
    demo.launch()