File size: 4,922 Bytes
b14d567
 
 
 
 
 
283c0e7
878e432
b14d567
 
 
 
 
 
 
fb60bd2
b14d567
878e432
b14d567
 
 
 
 
 
 
 
 
 
283c0e7
 
b14d567
 
878e432
 
 
 
 
 
 
 
b14d567
 
 
 
 
 
 
 
 
 
 
 
 
 
 
878e432
 
 
 
 
 
 
 
 
 
 
b14d567
 
 
 
878e432
fb60bd2
878e432
 
 
8409b95
b14d567
 
fb60bd2
878e432
 
 
 
 
 
fb60bd2
 
 
 
 
 
 
b14d567
 
 
 
 
878e432
 
 
 
 
 
 
b14d567
 
 
 
 
 
 
08f0bdc
 
b14d567
 
878e432
b14d567
 
08f0bdc
b14d567
 
 
 
 
 
 
 
 
 
fb60bd2
 
 
878e432
b14d567
 
878e432
 
b14d567
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
<script lang="ts">
    import Textarea from "@/lib/components/ui/textarea/textarea.svelte";
    import Badge from "@/lib/components/ui/badge/badge.svelte";
    import * as webllm from "@mlc-ai/web-llm";
    import { onMount } from 'svelte';

    // Model id handed to WebLLM; also rendered in the page header and the model badge.
    let selectedModel = "smollm-360M-instruct-add-basics-q0f32-MLC";

    // WebLLM engine. Remains undefined until loadWebLLM() succeeds (and forever
    // if loading fails), which is why callers guard with `!engine`.
    let engine: webllm.MLCEngineInterface | undefined;
    // True from the start of loadWebLLM() until it settles.
    let isLoading = false;
    // Latest progress text reported by WebLLM during initialisation.
    let loadingStatus = '';
    // Prompt text, bound to the textarea.
    let inputText = '';
    // Most recent completion shown below the status row.
    let outputText = '';
    // Last load/generation error message; empty string means "no error".
    let error = '';
    // Wall-clock duration of the last completion in milliseconds (null before the first one).
    let completionSpeed: number | null = null;
    // Throughput of the last completion in tokens per second (null before the first one).
    let tokensPerSecond: number | null = null;
    // True while a completion request is in flight.
    let isGenerating = false;
    // Latest prompt received while the engine was missing or busy.
    let pendingRequest: string | null = null;

    /**
     * Creates the WebLLM engine for `selectedModel` and stores it in `engine`.
     *
     * `isLoading` is true for the duration of the call and `loadingStatus`
     * mirrors the progress text reported by WebLLM. Failures are captured in
     * `error` rather than rethrown, so the returned promise does not reject.
     */
    async function loadWebLLM(): Promise<void> {
        isLoading = true;
        error = '';
        const initProgressCallback = (report: webllm.InitProgressReport) => {
            loadingStatus = report.text;
        };

        // Custom model registry: weights come from Hugging Face, the compiled
        // WebGPU libraries from WebLLM's prebuilt library location.
        const appConfig: webllm.AppConfig = {
            model_list: [
                {
                    // NOTE(review): the weights are tagged q0f32 but model_lib points at
                    // the q4f16_1 build — confirm the library matches the weights.
                    model: `https://huggingface.co/reach-vb/smollm-360M-instruct-add-basics-q0f32-MLC`,
                    model_id: 'smollm-360M-instruct-add-basics-q0f32-MLC',
                    model_lib: `${webllm.modelLibURLPrefix}${webllm.modelVersion}/SmolLM-360M-Instruct-q4f16_1-ctx2k_cs1k-webgpu.wasm`,
                    overrides: { context_window_size: 2048 },
                },
                {
                    model: `https://huggingface.co/mlc-ai/Qwen2-0.5B-Instruct-q4f16_1-MLC`,
                    model_id: 'Qwen2-0.5B-Instruct-q4f16_1-MLC',
                    model_lib: `${webllm.modelLibURLPrefix}${webllm.modelVersion}/Qwen2-0.5B-Instruct-q4f16_1-ctx4k_cs1k-webgpu.wasm`,
                    overrides: { context_window_size: 2048 },
                },
            ],
        };

        try {
            engine = await webllm.CreateMLCEngine(selectedModel, {
                appConfig,
                initProgressCallback,
                logLevel: "INFO",
            });
        } catch (err) {
            // Anything can be thrown; only Error instances are guaranteed to carry `.message`.
            const reason = err instanceof Error ? err.message : String(err);
            error = `Failed to load the model: ${reason}`;
        } finally {
            isLoading = false;
        }
    }

    /**
     * Requests a completion for `content` and publishes the result.
     *
     * Only one request runs at a time. If the engine is missing or a request
     * is already in flight, the trimmed input is remembered in
     * `pendingRequest` (overwriting any earlier one) and the call returns
     * immediately. When the running request settles, the remembered input is
     * generated next if it differs from what was just processed.
     *
     * On success `outputText`, `completionSpeed` and `tokensPerSecond` are
     * updated and `error` is cleared; on failure only `error` is set.
     *
     * @param content Raw prompt text; blank input is ignored.
     */
    async function generateCompletion(content: string): Promise<void> {
        const prompt = content.trim();

        if (!engine || isGenerating) {
            // Keep only the most recent input typed while we cannot serve it.
            pendingRequest = prompt;
            return;
        }

        if (!prompt) return;

        isGenerating = true;
        const startTime = performance.now();
        try {
            console.log("Generating completion:", content);
            const response = await engine.chat.completions.create({
                messages: [
                    { role: "user", content }
                ],
                max_tokens: 30,
            });

            const choice = response.choices[0];
            let text = choice?.message.content ?? "";

            // indicate that the response was cut short by max_tokens
            if (choice?.finish_reason === "length") {
                text += "...";
            }
            outputText = text;

            const elapsedMs = performance.now() - startTime;
            completionSpeed = Math.round(elapsedMs);

            const generatedTokens = response.usage?.completion_tokens ?? 0;
            // Guard the division so a zero-length measurement cannot yield Infinity/NaN.
            tokensPerSecond = elapsedMs > 0
                ? Math.round(generatedTokens / (elapsedMs / 1000))
                : 0;

            error = '';
        } catch (err) {
            const reason = err instanceof Error ? err.message : String(err);
            error = `Error: ${reason}`;
        } finally {
            isGenerating = false;

            // Always consume the queued input. Leaving it in place when it equals
            // the request that just finished would let a later, unrelated
            // completion replay it and overwrite the output with a stale answer.
            const nextRequest = pendingRequest;
            pendingRequest = null;

            // Compare trimmed-to-trimmed: the queue stores trimmed text.
            if (nextRequest && nextRequest !== prompt) {
                await generateCompletion(nextRequest);
            }
        }
    }

    // Kick off the model load once the component has mounted; the promise is
    // intentionally not awaited because loadWebLLM reports failures via `error`.
    onMount(() => {
        void loadWebLLM();
    });
</script>

<!-- Single centred column: title, model caption, prompt box, status row, completion. -->
<div class="flex my-20 flex-col items-center gap-4 max-w-lg mx-auto">
    <h1 class="text-center font-mono font-bold text-4xl">SmolLM 🤗</h1>
    <p class="text-center font-mono text-sm mb-4">Powered by {selectedModel}</p>
    <!-- Every input event requests a completion for the current text; the box is disabled while the model loads. -->
    <Textarea 
        bind:value={inputText} 
        on:input={() => generateCompletion(inputText)} 
        disabled={isLoading}
        class="w-full" 
        placeholder="Say something..."
    />
    <!-- Status row: loading progress first, otherwise the last error, otherwise timing and model badges. -->
    {#if isLoading}
        <p class="text-sm text-slate-600 text-center">{loadingStatus}</p>
    {:else if error}
        <p class="text-sm text-red-600">{error}</p>
    {:else}
        <div class="flex gap-2">
            <!-- Timing badges stay hidden until the first completion has been measured. -->
            {#if completionSpeed !== null}
                <Badge>{completionSpeed}ms</Badge>
            {/if}
            {#if tokensPerSecond !== null}
                <Badge>{tokensPerSecond} tok/s</Badge>
            {/if}
            <Badge>{selectedModel}</Badge>
        </div>
    {/if}
    <!-- Most recent completion text. -->
    <pre class="text-lg font-bold whitespace-pre-wrap">{outputText}</pre>

</div>