File size: 3,483 Bytes
fc973c2
 
93013c6
6c94b18
 
 
93013c6
fc973c2
 
e5a75a4
6c94b18
 
 
 
ca79346
 
6c94b18
ca79346
fc973c2
ca79346
6c94b18
93013c6
6c94b18
ca79346
6c94b18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fc973c2
 
 
 
 
 
 
 
 
 
6c94b18
 
 
 
 
 
 
 
 
 
 
fc973c2
4f22162
6c94b18
 
 
fc973c2
 
6c94b18
 
 
 
 
 
fc973c2
6c94b18
 
 
 
 
 
 
e5a75a4
fc973c2
 
 
6c94b18
 
fc973c2
6c94b18
 
 
 
 
ca79346
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import os

import gradio as gr
import pandas as pd
from gradio.themes import colors
from transformers import AutoTokenizer

# Set at import time, i.e. before any tokenizer is instantiated below.
# NOTE(review): presumably this turns off the Hugging Face `tokenizers`
# thread parallelism (and its fork warning) — confirm against the library docs.
os.environ['TOKENIZERS_PARALLELISM'] = "false"


# Tokenize text with a Hugging Face tokenizer and report tokens, IDs and counts
def inference(
        text="",
        model_id="openai/clip-vit-large-patch14",
        progress=gr.Progress()
) -> tuple[list[tuple[str, str]], list[list[str]], pd.DataFrame]:
    """Tokenize ``text`` with the tokenizer belonging to ``model_id``.

    Args:
        text: The text to tokenize. An empty (or missing) value short-circuits
            and returns empty results without loading a tokenizer.
        model_id: Identifier handed to ``AutoTokenizer.from_pretrained``.
        progress: Gradio progress tracker, called with a fraction and a
            description before each stage. The ``gr.Progress()`` default is
            kept as-is so the signature callers and Gradio see is unchanged.

    Returns:
        A 3-tuple ``(token_pairs, special_tokens, pos_count)``:

        * ``token_pairs`` - one ``(decoded_token, token_id_as_str)`` tuple per
          input ID, in tokenizer output order.
        * ``special_tokens`` - ``[name, value]`` string pairs taken from
          ``tokenizer.special_tokens_map``, without the
          ``additional_special_tokens`` entry.
        * ``pos_count`` - a one-row DataFrame with the columns
          ``"Char Count"`` and ``"Token Count"``.
    """
    if not text:
        return [], [], pd.DataFrame()

    progress(0, desc='Loading tokenizer...')
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    progress(0.5, desc='Tokenizing text...')
    # Without `return_tensors` the tokenizer hands back plain Python lists for
    # a single string, so no tensor framework is needed just to read the IDs.
    input_ids = tokenizer(text)['input_ids']

    # Decode every ID on its own so each token can be shown next to its ID.
    token_pairs = [(tokenizer.decode([id_]), str(id_)) for id_ in input_ids]

    # Count the number of characters and tokens
    pos_count = pd.DataFrame({
        "Char Count": [len(text)],
        "Token Count": [len(token_pairs)]
    })

    # 'additional_special_tokens' is skipped, exactly as before; every other
    # entry is rendered as a [name, value] pair of strings.
    special_tokens = [
        [str(name), str(value)]
        for name, value in tokenizer.special_tokens_map.items()
        if name != 'additional_special_tokens'
    ]

    return token_pairs, special_tokens, pos_count


if __name__ == '__main__':
    # Tokenizers offered in the "Model" dropdown, in display order.
    model_choices = [
        "openai/clip-vit-large-patch14",
        "google/gemma-7b",
        "google/gemma-2-9b",
        "google-bert/bert-base-uncased",
        "google/flan-t5-base",
        "openai-community/gpt2",
        "rinna/japanese-gpt-1b",
        "cyberagent/open-calm-7b",
    ]

    # Each example row is [text, model id], matching the order of the inputs.
    example_rows = [
        ["When I told my computer I needed a break, it froze.", "openai/clip-vit-large-patch14"],
        ["Yesterday, I thought my cat was studying for her degree in philosophy because she sat on my book, "
         "but turns out she was just trying to hatch a plot to steal my dinner.", "openai/clip-vit-large-patch14"],
        ["The square root of x is the cube root of y. What is y to the power of 2, if x = 4?",
         "google/flan-t5-base"],
        ["In my home country, it's a custom to say 'いただきマサチューセッツ' before we start eating a meal.",
         "google/gemma-7b"],
        ["日本で一番高い山は富士山ですが、二番目に高い山は何ですか?", "rinna/japanese-gpt-1b"],
    ]

    # Input widgets: free text plus the tokenizer to apply to it.
    text_input = gr.Textbox(label="Text")
    model_input = gr.Dropdown(
        label="Model",
        choices=model_choices,
        value="openai/clip-vit-large-patch14"
    )

    # Output widgets, in the order `inference` returns its three results.
    # NOTE(review): `gr.Highlightedtext` looks like a legacy spelling of
    # `gr.HighlightedText` — confirm against the installed Gradio version.
    token_view = gr.Highlightedtext(label="Highlighted Text")
    special_token_view = gr.Highlightedtext(
        label="Special Tokens",
        combine_adjacent=True,
        adjacent_separator=' / ',
    )
    count_view = gr.Dataframe(label="Position Count")

    demo = gr.Interface(
        fn=inference,
        inputs=[text_input, model_input],
        outputs=[token_view, special_token_view, count_view],
        examples=example_rows,
        cache_examples=True,
        title="TokenVisor 👀",
        description="Visualize how the Tokenizer used in Hugging Face's Transformers library tokenizes text.",
        theme=gr.Theme(primary_hue=colors.green, secondary_hue=colors.yellow),
        allow_flagging="never",
    )

    # Enable request queuing, then start the web server.
    demo.queue().launch()