Spaces:
Running
Running
File size: 3,442 Bytes
fc973c2 93013c6 6c94b18 93013c6 fc973c2 e5a75a4 6c94b18 ca79346 6c94b18 ca79346 fc973c2 ca79346 6c94b18 93013c6 6c94b18 ca79346 6c94b18 fc973c2 6c94b18 fc973c2 6c94b18 fc973c2 6c94b18 fc973c2 6c94b18 e5a75a4 fc973c2 6c94b18 fc973c2 6c94b18 ca79346 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
import os
import gradio as gr
import pandas as pd
from gradio.themes import colors
from transformers import AutoTokenizer
# Set the TOKENIZERS_PARALLELISM environment switch to "false" for this process.
# NOTE(review): presumably this silences the tokenizers fork/parallelism
# warning when the web server spawns workers — confirm.
os.environ['TOKENIZERS_PARALLELISM'] = "false"
# Tokenize the input text and pair each decoded token with its token ID
def inference(
    text="",
    model_id="openai/clip-vit-large-patch14",
    progress=gr.Progress()
) -> tuple[list[tuple[str, str]], list[list[str]], pd.DataFrame]:
    """Tokenize *text* with the tokenizer of *model_id* and prepare display data.

    Args:
        text: The text to tokenize. Empty (or ``None``) input short-circuits
            and yields empty results without loading a tokenizer.
        model_id: Identifier handed to ``AutoTokenizer.from_pretrained``.
        progress: Gradio progress tracker; called with a fraction and a
            description before loading the tokenizer and before tokenizing.

    Returns:
        A 3-tuple of:
        * ``(decoded_token, token_id)`` pairs in input order, with the ID
          rendered as a string;
        * ``[name, value]`` string pairs taken from the tokenizer's
          ``special_tokens_map`` (the ``additional_special_tokens`` entry
          is skipped);
        * a one-row DataFrame with the columns ``"Char Count"`` and
          ``"Token Count"``.
    """
    if not text:
        return [], [], pd.DataFrame()

    progress(0, desc='Loading tokenizer...')
    tokenizer = AutoTokenizer.from_pretrained(model_id)

    progress(0.5, desc='Tokenizing text...')
    # Only a plain list of IDs is needed, so no framework tensors are
    # requested; asking for 'pt' tensors would require torch just to
    # convert the result straight back into a list.
    input_ids = tokenizer(text)['input_ids']

    # Decode every ID on its own so each token is shown next to its ID.
    token_pairs = [(tokenizer.decode([id_]), str(id_)) for id_ in input_ids]

    # Character and token totals for the summary table.
    pos_count = pd.DataFrame({
        "Char Count": [len(text)],
        "Token Count": [len(token_pairs)]
    })

    # Name/value pairs of the tokenizer's special tokens.
    special_tokens = [
        [str(name), str(value)]
        for name, value in tokenizer.special_tokens_map.items()
        if name != 'additional_special_tokens'
    ]

    return token_pairs, special_tokens, pos_count
if __name__ == '__main__':
    # Model pre-selected in the dropdown.
    default_model = "openai/clip-vit-large-patch14"
    model_choices = [
        default_model,
        "google/gemma-7b",
        "google-bert/bert-base-uncased",
        "google/flan-t5-base",
        "openai-community/gpt2",
        "rinna/japanese-gpt-1b",
        "cyberagent/open-calm-7b",
    ]

    # Each example row is [text, model id], in the same order as the inputs.
    example_rows = [
        ["When I told my computer I needed a break, it froze.", default_model],
        ["Yesterday, I thought my cat was studying for her degree in philosophy because she sat on my book, "
         "but turns out she was just trying to hatch a plot to steal my dinner.", default_model],
        ["The square root of x is the cube root of y. What is y to the power of 2, if x = 4?",
         "google/flan-t5-base"],
        ["In my home country, it's a custom to say 'いただきマサチューセッツ' before we start eating a meal.",
         "google/gemma-7b"],
        ["日本で一番高い山は富士山ですが、二番目に高い山は何ですか?", "rinna/japanese-gpt-1b"],
    ]

    # Input widgets: free text plus the tokenizer/model selector.
    text_input = gr.Textbox(label="Text")
    model_input = gr.Dropdown(
        label="Model",
        choices=model_choices,
        value=default_model,
    )

    # Output widgets, in the order `inference` returns its three values.
    token_view = gr.Highlightedtext(label="Highlighted Text")
    special_token_view = gr.Highlightedtext(
        label="Special Tokens",
        combine_adjacent=True,
        adjacent_separator=' / ',
    )
    count_view = gr.Dataframe(label="Position Count")

    iface = gr.Interface(
        fn=inference,
        inputs=[text_input, model_input],
        outputs=[token_view, special_token_view, count_view],
        examples=example_rows,
        cache_examples=True,
        title="TokenVisor 👀",
        description="Visualize how the Tokenizer used in Hugging Face's Transformers library tokenizes text.",
        theme=gr.Theme(primary_hue=colors.green, secondary_hue=colors.yellow),
        allow_flagging="never",
    )

    # Enable request queuing, then start the web server.
    iface.queue().launch()
|