import gradio as gr
import pandas as pd
from gradio.themes import colors
from transformers import AutoTokenizer


# Map text to (token, ID) pairs using the selected model's tokenizer
def inference(
    text: str = "",
    model_id: str = "openai/clip-vit-large-patch14",
) -> tuple[list[tuple[str, str]], pd.DataFrame]:
    if text == "":
        return [], pd.DataFrame()

    tokenizer = AutoTokenizer.from_pretrained(model_id)

    # Tokenize the text and extract the input IDs as a plain Python list
    text_inputs = tokenizer(text, return_tensors="pt")
    input_ids = text_inputs["input_ids"].tolist()[0]

    # Decode each ID individually to recover each token's surface form,
    # then pair it with its ID for the highlighted-text output
    tokens = [tokenizer.decode([id_]) for id_ in input_ids]
    token_pairs = [(token, str(id_)) for token, id_ in zip(tokens, input_ids)]

    # Count the number of characters and tokens
    pos_count = pd.DataFrame({
        "Char Count": [len(text)],
        "Token Count": [len(token_pairs)],
    })
    return token_pairs, pos_count
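
# Illustrative call (not part of the app). The token strings and IDs below are
# examples only; actual values depend on the selected tokenizer's vocabulary:
#   pairs, counts = inference("Hello world")
#   pairs  -> a list such as [("<|startoftext|>", "49406"), ..., ("<|endoftext|>", "49407")]
#   counts -> a one-row DataFrame with Char Count = 11 and Token Count = len(pairs)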

if __name__ == "__main__":
    iface = gr.Interface(
        fn=inference,
        inputs=[
            gr.Textbox(label="Text"),
            gr.Dropdown(
                label="Model",
                choices=[
                    "openai/clip-vit-large-patch14",
                    "google-bert/bert-base-uncased",
                    "google/flan-t5-base",
                    "openai-community/gpt2",
                    "rinna/japanese-gpt-1b",
                ],
                value="openai/clip-vit-large-patch14",
            ),
        ],
        outputs=[
            gr.HighlightedText(label="Highlighted Text"),
            gr.Dataframe(label="Position Count"),
        ],
        examples=[
            ["When I told my computer I needed a break, it froze.", "openai/clip-vit-large-patch14"],
            ["Yesterday, I thought my cat was studying for her degree in philosophy because she sat on my book, "
             "but turns out she was just trying to hatch a plot to steal my dinner.", "openai/clip-vit-large-patch14"],
            ["The square root of x is the cube root of y. What is y to the power of 2, if x = 4?",
             "google/flan-t5-base"],
            ["日本で一番高い山は富士山ですが、二番目に高い山は何ですか?", "rinna/japanese-gpt-1b"],
        ],
        cache_examples=True,
        title="TokenVisor",
        description="Visualize how the Tokenizer used in Hugging Face's Transformers library tokenizes text.",
        theme=gr.Theme(primary_hue=colors.green, secondary_hue=colors.yellow),
        allow_flagging="never",
    )
    iface.launch()