# telugu-bpe / test.py
import gradio as gr
from tokenizers import Tokenizer
import json
from huggingface_hub import hf_hub_download
# Download tokenizer files from the HF Hub
def get_tokenizer():
    try:
        # Download tokenizer.json
        tokenizer_path = hf_hub_download(
            repo_id="Saiteja/telugu-bpe",
            filename="tokenizer.json",
            repo_type="model"
        )
        # Download examples.json
        examples_path = hf_hub_download(
            repo_id="Saiteja/telugu-bpe",
            filename="examples.json",
            repo_type="model"
        )
        return tokenizer_path, examples_path
    except Exception as e:
        print(f"Error downloading files: {e}")
        return None, None
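
# Note: hf_hub_download caches downloads locally, so repeated launches reuse
# the cached files. To pin the demo to a fixed snapshot, one could also pass
# hf_hub_download's `revision=` argument (omitted here to keep the original
# behavior of tracking the latest files).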
# Get the tokenizer and example file paths
tokenizer_path, examples_path = get_tokenizer()
if tokenizer_path is None or examples_path is None:
    raise RuntimeError("Could not download tokenizer files from the Hugging Face Hub")

# Load the tokenizer
tokenizer = Tokenizer.from_file(tokenizer_path)

# Load examples
with open(examples_path, "r", encoding="utf-8") as f:
    examples_data = json.load(f)
# Extract example texts from examples.json
# (earlier hard-coded list kept for reference):
# example_texts = [
#     "నమస్కారం",  # Hello
#     "తెలుగు భాష చాలా అందమైనది",  # The Telugu language is very beautiful
#     "భారతదేశం నా దేశం",  # India is my country
#     "తెలుగు సాహిత్యం చాలా సమృద్ధిగా ఉంది",  # Telugu literature is very rich
#     "నేను తెలుగు భాషను ప్రేమిస్తున్నాను"  # I love the Telugu language
# ]
example_texts = [example["text"] for example in examples_data]
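
# Defensive check (the {"text": ...} schema assumed above is our reading of
# examples.json, not documented in the repo): fail fast here rather than
# surface an empty or broken examples panel in the UI.
if not example_texts or not all(isinstance(t, str) and t.strip() for t in example_texts):
    raise ValueError("examples.json is expected to hold a list of objects with non-empty 'text' fields")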
def tokenize_text(text):
    """Tokenize the input text and return tokens, token IDs, and the compression ratio."""
    if not text.strip():
        return "Please enter some text."
    try:
        encoding = tokenizer.encode(text)
        # Characters per token; guard against an empty encoding to avoid division by zero
        compression_ratio = len(text) / max(len(encoding.ids), 1)
        result = f"""Tokens: {encoding.tokens}
Token IDs: {encoding.ids}
Number of tokens: {len(encoding.ids)}
Text length: {len(text)}
Compression ratio: {compression_ratio:.2f}"""
        return result
    except Exception as e:
        return f"Error: {str(e)}"
# Create the Gradio interface
iface = gr.Interface(
    fn=tokenize_text,
    inputs=gr.Textbox(
        lines=5,
        placeholder="Enter Telugu text here...",
        label="Input Text"
    ),
    outputs=gr.Textbox(
        label="Tokenization Results",
        lines=10
    ),
    title="Telugu Tokenizer Demo",
    description="""This demo uses a custom Telugu tokenizer trained on a large corpus of Telugu text.
The tokenizer has a vocabulary of 50,000+ tokens and achieves a compression ratio above 3.0.
Try entering some Telugu text to see how it is tokenized!
Tokenizer: https://huggingface.co/Saiteja/telugu-bpe""",
    examples=example_texts,
    theme=gr.themes.Soft()
)
# Launch the app
if __name__ == "__main__":
    iface.launch()
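
# Note: a bare launch() is all a hosted Hugging Face Space needs. For a quick
# public link when running locally, Gradio's documented share option works too:
# iface.launch(share=True)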