# mrmuminov's picture
# Locale
# 809fdaf
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
import gradio as gr
import numpy as np
import json
class BertEmbeddingsGenerator:
    """Generate fixed-size text embeddings from a BERT masked-LM checkpoint.

    The tokenizer and model are loaded once at construction time and the
    model is switched to evaluation mode. An embedding is the average of the
    last layer's hidden states over the real (non-padding) tokens.
    """

    def __init__(self, model_name="tahrirchi/tahrirchi-bert-base"):
        """Load the tokenizer and the masked-LM model.

        Args:
            model_name (str): Checkpoint identifier handed to
                ``from_pretrained`` for both the tokenizer and the model.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForMaskedLM.from_pretrained(model_name)
        self.model.eval()  # Set to evaluation mode

    def get_embeddings(self, text):
        """Generate embeddings for the input text.

        The text is tokenized (truncated to 512 tokens, padded to the longest
        item in the batch), run through the model without gradient tracking,
        and the last hidden layer is mean-pooled over the token axis.
        Positions the tokenizer marks as padding in ``attention_mask`` are
        excluded from the average so they cannot skew the result.

        Args:
            text (str): Input text to embed.

        Returns:
            np.ndarray: Pooled embedding with size-1 axes squeezed out
            (a 1-D vector for a single input string).
        """
        # Tokenize input text
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512,
        )

        # Inference only: no gradients are needed
        with torch.no_grad():
            outputs = self.model(**inputs, output_hidden_states=True)

        # hidden_states holds one tensor per layer; -1 selects the last layer
        last_hidden_state = outputs.hidden_states[-1]

        attention_mask = inputs.get("attention_mask")
        if attention_mask is None:
            # No mask available: every position counts as a real token
            embeddings = last_hidden_state.mean(dim=1)
        else:
            # Zero out padded positions, then divide by the number of real
            # tokens instead of the padded sequence length.
            keep = attention_mask.to(last_hidden_state.device).bool().unsqueeze(-1)
            summed = last_hidden_state.masked_fill(~keep, 0.0).sum(dim=1)
            # clamp guards against division by zero for an all-padding row
            token_counts = keep.sum(dim=1).clamp(min=1)
            embeddings = summed / token_counts

        # Drop size-1 axes and hand back a NumPy array
        return embeddings.squeeze().cpu().numpy()
def create_gradio_interface():
    """Create and configure the Gradio interface.

    A single ``BertEmbeddingsGenerator`` is built here and shared by every
    request handled through the returned interface.

    Returns:
        gr.Interface: The configured interface; it is not launched here.
    """
    # Initialize the embeddings generator once, up front
    generator = BertEmbeddingsGenerator()

    def embed_text(input_text):
        """Embed ``input_text`` and return the result as a JSON string.

        Every response carries a ``"status"`` key: ``"success"`` together
        with ``"embeddings"`` and ``"dimensions"``, or ``"error"`` together
        with an ``"error"`` message.

        Args:
            input_text (str): Text typed into the Gradio textbox.

        Returns:
            str: JSON-encoded response payload.
        """
        try:
            if not input_text or not input_text.strip():
                # Same schema as the other error path so clients can branch
                # on "status" alone.
                return json.dumps({
                    "error": "Matn kiritilmadi",
                    "status": "error"
                })

            embeddings = generator.get_embeddings(input_text)

            # Replace NaN/Infinity with None, then convert to a plain list
            embeddings_list = np.where(
                np.isfinite(embeddings), embeddings, None
            ).tolist()

            # Create a structured output
            output = {
                "embeddings": embeddings_list,
                "dimensions": len(embeddings_list),
                "status": "success"
            }
            return json.dumps(output, ensure_ascii=False)
        except Exception as e:
            # UI boundary: report the failure to the user instead of raising
            return json.dumps({
                "error": str(e),
                "status": "error"
            })

    # Create Gradio interface
    iface = gr.Interface(
        fn=embed_text,
        inputs=gr.Textbox(
            lines=2,
            placeholder="Matn kiriting...",
            label="Input Text"
        ),
        outputs=gr.JSON(label="Embeddings"),
        title="O'zbek tili uchun embedding",
        description="O'zbek tili uchun Tahrirchi BERT Base modeli orqali embedding generatsiya qilish",
        examples=[
            ["Assalomu alaykum, men o'zbek tili bilan ishlayman"],
            ["O'zbek tili uchun Tahrirchi BERT Base modeli orqali embedding generatsiya qilish uchun namuna matn."]
        ]
    )
    return iface
if __name__ == "__main__":
    # Script entry point: build the Gradio UI, then start serving it
    iface = create_gradio_interface()
    iface.launch()