|
from transformers import AutoTokenizer, AutoModelForMaskedLM |
|
import torch |
|
import gradio as gr |
|
import numpy as np |
|
import json |
|
|
|
class BertEmbeddingsGenerator:
    """Turn text into fixed-size vectors using a Hugging Face BERT checkpoint.

    The tokenizer and model are loaded once in ``__init__`` and reused for
    every ``get_embeddings`` call.
    """

    def __init__(self, model_name="tahrirchi/tahrirchi-bert-base"):
        """Load the tokenizer and masked-LM model for ``model_name``.

        Args:
            model_name (str): Identifier handed to ``from_pretrained`` for
                both the tokenizer and the model.
        """
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForMaskedLM.from_pretrained(model_name)
        # Inference only: put the module in evaluation mode.
        self.model.eval()

    def get_embeddings(self, text):
        """Embed ``text`` by mean-pooling the model's last hidden layer.

        Positions that the tokenizer marks as padding (attention mask == 0)
        are left out of the average, so a short text embedded together with
        a longer one gets the same vector it would get on its own.

        Args:
            text (str | list[str]): One text, or several texts to embed as a
                batch. Inputs are truncated to 512 tokens.

        Returns:
            np.ndarray: A 1-D vector of the hidden size when a single text is
            given, or a 2-D ``(batch, hidden)`` array for several texts.
        """
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512,
        )

        # No gradients are needed for inference; skipping them saves memory.
        with torch.no_grad():
            outputs = self.model(**inputs, output_hidden_states=True)

        # hidden_states[-1] is the output of the final layer.
        last_hidden_state = outputs.hidden_states[-1]

        attention_mask = inputs.get("attention_mask")
        if attention_mask is None:
            # Tokenizer supplied no mask: fall back to a plain average.
            embeddings = last_hidden_state.mean(dim=1)
        else:
            # Weight every token by its mask value so padding contributes 0.
            weights = attention_mask.unsqueeze(-1).to(last_hidden_state.dtype)
            # Guard against division by zero for a fully masked row.
            token_counts = weights.sum(dim=1).clamp(min=1.0)
            embeddings = (last_hidden_state * weights).sum(dim=1) / token_counts

        # Drop only the batch axis (and only when it has size 1).
        return embeddings.squeeze(0).cpu().numpy()
|
|
|
def create_gradio_interface():
    """Build the Gradio demo that maps input text to a JSON embedding payload.

    A single ``BertEmbeddingsGenerator`` is created here and shared by every
    request handled by the returned interface.

    Returns:
        gr.Interface: The configured (not yet launched) interface.
    """
    generator = BertEmbeddingsGenerator()

    def embed_text(input_text):
        """Return a JSON string describing the embedding of ``input_text``.

        Every response carries a ``"status"`` key: ``"success"`` responses
        also hold ``"embeddings"`` and ``"dimensions"``; ``"error"``
        responses hold an ``"error"`` message instead.
        """
        try:
            if not input_text or not input_text.strip():
                # Same shape as the other error response so clients can
                # always branch on "status".
                return json.dumps(
                    {"error": "Matn kiritilmadi", "status": "error"},
                    ensure_ascii=False,
                )

            embeddings = generator.get_embeddings(input_text)

            # NaN/Inf are not valid JSON numbers; emit null in their place.
            embeddings_list = np.where(
                np.isfinite(embeddings), embeddings, None
            ).tolist()

            output = {
                "embeddings": embeddings_list,
                "dimensions": len(embeddings_list),
                "status": "success",
            }
            return json.dumps(output, ensure_ascii=False)

        except Exception as e:
            # UI boundary: report the failure to the user instead of letting
            # the request crash.
            return json.dumps(
                {"error": str(e), "status": "error"},
                ensure_ascii=False,
            )

    iface = gr.Interface(
        fn=embed_text,
        inputs=gr.Textbox(
            lines=2,
            placeholder="Matn kiriting...",
            label="Input Text",
        ),
        outputs=gr.JSON(label="Embeddings"),
        title="O'zbek tili uchun embedding",
        description="O'zbek tili uchun Tahrirchi BERT Base modeli orqali embedding generatsiya qilish",
        examples=[
            ["Assalomu alaykum, men o'zbek tili bilan ishlayman"],
            ["O'zbek tili uchun Tahrirchi BERT Base modeli orqali embedding generatsiya qilish uchun namuna matn."],
        ],
    )
    return iface
|
|
|
if __name__ == "__main__":
    # Executed as a script: build the demo and start serving it.
    create_gradio_interface().launch()