"""Gradio demo: semantic search over Nepali news articles.

Queries are embedded with the ``jangedoo/all-MiniLM-L6-v2-nepali``
sentence-transformers model and looked up in a pre-built ``easyknn`` index
loaded from ``./data/knn_index``.
"""

import easyknn
import gradio as gr
import pandas as pd
from sentence_transformers import SentenceTransformer

# Number of leading characters of each article shown in the results table.
_PREVIEW_CHARS = 200
# Marker appended to every preview; stripped again before re-querying.
_PREVIEW_SUFFIX = " ..."
# Default number of neighbours to return.
_DEFAULT_K = 5

model = SentenceTransformer("jangedoo/all-MiniLM-L6-v2-nepali")
knn = easyknn.EasyKNN.load("./data/knn_index")


def search(query: str, k=_DEFAULT_K):
    """Return the ``k`` indexed articles closest to ``query``.

    Args:
        query: Free-text query to embed and search for.
        k: Number of results to return. Coerced to ``int`` because UI
            widgets such as sliders may deliver the value as a float.

    Returns:
        A ``pandas.DataFrame`` with columns ``article`` (the first
        ``_PREVIEW_CHARS`` characters of each hit followed by
        ``_PREVIEW_SUFFIX``) and ``distance`` (the scores reported by the
        index). An empty frame with the same columns is returned for an
        empty or whitespace-only query.
    """
    # Nothing meaningful to embed: skip the model and the index entirely.
    if not query or not query.strip():
        return pd.DataFrame(dict(article=[], distance=[]))

    query_embeddings = model.encode(
        query, normalize_embeddings=True, convert_to_numpy=True
    )
    # NOTE(review): assumes ``neighbors`` returns (items, scores) as two
    # parallel sequences of length k, with items being article strings.
    items, scores = knn.neighbors(query_embeddings, k=int(k))
    previews = [f"{item[:_PREVIEW_CHARS]}{_PREVIEW_SUFFIX}" for item in items]
    return pd.DataFrame(dict(article=previews, distance=scores))


def search_duplicate_news(evt: gr.SelectData):
    """Search for articles similar to the row selected in the results table.

    Args:
        evt: Gradio selection event; ``evt.row_value[0]`` is the ``article``
            cell of the selected row.

    Returns:
        The ``search`` result for the selected article preview.
    """
    preview = evt.row_value[0]
    # Strip only the trailing marker added by ``search``; a blanket
    # ``str.replace`` would also delete any " ..." that is part of the
    # article text itself and thereby alter the query.
    if preview.endswith(_PREVIEW_SUFFIX):
        preview = preview[: -len(_PREVIEW_SUFFIX)]
    return search(preview, k=_DEFAULT_K)


with gr.Blocks() as demo:
    gr.Markdown(
        """
        ## Demo of [jangedoo/all-MiniLM-L6-v2-nepali](https://huggingface.co/jangedoo/all-MiniLM-L6-v2-nepali) model.
        5,000 Nepali news articles (source dataset: mridul3301/nepali-text-corpus-64) have been embedded using this model. FAISS library is used for similarity search and the embeddings have been quantized to 8bit integers to tradeoff performance vs resource usage.

        You can use **Nepali** as well as **English** for your queries. However, English queries are kind of hit-and-miss.
        """
    )
    gr.Markdown("Enter a search query and select number of docs you want to return")
    with gr.Row():
        query = gr.Textbox(placeholder="query")
        num_results = gr.Slider(
            minimum=1, maximum=10, value=5, step=1, label="Number of results"
        )

    examples = gr.Examples(
        [
            "विद्युत् प्राधिकरण",
            "capital city",
            "विद्यादेवी भण्डारी",
            "सवारी दुर्घटना",
            "वैदेशिक रोजगार",
            "prime minister",
        ],
        query,
    )
    btn = gr.Button("Search")
    out = gr.DataFrame(headers=["article", "distance"])

    gr.Markdown("**Select an article above to see similar articles.**")
    duplicate_news = gr.DataFrame(headers=["article", "distance"])

    # Main search: query text + slider value -> results table.
    btn.click(fn=search, inputs=[query, num_results], outputs=out)
    # Clicking a result row triggers a follow-up similarity search.
    out.select(search_duplicate_news, outputs=duplicate_news)


# Only start the server when executed as a script, so the module can be
# imported (e.g. by tooling) without blocking on a running server.
if __name__ == "__main__":
    demo.launch(debug=True)