Spaces:
Running
Running
Sanjaya Subedi
committed on
Commit
·
ed813a0
1
Parent(s):
53f48ad
initial
Browse files- README.md +2 -0
- app.py +61 -0
- create_index.py +19 -0
- poetry.lock +0 -0
- pyproject.toml +22 -0
- requirements.txt +3 -0
README.md
CHANGED
@@ -8,6 +8,8 @@ sdk_version: 4.42.0
|
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: apache-2.0
|
|
|
|
|
11 |
---
|
12 |
|
13 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
license: apache-2.0
|
11 |
+
models:
|
12 |
+
- jangedoo/all-MiniLM-L6-v2-nepali
|
13 |
---
|
14 |
|
15 |
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
app.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import easyknn
|
2 |
+
import gradio as gr
|
3 |
+
import pandas as pd
|
4 |
+
from sentence_transformers import SentenceTransformer
|
5 |
+
|
6 |
+
# Sentence-embedding checkpoint used to encode incoming search queries.
MODEL_NAME = "jangedoo/all-MiniLM-L6-v2-nepali"
# On-disk location of the saved nearest-neighbour index.
INDEX_PATH = "./data/knn_index"

# Both objects are loaded once at import time and shared by every request.
model = SentenceTransformer(MODEL_NAME)
knn = easyknn.EasyKNN.load(INDEX_PATH)
|
8 |
+
|
9 |
+
|
10 |
+
def search(query: str, k=5):
    """Return the ``k`` indexed articles nearest to ``query``.

    Args:
        query: Free-text search string. It is encoded with the module-level
            ``model`` using normalised embeddings.
        k: Number of neighbours to request from the index. The value is
            coerced to ``int`` because UI widgets such as sliders may hand
            it over as a float.

    Returns:
        A ``pandas.DataFrame`` with two columns: ``article`` (each text cut
        to its first 200 characters and suffixed with " ...") and
        ``distance`` (the scores reported by the index).
    """
    # A slider value can arrive as e.g. 5.0; the index needs a whole number.
    k = int(k)
    query_embeddings = model.encode(
        query, normalize_embeddings=True, convert_to_numpy=True
    )
    items, scores = knn.neighbors(query_embeddings, k=k)
    # Show a short preview only; the " ..." marker is stripped again by the
    # row-select handler before the preview is reused as a query.
    items = [f"{item[:200]} ..." for item in items]
    return pd.DataFrame(dict(article=items, distance=scores))
|
17 |
+
|
18 |
+
|
19 |
+
def search_duplicate_news(evt: gr.SelectData):
    """Find articles similar to the result row the user just selected.

    Args:
        evt: Selection event from the results table; ``evt.row_value[0]`` is
            read as the article preview text of the selected row.

    Returns:
        The ``search`` result for that preview text with ``k=5``.
    """
    preview = evt.row_value[0]
    # Drop only the trailing " ..." marker appended to previews. A blanket
    # str.replace would also delete any " ..." occurring inside the article
    # text and so change the query that is sent to the index.
    return search(preview.removesuffix(" ..."), k=5)
|
21 |
+
|
22 |
+
|
23 |
+
# Page layout: intro text, query controls, a results table, and a second
# table filled with neighbours of whichever result row the user selects.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        ## Demo of [jangedoo/all-MiniLM-L6-v2-nepali](https://huggingface.co/jangedoo/all-MiniLM-L6-v2-nepali) model.

        5,000 Nepali news articles (source dataset: mridul3301/nepali-text-corpus-64) have been embedded using this model.

        FAISS library is used for similarity search and the embeddings have been quantized to 8bit integers to tradeoff performance vs resource usage.

        You can use **Nepali** as well as **English** for your queries. However, English queries are kind of hit-and-miss.
        """
    )
    gr.Markdown("Enter a search query and select number of docs you want to return")

    # The query box and the result-count slider sit side by side.
    with gr.Row():
        query = gr.Textbox(placeholder="query")
        num_results = gr.Slider(
            label="Number of results",
            minimum=1,
            maximum=10,
            step=1,
            value=5,
        )

    # Sample queries bound to the query textbox.
    examples = gr.Examples(
        examples=[
            "विद्युत् प्राधिकरण",
            "capital city",
            "विद्यादेवी भण्डारी",
            "सवारी दुर्घटना",
            "वैदेशिक रोजगार",
            "prime minister",
        ],
        inputs=query,
    )
    btn = gr.Button("Search")
    out = gr.DataFrame(headers=["article", "distance"])
    gr.Markdown("**Select an article above to see similar articles.**")
    duplicate_news = gr.DataFrame(headers=["article", "distance"])

    # Event wiring: the button runs the search; selecting a result row
    # populates the second table via search_duplicate_news.
    btn.click(fn=search, inputs=[query, num_results], outputs=out)
    out.select(fn=search_duplicate_news, outputs=duplicate_news)

demo.launch(debug=True)
|
create_index.py
ADDED
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import datasets
|
2 |
+
import easyknn
|
3 |
+
from sentence_transformers import SentenceTransformer
|
4 |
+
|
5 |
+
# Open the corpus in streaming mode and keep only its first 5000 rows.
ds = datasets.load_dataset(
    "mridul3301/nepali-text-corpus-64",
    split="train",
    streaming=True,
).take(5000)
model = SentenceTransformer("jangedoo/all-MiniLM-L6-v2-nepali")

# Collect the raw article texts, then embed them in a single encode call.
texts = [row["Article"] for row in ds]
embeddings = model.encode(
    texts,
    normalize_embeddings=True,
    convert_to_numpy=True,
    show_progress_bar=True,
)

# Pair every embedding with its source text, build the FAISS-backed index,
# and write it to the path the app loads at start-up.
builder = easyknn.EmbeddingsIndexBuilder()
builder.add(embeddings=embeddings, items=texts)
knn = easyknn.EasyKNN.from_builder_with_faiss(builder=builder)
knn.save("./data/knn_index")
|
poetry.lock
ADDED
The diff for this file is too large to render.
See raw diff
|
|
pyproject.toml
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[tool.poetry]
|
2 |
+
name = "nepali-minilm-demo"
|
3 |
+
version = "0.1.0"
|
4 |
+
description = ""
|
5 |
+
authors = ["Sanjaya Subedi <[email protected]>"]
|
6 |
+
readme = "README.md"
|
7 |
+
packages = [{include = "nepali_minilm_demo"}]
|
8 |
+
|
9 |
+
[tool.poetry.dependencies]
|
10 |
+
python = "^3.11"
|
11 |
+
gradio = "^4.42.0"
|
12 |
+
transformers = "^4.44.1"
|
13 |
+
sentence-transformers = "^3.0.1"
|
14 |
+
datasets = "^2.21.0"
|
15 |
+
faiss-cpu = "^1.8.0.post1"
|
16 |
+
torch = "2.1"
|
17 |
+
easyknn = "^0.4.1"
|
18 |
+
|
19 |
+
|
20 |
+
[build-system]
|
21 |
+
requires = ["poetry-core"]
|
22 |
+
build-backend = "poetry.core.masonry.api"
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
easyknn==0.4.1
|
2 |
+
faiss-cpu==1.8.0.post1
|
3 |
+
sentence-transformers==3.0.1
|