Sanjaya Subedi committed on
Commit
ed813a0
·
1 Parent(s): 53f48ad
Files changed (6) hide show
  1. README.md +2 -0
  2. app.py +61 -0
  3. create_index.py +19 -0
  4. poetry.lock +0 -0
  5. pyproject.toml +22 -0
  6. requirements.txt +3 -0
README.md CHANGED
@@ -8,6 +8,8 @@ sdk_version: 4.42.0
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
 
 
11
  ---
12
 
13
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
8
  app_file: app.py
9
  pinned: false
10
  license: apache-2.0
11
+ models:
12
+ - jangedoo/all-MiniLM-L6-v2-nepali
13
  ---
14
 
15
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import easyknn
2
+ import gradio as gr
3
+ import pandas as pd
4
+ from sentence_transformers import SentenceTransformer
5
+
6
+ model = SentenceTransformer("jangedoo/all-MiniLM-L6-v2-nepali")
7
+ knn = easyknn.EasyKNN.load("./data/knn_index")
8
+
9
+
10
+ def search(query: str, k=5):
11
+ query_embeddings = model.encode(
12
+ query, normalize_embeddings=True, convert_to_numpy=True
13
+ )
14
+ items, scores = knn.neighbors(query_embeddings, k=k)
15
+ items = [f"{item[:200]} ..." for item in items]
16
+ return pd.DataFrame(dict(article=items, distance=scores))
17
+
18
+
19
+ def search_duplicate_news(evt: gr.SelectData):
20
+ return search(evt.row_value[0].replace(" ...", ""), k=5)
21
+
22
+
23
+ with gr.Blocks() as demo:
24
+ gr.Markdown(
25
+ """
26
+ ## Demo of [jangedoo/all-MiniLM-L6-v2-nepali](https://huggingface.co/jangedoo/all-MiniLM-L6-v2-nepali) model.
27
+
28
+ 5,000 Nepali news articles (source dataset: mridul3301/nepali-text-corpus-64) have been embedded using this model.
29
+
30
+ FAISS library is used for similarity search and the embeddings have been quantized to 8bit integers to tradeoff performance vs resource usage.
31
+
32
+ You can use **Nepali** as well as **English** for your queries. However, English queries are kind of hit-and-miss.
33
+ """
34
+ )
35
+ gr.Markdown("Enter a search query and select number of docs you want to return")
36
+ with gr.Row():
37
+ query = gr.Textbox(placeholder="query")
38
+ num_results = gr.Slider(
39
+ minimum=1, maximum=10, value=5, step=1, label="Number of results"
40
+ )
41
+
42
+ examples = gr.Examples(
43
+ [
44
+ "विद्युत् प्राधिकरण",
45
+ "capital city",
46
+ "विद्यादेवी भण्डारी",
47
+ "सवारी दुर्घटना",
48
+ "वैदेशिक रोजगार",
49
+ "prime minister",
50
+ ],
51
+ query,
52
+ )
53
+ btn = gr.Button("Search")
54
+ out = gr.DataFrame(headers=["article", "distance"])
55
+ gr.Markdown("**Select an article above to see similar articles.**")
56
+ duplicate_news = gr.DataFrame(headers=["article", "distance"])
57
+
58
+ btn.click(fn=search, inputs=[query, num_results], outputs=out)
59
+ out.select(search_duplicate_news, outputs=duplicate_news)
60
+
61
+ demo.launch(debug=True)
create_index.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import datasets
2
+ import easyknn
3
+ from sentence_transformers import SentenceTransformer
4
+
5
+ ds = datasets.load_dataset(
6
+ "mridul3301/nepali-text-corpus-64", split="train", streaming=True
7
+ )
8
+ ds = ds.take(5000)
9
+ model = SentenceTransformer("jangedoo/all-MiniLM-L6-v2-nepali")
10
+
11
+ texts = [row["Article"] for row in ds]
12
+ embeddings = model.encode(
13
+ texts, normalize_embeddings=True, convert_to_numpy=True, show_progress_bar=True
14
+ )
15
+
16
+ builder = easyknn.EmbeddingsIndexBuilder()
17
+ builder.add(embeddings=embeddings, items=texts)
18
+ knn = easyknn.EasyKNN.from_builder_with_faiss(builder=builder)
19
+ knn.save("./data/knn_index")
poetry.lock ADDED
The diff for this file is too large to render. See raw diff
 
pyproject.toml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [tool.poetry]
2
+ name = "nepali-minilm-demo"
3
+ version = "0.1.0"
4
+ description = ""
5
+ authors = ["Sanjaya Subedi <[email protected]>"]
6
+ readme = "README.md"
7
+ packages = [{include = "nepali_minilm_demo"}]
8
+
9
+ [tool.poetry.dependencies]
10
+ python = "^3.11"
11
+ gradio = "^4.42.0"
12
+ transformers = "^4.44.1"
13
+ sentence-transformers = "^3.0.1"
14
+ datasets = "^2.21.0"
15
+ faiss-cpu = "^1.8.0.post1"
16
+ torch = "2.1"
17
+ easyknn = "^0.4.1"
18
+
19
+
20
+ [build-system]
21
+ requires = ["poetry-core"]
22
+ build-backend = "poetry.core.masonry.api"
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ easyknn==0.4.1
2
+ faiss-cpu==1.8.0.post1
3
+ sentence-transformers==3.0.1