"""Build and persist a FAISS kNN index over Nepali Wikipedia embeddings.

Streams the first 5,000 articles of the 2023-11-01 Nepali Wikipedia dump,
embeds a 1,500-character prefix of each article with a Nepali MiniLM
sentence-transformer, and saves the resulting index to ./data/knn_index.
"""

from pathlib import Path

import datasets
import easyknn
from sentence_transformers import SentenceTransformer


def main() -> None:
    """Download, embed, index, and save — the whole pipeline."""
    # Stream the dataset so only the first 5,000 rows are fetched,
    # then materialize them since we iterate the rows more than once.
    ds = list(
        datasets.load_dataset(
            "wikimedia/wikipedia", "20231101.ne", split="train", streaming=True
        ).take(5000)
    )

    texts = [row["text"] for row in ds]
    urls = [row["url"] for row in ds]

    model = SentenceTransformer("jangedoo/all-MiniLM-L6-v2-nepali")
    # Truncate each article to 1,500 characters to keep tokenization cheap
    # (the model attends to a short prefix anyway). Normalized embeddings
    # make inner-product search in FAISS equivalent to cosine similarity.
    embeddings = model.encode(
        [text[:1500] for text in texts],
        normalize_embeddings=True,
        convert_to_numpy=True,
        show_progress_bar=True,
    )

    builder = easyknn.EmbeddingsIndexBuilder()
    # URLs serve as unique item keys; the full row is stored as the payload.
    builder.add(embeddings=embeddings, item_keys=urls, items=ds)
    knn = easyknn.EasyKNN.from_builder_with_faiss(builder=builder)

    # Fix: saving fails if the parent directory does not exist — create it.
    out_path = Path("./data/knn_index")
    out_path.parent.mkdir(parents=True, exist_ok=True)
    knn.save(str(out_path))


if __name__ == "__main__":
    main()