Spaces:
Sleeping
Sleeping
Commit
·
9854175
1
Parent(s):
e30de0d
Modifying code of dataset file.
Browse files- my_dataset.py +14 -2
my_dataset.py
CHANGED
@@ -2,8 +2,20 @@
|
|
2 |
load dataset:
|
3 |
https://huggingface.co/docs/datasets/loading#hugging-face-hub
|
4 |
'''
|
|
|
|
|
|
|
5 |
from datasets import load_dataset
|
6 |
import faiss
|
7 |
-
|
8 |
datasetx = load_dataset("JosueElias/pipeline_dataset2")
|
9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
load dataset:
|
3 |
https://huggingface.co/docs/datasets/loading#hugging-face-hub
|
4 |
'''
|
5 |
+
|
6 |
+
from datasets import load_from_disk, Dataset
|
7 |
+
from huggingface_hub import hf_hub_download
|
8 |
from datasets import load_dataset
|
9 |
import faiss
|
10 |
+
# load wikipedia dataset
|
11 |
datasetx = load_dataset("JosueElias/pipeline_dataset2")
|
12 |
+
# download the faiss index file and get its local path
|
13 |
+
path2 = hf_hub_download(repo_id="JosueElias/pipeline_faiss", filename="faiss.index", repo_type="dataset")
|
14 |
+
# save wikipedia dataset locally
|
15 |
+
datasetx.save_to_disk("./directory")
|
16 |
+
# delete the variable to free memory
|
17 |
+
del datasetx
|
18 |
+
# reload the train split from disk (Arrow format)
|
19 |
+
datasetx = load_from_disk("./directory/train")
|
20 |
+
# load the faiss index file into the dataset under the index name 'embeddings'
|
21 |
+
datasetx.load_faiss_index('embeddings', path2)
|