Add dataset script
Browse files- .gitignore +1 -0
- src/dataset_prep.py +33 -0
.gitignore
CHANGED
@@ -1 +1,2 @@
|
|
1 |
.ipynb_checkpoints
|
|
|
|
1 |
.ipynb_checkpoints
|
2 |
+
data
|
src/dataset_prep.py
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import tarfile
|
2 |
+
from ast import literal_eval
|
3 |
+
from rich.progress import track
|
4 |
+
from pathlib import Path
|
5 |
+
import pandas as pd
|
6 |
+
|
7 |
+
def tar_file_to_string(filename):
|
8 |
+
with tarfile.open(filename, "r:gz") as tar:
|
9 |
+
for member in tar.getmembers():
|
10 |
+
f = tar.extractfile(member)
|
11 |
+
data = f.readline()
|
12 |
+
data = data.decode("utf-8")
|
13 |
+
data = data.split("{'url'")
|
14 |
+
data = [("{'url'" + item) for item in data]
|
15 |
+
data = data[1:]
|
16 |
+
return data
|
17 |
+
|
if __name__ == "__main__":
    # Raw .tar.gz archives live in ../HEAD; filtered records are written
    # to ../HEAD_CLEAN as JSON-lines, one output file per archive.
    data = Path('../HEAD')
    out_dir = Path('../HEAD_CLEAN')
    for tar_gz in data.iterdir():
        # "foo.tar.gz" -> "foo"; names both the log line and the output file.
        # The original wrote every archive to the same literal "(unknown)"
        # path, clobbering prior output — interpolate the stem instead.
        filename = tar_gz.name.split('.tar.gz')[0]
        print(f"Now extracting {filename}")
        text = tar_file_to_string(tar_gz)
        filtered = []
        for item in track(text):
            try:
                # Parse once (the original called literal_eval twice per
                # kept record) and keep only high-confidence-language rows.
                record = literal_eval(item)
                if record['language_score'] > 0.98:
                    filtered.append(record)
            except (ValueError, SyntaxError, KeyError, TypeError):
                # Malformed fragments are expected from the naive split;
                # skip them (narrowed from the original bare except).
                continue
        filtered = pd.DataFrame(filtered)
        filtered.to_json(out_dir / f'{filename}.jsonl', orient='records', lines=True)