cakiki committed on
Commit
8a3d76c
·
1 Parent(s): 6e44597

Add dataset script

Browse files
Files changed (2) hide show
  1. .gitignore +1 -0
  2. src/dataset_prep.py +33 -0
.gitignore CHANGED
@@ -1 +1,2 @@
1
  .ipynb_checkpoints
 
 
1
  .ipynb_checkpoints
2
+ data
src/dataset_prep.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tarfile
2
+ from ast import literal_eval
3
+ from rich.progress import track
4
+ from pathlib import Path
5
+ import pandas as pd
6
+
7
def tar_file_to_string(filename):
    """Return the record strings found in a gzip-compressed tar archive.

    For every regular-file member, the first line is read, decoded as
    UTF-8 and split on the literal marker ``{'url'``. Each piece that
    follows a marker is returned with the marker re-attached, so every
    item starts with ``{'url'``. Text before the first marker is dropped.

    Args:
        filename: Path (``str`` or path-like) of the ``.tar.gz`` archive.

    Returns:
        A list of strings, one per record, gathered from all file members
        in archive order. An archive with no readable members yields ``[]``.

    Raises:
        tarfile.TarError: If the archive cannot be opened or read.
        UnicodeDecodeError: If a member's first line is not valid UTF-8.
    """
    marker = "{'url'"
    records = []
    with tarfile.open(filename, "r:gz") as tar:
        for member in tar.getmembers():
            extracted = tar.extractfile(member)
            # extractfile() returns None for directories and other
            # non-file members; there is nothing to read from those.
            if extracted is None:
                continue
            with extracted:
                # Only the first line of each member is consumed.
                line = extracted.readline().decode("utf-8")
            # The chunk before the first marker is not a record; skip it.
            pieces = line.split(marker)[1:]
            records.extend(marker + piece for piece in pieces)
    return records
17
+
18
if __name__ == "__main__":
    # Script entry point: for every archive in ../HEAD, keep the records
    # whose 'language_score' exceeds 0.98 and write them as JSON Lines to
    # ../HEAD_CLEAN/<archive name>.jsonl.
    source_dir = Path('../HEAD')
    output_dir = Path('../HEAD_CLEAN')
    # to_json() does not create missing directories, so make sure it exists.
    output_dir.mkdir(parents=True, exist_ok=True)

    for tar_gz in source_dir.iterdir():
        filename = tar_gz.name.split('.tar.gz')[0]
        print(f"Now extracting {filename}")
        text = tar_file_to_string(tar_gz)

        filtered = []
        skipped = 0
        for item in track(text):
            try:
                # Parse once; the parsed record is reused if it is kept.
                record = literal_eval(item)
                keep = record['language_score'] > 0.98
            except (ValueError, SyntaxError, TypeError, KeyError,
                    MemoryError, RecursionError):
                # Best effort: a malformed record, or one without a usable
                # 'language_score', is skipped rather than aborting the run.
                skipped += 1
                continue
            if keep:
                filtered.append(record)

        if skipped:
            print(f"Skipped {skipped} unparseable records in {filename}")

        filtered = pd.DataFrame(filtered)
        filtered.to_json(output_dir / f'{filename}.jsonl', orient='records', lines=True)
33
+