MasaakiKotera committed on
Commit
1b3f51e
·
verified ·
1 Parent(s): 8f79412

Upload tokenization.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. tokenization.py +61 -54
tokenization.py CHANGED
@@ -9,80 +9,87 @@ def parse_arguments():
9
  parser.add_argument("--data_dir", type=str, required=True, help="Directory of the raw data.")
10
  parser.add_argument("--tokenizer_path", type=str, required=True, help="Path to the trained AutoTokenizer.")
11
  parser.add_argument("--out_dir", type=str, required=True, help="Directory of output files.")
12
- parser.add_argument("--end_with_eos", type=bool, default=True, help="Whether each line ends with `eos_token`.")
 
 
 
 
13
  return parser.parse_args()
14
 
15
- def shuffle_and_split_data(file_path, split_ratio=0.95):
16
- with open(file_path, 'r', encoding='utf-8') as f:
 
 
 
 
 
17
  lines = f.readlines()
18
-
19
  random.shuffle(lines)
20
  split_at = int(split_ratio * len(lines))
21
- return lines[:split_at], lines[split_at:]
22
-
23
- def write_to_file(file_path, lines):
24
- with open(file_path, 'w', encoding='utf-8') as f:
25
- for line in lines:
26
- f.write(line.replace(' ', ''))
27
-
28
- def tokenize_lines(tokenizer, lines, end_with_eos, block_size = 1e10):
29
- tokenized_ids = []
30
- for i, line in enumerate(lines):
31
- if not end_with_eos:
32
- line = line.strip() + tokenizer.eos_token
33
  ids = tokenizer.encode(line)
 
 
 
 
34
 
35
- #block size limitation
36
- if len(ids) <= block_size-1:
37
- tokenized_ids.extend(ids)
38
-
39
- if (i + 1) % 100000 == 0:
40
- print(f"Processed {i + 1} lines.")
41
- return tokenized_ids
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
  def save_tokenized_data(tokenized_data, file_path):
44
  np_data = np.array(tokenized_data, dtype=np.uint16)
45
  os.makedirs(os.path.dirname(file_path), exist_ok=True)
46
  np_data.tofile(file_path)
47
 
 
 
 
 
 
 
48
  def main():
49
- parser = argparse.ArgumentParser()
50
- parser.add_argument("--data_dir", type=str, required=True, help="Directory of raw data & output files")
51
- parser.add_argument("--file_name", type=str, default="data.txt",required=True)
52
- parser.add_argument("--out_dir", type=str, required=False, help="directory of output files(default=data_dir). A train.bin and a valid.bin will be built and expect to be used in train.py")
53
- parser.add_argument("--tokenizer_path", type=str, required=True, help="Path to a trained AutoTokenizer")
54
- parser.add_argument("--block_size", type=str, required=True, help="Max token length")
55
- args = parser.parse_args()
56
 
57
  # Paths setup
58
- if args.out_dir is None:
59
- out_dir = args.data_dir
60
- else:
61
- out_dir = args.out_dir
62
  raw_data_path = os.path.join(args.data_dir, args.file_name)
63
- train_txt_path = os.path.join(out_dir, 'train.txt')
64
- val_txt_path = os.path.join(out_dir, 'val.txt')
65
- train_bin_path = os.path.join(out_dir, 'train.bin')
66
- val_bin_path = os.path.join(out_dir, 'val.bin')
67
  print("Paths setup complete...")
68
 
69
- # Data preparation
70
- train_lines, val_lines = shuffle_and_split_data(raw_data_path)
71
- write_to_file(train_txt_path, train_lines)
72
- write_to_file(val_txt_path, val_lines)
73
- print("Data preparation complete...")
74
-
75
  # Tokenization
76
- end_with_eos = False
77
  tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path)
78
- train_ids = tokenize_lines(tokenizer, train_lines, end_with_eos, int(args.block_size))
79
- val_ids = tokenize_lines(tokenizer, val_lines, end_with_eos, int(args.block_size))
80
- print("Tokenization complete...")
81
-
82
- # Save tokenized data
83
- save_tokenized_data(train_ids, train_bin_path)
84
- save_tokenized_data(val_ids, val_bin_path)
85
- print("Tokenized data saved...")
86
 
87
  if __name__ == "__main__":
88
- main()
 
9
  parser.add_argument("--data_dir", type=str, required=True, help="Directory of the raw data.")
10
  parser.add_argument("--tokenizer_path", type=str, required=True, help="Path to the trained AutoTokenizer.")
11
  parser.add_argument("--out_dir", type=str, required=True, help="Directory of output files.")
12
+ parser.add_argument("--file_name", type=str, default="data.txt", required=True)
13
+ parser.add_argument("--block_size", type=int, default=512, help="Max token length.")
14
+ parser.add_argument("--is_start_with_eos", type=bool, default=False, help="Whether each line starts with `eos_token`.")
15
+ parser.add_argument("--is_end_with_eos", type=bool, default=False, help="Whether each line ends with `eos_token`.")
16
+ parser.add_argument("--split_ratio", type=float, default=0.99, help="Train-validation split ratio.")
17
  return parser.parse_args()
18
 
19
def _encode_lines(tokenizer, lines, is_start_with_eos, is_end_with_eos, block_size, report_progress=False):
    """Tokenize ``lines`` and keep only those shorter than ``block_size`` tokens.

    Returns a ``(token_ids, kept_lines)`` pair: the concatenated ids of every
    kept line, and the stripped text of those same lines in the same order.
    """
    # NOTE(review): id 0 is hard-coded as the EOS marker; presumably it equals
    # tokenizer.eos_token_id for the tokenizer in use -- confirm.
    eos_id = 0

    token_ids = []
    kept_lines = []
    for i, line in enumerate(lines):
        # Assumes tokenizer.encode returns a mutable list of ints -- TODO confirm.
        ids = tokenizer.encode(line)

        # NOTE(review): the branches are mutually exclusive on purpose-or-accident:
        # an EOS is appended when the line does not already end with one;
        # otherwise one is prepended when the line does not start with one.
        # With both flags False only the trailing EOS is added.
        if not is_end_with_eos:
            ids.append(eos_id)
        elif not is_start_with_eos:
            ids.insert(0, eos_id)

        # Strictly-less-than for every split, so train and val are filtered
        # by the same rule.
        if len(ids) < block_size:
            token_ids.extend(ids)
            kept_lines.append(line.strip())

        if report_progress and i % 1000000 == 0:
            print(f"now processing {i}...")
    return token_ids, kept_lines


def tokenize_and_save_lines(tokenizer, input_file, train_txt_file, val_txt_file, train_bin_file, val_bin_file, is_start_with_eos, is_end_with_eos, block_size, split_ratio):
    """Shuffle, split, tokenize and save the lines of ``input_file``.

    The whole file is read into memory, shuffled in place with ``random.shuffle``
    and cut at ``int(split_ratio * len(lines))``: the first part is the training
    split, the rest the validation split. Each line is tokenized; lines whose id
    sequence (including the added EOS id) has ``block_size`` tokens or more are
    dropped from both the binary and the text output.

    Args:
        tokenizer: Object exposing ``encode(str)``.
        input_file: Path of the UTF-8 text file to read, one sample per line.
        train_txt_file: Destination for the kept training lines (text).
        val_txt_file: Destination for the kept validation lines (text).
        train_bin_file: Destination for the training token ids (binary).
        val_bin_file: Destination for the validation token ids (binary).
        is_start_with_eos: Whether each raw line already starts with EOS.
        is_end_with_eos: Whether each raw line already ends with EOS.
        block_size: Exclusive upper bound on the token count of a kept line.
        split_ratio: Fraction of lines assigned to the training split.
    """
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    random.shuffle(lines)
    split_at = int(split_ratio * len(lines))

    train_ids, train_lines = _encode_lines(
        tokenizer, lines[:split_at], is_start_with_eos, is_end_with_eos,
        block_size, report_progress=True,
    )
    val_ids, val_lines = _encode_lines(
        tokenizer, lines[split_at:], is_start_with_eos, is_end_with_eos,
        block_size,
    )

    # Save tokenized data
    save_tokenized_data(train_ids, train_bin_file)
    save_tokenized_data(val_ids, val_bin_file)
    print("Tokenized data saved...")

    # Save text data
    save_text_data(train_lines, train_txt_file)
    save_text_data(val_lines, val_txt_file)
    print("Text data saved...")
66
 
67
def save_tokenized_data(tokenized_data, file_path):
    """Write token ids to ``file_path`` as a flat array of raw ``uint16`` values.

    Args:
        tokenized_data: Sequence of integer token ids.
        file_path: Destination file; missing parent directories are created.

    Note:
        Ids are stored as ``np.uint16``, so values above 65535 cannot be
        represented in the output file.
    """
    np_data = np.array(tokenized_data, dtype=np.uint16)
    # dirname is '' for a bare filename, and os.makedirs('') raises
    # FileNotFoundError -- only create a directory when there is one to create.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    np_data.tofile(file_path)
71
 
72
def save_text_data(text_data, file_path):
    """Write each string of ``text_data`` to ``file_path``, one per line (UTF-8).

    Args:
        text_data: Iterable of strings; a newline is appended to each.
        file_path: Destination file; missing parent directories are created.
    """
    # dirname is '' for a bare filename, and os.makedirs('') raises
    # FileNotFoundError -- only create a directory when there is one to create.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, 'w', encoding='utf-8') as f:
        f.writelines(line + '\n' for line in text_data)
77
+
78
def main():
    """Command-line entry point: resolve paths, load the tokenizer, run the pipeline.

    Reads the parsed arguments, derives the raw-data path from ``data_dir`` and
    ``file_name`` and the four output paths from ``out_dir``, then hands
    everything to ``tokenize_and_save_lines``.
    """
    args = parse_arguments()

    # Paths setup
    source_path = os.path.join(args.data_dir, args.file_name)
    train_txt, val_txt, train_bin, val_bin = (
        os.path.join(args.out_dir, name)
        for name in ('train.txt', 'val.txt', 'train.bin', 'val.bin')
    )
    print("Paths setup complete...")

    # Tokenization
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path)
    tokenize_and_save_lines(
        tokenizer,
        source_path,
        train_txt,
        val_txt,
        train_bin,
        val_bin,
        args.is_start_with_eos,
        args.is_end_with_eos,
        args.block_size,
        args.split_ratio,
    )
    print("Tokenization and data saving")


# Run the pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()