File size: 3,619 Bytes
508087f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
import os
import numpy as np
from transformers import AutoTokenizer
import random
import argparse
def _str_to_bool(value):
    """Convert a command-line string such as ``"false"`` or ``"1"`` to a bool.

    Raises:
        argparse.ArgumentTypeError: if *value* is not a recognised spelling.
    """
    if isinstance(value, bool):
        return value
    lowered = value.strip().lower()
    if lowered in ("true", "t", "yes", "y", "1"):
        return True
    # The empty string is kept as a "false" spelling because it was the only
    # input that produced False under the previous ``type=bool`` behaviour.
    if lowered in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def parse_arguments():
    """Parse the command-line options for the tokenization script.

    Returns:
        argparse.Namespace with ``data_dir``, ``tokenizer_path``, ``out_dir``
        (all required strings) and ``end_with_eos`` (bool, default ``True``).
    """
    parser = argparse.ArgumentParser(description='Process the text data for tokenization.')
    parser.add_argument("--data_dir", type=str, required=True, help="Directory of the raw data.")
    parser.add_argument("--tokenizer_path", type=str, required=True, help="Path to the trained AutoTokenizer.")
    parser.add_argument("--out_dir", type=str, required=True, help="Directory of output files.")
    # ``type=bool`` would turn every non-empty string (including "False") into
    # True, so the value is parsed explicitly instead.
    parser.add_argument("--end_with_eos", type=_str_to_bool, default=True, help="Whether each line ends with `eos_token`.")
    return parser.parse_args()
def shuffle_and_split_data(file_path, split_ratio=0.95):
    """Load a UTF-8 text file, shuffle its lines and cut them into two parts.

    Args:
        file_path: Path of the text file to read.
        split_ratio: Fraction of the lines that goes into the first part.

    Returns:
        A ``(first, second)`` tuple of line lists. ``first`` holds the leading
        ``int(split_ratio * line_count)`` shuffled lines, ``second`` the rest.
        Lines keep whatever line ending they had in the file.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        records = list(handle)

    # In-place shuffle driven by the module-level ``random`` generator.
    random.shuffle(records)

    cut = int(split_ratio * len(records))
    first_part = records[:cut]
    second_part = records[cut:]
    return first_part, second_part
def write_to_file(file_path, lines):
    """Write *lines* to *file_path* as UTF-8 with every space character removed.

    Each entry is written as its own line: an entry that does not already end
    with a newline gets one appended.

    Args:
        file_path: Destination path; an existing file is overwritten.
        lines: Iterable of strings to write.
    """
    with open(file_path, 'w', encoding='utf-8') as f:
        for line in lines:
            cleaned = line.replace(' ', '')
            # The last line of a text file often has no trailing newline; once
            # the lines are shuffled it can land mid-list and would otherwise
            # be fused with the entry written after it.
            if not cleaned.endswith('\n'):
                cleaned += '\n'
            f.write(cleaned)
def tokenize_lines(tokenizer, lines, end_with_eos, block_size=1e10):
    """Encode every line and concatenate the token IDs into one flat list.

    Args:
        tokenizer: Object exposing ``encode(text)`` and an ``eos_token`` string.
        lines: Iterable of text lines.
        end_with_eos: When false, each line is stripped of surrounding
            whitespace and ``tokenizer.eos_token`` is appended before encoding.
            When true, the line is encoded exactly as given.
        block_size: Lines whose encoding is longer than ``block_size - 1``
            tokens are left out of the result.

    Returns:
        A list with the token IDs of all kept lines, in input order.
    """
    longest_allowed = block_size - 1
    collected = []
    for count, text in enumerate(lines, start=1):
        if end_with_eos:
            encoded = tokenizer.encode(text)
        else:
            encoded = tokenizer.encode(text.strip() + tokenizer.eos_token)

        # Over-long lines are dropped whole rather than truncated.
        if len(encoded) <= longest_allowed:
            collected += encoded

        # Progress report once every 100000 input lines.
        if count % 100000 == 0:
            print(f"Processed {count} lines.")
    return collected
def save_tokenized_data(tokenized_data, file_path):
    """Write token IDs to *file_path* as a raw array of unsigned 16-bit integers.

    Args:
        tokenized_data: Sequence of integer token IDs. They are stored as
            ``uint16``, so every ID has to fit in the range 0-65535.
        file_path: Destination file. Its parent directory is created if it is
            missing; a bare file name is written to the current directory.
    """
    np_data = np.array(tokenized_data, dtype=np.uint16)
    # ``os.path.dirname`` returns "" for a bare file name, and
    # ``os.makedirs("")`` raises FileNotFoundError, so only create a real parent.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    np_data.tofile(file_path)
def main():
    """Turn a raw text file into ``train.bin`` / ``val.bin`` token files.

    Steps, in order:
      1. Parse the command-line options.
      2. Shuffle the raw file's lines and split them into train / validation.
      3. Write both splits as ``train.txt`` / ``val.txt`` in the output directory.
      4. Tokenize both splits with the tokenizer loaded from ``--tokenizer_path``.
      5. Save the token IDs as ``train.bin`` / ``val.bin`` in the output directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_dir", type=str, required=True, help="Directory of raw data & output files")
    # Not required: a required option can never fall back to its default.
    parser.add_argument("--file_name", type=str, default="data.txt", help="Name of the raw text file inside data_dir (default=data.txt)")
    parser.add_argument("--out_dir", type=str, required=False, help="directory of output files(default=data_dir). A train.bin and a val.bin will be built and expect to be used in train.py")
    parser.add_argument("--tokenizer_path", type=str, required=True, help="Path to a trained AutoTokenizer")
    # Parsed as int by argparse so a bad value yields a usage error, not a traceback.
    parser.add_argument("--block_size", type=int, required=True, help="Max token length")
    args = parser.parse_args()

    # Paths setup
    out_dir = args.data_dir if args.out_dir is None else args.out_dir
    # The text splits are written before the .bin files, so the output
    # directory has to exist up front.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    raw_data_path = os.path.join(args.data_dir, args.file_name)
    train_txt_path = os.path.join(out_dir, 'train.txt')
    val_txt_path = os.path.join(out_dir, 'val.txt')
    train_bin_path = os.path.join(out_dir, 'train.bin')
    val_bin_path = os.path.join(out_dir, 'val.bin')
    print("Paths setup complete...")

    # Data preparation
    train_lines, val_lines = shuffle_and_split_data(raw_data_path)
    write_to_file(train_txt_path, train_lines)
    write_to_file(val_txt_path, val_lines)
    print("Data preparation complete...")

    # Tokenization
    # False here makes tokenize_lines strip each line and append the
    # tokenizer's eos_token before encoding.
    end_with_eos = False
    tokenizer = AutoTokenizer.from_pretrained(args.tokenizer_path)
    train_ids = tokenize_lines(tokenizer, train_lines, end_with_eos, args.block_size)
    val_ids = tokenize_lines(tokenizer, val_lines, end_with_eos, args.block_size)
    print("Tokenization complete...")

    # Save tokenized data
    save_tokenized_data(train_ids, train_bin_path)
    save_tokenized_data(val_ids, val_bin_path)
    print("Tokenized data saved...")


if __name__ == "__main__":
    main()
|