from ast import literal_eval

from src.utils.mapper import configmapper
from transformers import AutoTokenizer
import pandas as pd
from datasets import load_dataset, Dataset

from evaluation.fix_spans import _contiguous_ranges


@configmapper.map("datasets", "toxic_spans_multi_spans")
class ToxicSpansMultiSpansDataset:
    def __init__(self, config):
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.config.model_checkpoint_name
        )
        self.dataset = load_dataset("csv", data_files=dict(self.config.train_files))
        self.test_dataset = load_dataset(
            "csv", data_files=dict(self.config.eval_files)
        )

        temp_key_train = list(self.dataset.keys())[0]
        self.intermediate_dataset = self.dataset.map(
            self.create_train_features,
            batched=True,
            batch_size=1000000,  # Unusually large batch size, needed for correct ID mapping.
            remove_columns=self.dataset[temp_key_train].column_names,
        )

        temp_key_test = list(self.test_dataset.keys())[0]
        self.intermediate_test_dataset = self.test_dataset.map(
            self.create_test_features,
            batched=True,
            batch_size=1000000,  # Unusually large batch size, needed for correct ID mapping.
            remove_columns=self.test_dataset[temp_key_test].column_names,
        )

        self.tokenized_inputs = self.intermediate_dataset.map(
            self.prepare_train_features,
            batched=True,
            remove_columns=self.intermediate_dataset[temp_key_train].column_names,
        )
        self.test_tokenized_inputs = self.intermediate_test_dataset.map(
            self.prepare_test_features,
            batched=True,
            remove_columns=self.intermediate_test_dataset[temp_key_test].column_names,
        )

    def create_train_features(self, examples):
        """Build SQuAD-style examples with per-span start/end character positions."""
        features = {
            "context": [],
            "id": [],
            "question": [],
            "title": [],
            "start_positions": [],
            "end_positions": [],
        }
        id = 0
        for row_number in range(len(examples["text"])):
            context = examples["text"][row_number]
            question = "offense"
            title = context.split(" ")[0]
            start_positions = []
            end_positions = []

            # "spans" is stored as the string form of a list of toxic character
            # offsets; parse it and collapse it into contiguous (start, end) ranges.
            span = literal_eval(examples["spans"][row_number])
            contiguous_spans = _contiguous_ranges(span)
            for lst in contiguous_spans:
                lst = list(lst)
                start_positions.append(lst[0])
                end_positions.append(lst[1])

            features["context"].append(context)
            features["id"].append(str(id))
            features["question"].append(question)
            features["title"].append(title)
            features["start_positions"].append(start_positions)
            features["end_positions"].append(end_positions)
            id += 1
        return features

    def create_test_features(self, examples):
        """Build SQuAD-style examples without labels for the test set."""
        features = {"context": [], "id": [], "question": [], "title": []}
        id = 0
        for row_number in range(len(examples["text"])):
            context = examples["text"][row_number]
            question = "offense"
            title = context.split(" ")[0]
            features["context"].append(context)
            features["id"].append(str(id))
            features["question"].append(question)
            features["title"].append(title)
            id += 1
        return features

    def prepare_train_features(self, examples):
        """Generate tokenized features from examples.

        Args:
            examples (dict): The examples to be tokenized.

        Returns:
            transformers.tokenization_utils_base.BatchEncoding:
                The tokenized features/examples after processing.
        """
        # Tokenize our examples with truncation and padding, but keep the
        # overflows using a stride. This results in one example possibly
        # giving several features when a context is long, each of those
        # features having a context that overlaps a bit the context
        # of the previous feature.
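        # Illustrative (hypothetical) example of the labelling done below:
        # for the context "you are stupid" with the contiguous character span
        # (8, 13) covering "stupid", and context-token offsets such as
        # [(0, 3), (4, 7), (8, 14)], the loops further down set
        # start_positions_token_wise and end_positions_token_wise to 1 at the
        # token whose offsets contain characters 8-13, and leave every other
        # position at 0.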
        pad_on_right = self.tokenizer.padding_side == "right"
        print("### Batch Tokenizing Examples ###")
        tokenized_examples = self.tokenizer(
            examples["question" if pad_on_right else "context"],
            examples["context" if pad_on_right else "question"],
            **dict(self.config.tokenizer_params),
        )

        # Since one example might give us several features if it has
        # a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
        # The offset mappings will give us a map from token to
        # character position in the original context. This will
        # help us compute the start_positions and end_positions.
        offset_mapping = tokenized_examples.pop("offset_mapping")

        # Let's label those examples!
        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []

        for i, offsets in enumerate(offset_mapping):
            input_ids = tokenized_examples["input_ids"][i]

            # Grab the sequence corresponding to that example
            # (to know what is the context and what is the question).
            sequence_ids = tokenized_examples.sequence_ids(i)

            # One example can give several spans, this is the index of
            # the example containing this span of text.
            sample_index = sample_mapping[i]
            start_positions = examples["start_positions"][sample_index]
            end_positions = examples["end_positions"][sample_index]

            # Token-level binary labels: 1 marks a token that starts/ends a toxic span.
            start_positions_token_wise = [0 for _ in range(len(input_ids))]
            end_positions_token_wise = [0 for _ in range(len(input_ids))]

            # If no spans are given, the label vectors stay all zeros.
            if len(start_positions) != 0:
                for position in range(len(start_positions)):
                    start_char = start_positions[position]
                    end_char = end_positions[position] + 1

                    # Start token index of the current span in the text.
                    token_start_index = 0
                    while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                        token_start_index += 1

                    # End token index of the current span in the text.
                    token_end_index = len(input_ids) - 1
                    while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                        token_end_index -= 1

                    # Detect if the span is outside the tokenized window
                    # (in which case we skip it).
                    if not (
                        offsets[token_start_index][0] <= start_char
                        and offsets[token_end_index][1] >= end_char
                    ):
                        continue
                    else:
                        # Otherwise move token_start_index and token_end_index
                        # to the two ends of the span.
                        # Note: we could go after the last offset if the span
                        # is the last word (edge case).
                        while (
                            token_start_index < len(offsets)
                            and offsets[token_start_index][0] <= start_char
                        ):
                            token_start_index += 1
                        start_positions_token_wise[token_start_index - 1] = 1
                        while offsets[token_end_index][1] >= end_char:
                            token_end_index -= 1
                        end_positions_token_wise[token_end_index + 1] = 1

            tokenized_examples["start_positions"].append(start_positions_token_wise)
            tokenized_examples["end_positions"].append(end_positions_token_wise)

        return tokenized_examples

    def prepare_test_features(self, examples):
        """Generate tokenized validation features from examples.

        Args:
            examples (dict): The validation examples to be tokenized.

        Returns:
            transformers.tokenization_utils_base.BatchEncoding:
                The tokenized features/examples for the validation set after processing.
        """
        # Tokenize our examples with truncation and maybe padding,
        # but keep the overflows using a stride. This results in one example
        # possibly giving several features when a context is long, each of
        # those features having a context that overlaps a bit the context
        # of the previous feature.
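        # Illustrative (hypothetical) example of the offset masking done at
        # the end of this method: if a feature's sequence_ids look like
        # [None, 0, 0, None, 1, 1, None] (CLS, question, SEP, context, SEP for
        # a right-padding tokenizer), only the offset pairs at the context
        # positions (here 4 and 5) are kept; the rest are replaced with None
        # so that post-processing can skip non-context tokens when mapping
        # predictions back to character positions.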
print("### Tokenizing Validation Examples") pad_on_right = self.tokenizer.padding_side == "right" tokenized_examples = self.tokenizer( examples["question" if pad_on_right else "context"], examples["context" if pad_on_right else "question"], **dict(self.config.tokenizer_params), ) # Since one example might give us several features if it has a long context, # we need a map from a feature to its corresponding example. This key gives us just that. sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") # We keep the example_id that gave us this feature and we will store the offset mappings. tokenized_examples["example_id"] = [] for i in range(len(tokenized_examples["input_ids"])): # Grab the sequence corresponding to that example # (to know what is the context and what is the question). sequence_ids = tokenized_examples.sequence_ids(i) context_index = 1 if pad_on_right else 0 # One example can give several spans, # this is the index of the example containing this span of text. sample_index = sample_mapping[i] tokenized_examples["example_id"].append(str(examples["id"][sample_index])) # Set to None the offset_mapping that are not part # of the context so it's easy to determine if a token # position is part of the context or not. tokenized_examples["offset_mapping"][i] = [ (o if sequence_ids[k] == context_index else None) for k, o in enumerate(tokenized_examples["offset_mapping"][i]) ] return tokenized_examples