import ast

import pandas as pd
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer

from src.utils.mapper import configmapper
from evaluation.fix_spans import _contiguous_ranges


@configmapper.map("datasets", "toxic_spans_multi_spans")
class ToxicSpansMultiSpansDataset:
    """SQuAD-style dataset for toxic span detection, with multiple answer spans per example."""

    def __init__(self, config):
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.config.model_checkpoint_name
        )

        self.dataset = load_dataset("csv", data_files=dict(self.config.train_files))
        self.test_dataset = load_dataset("csv", data_files=dict(self.config.eval_files))

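        # First pass over the raw CSV rows: build SQuAD-style examples with
        # character-level start/end positions for every contiguous toxic span.
        # batch_size is set very high so each split is processed as one batch.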
        temp_key_train = list(self.dataset.keys())[0]
        self.intermediate_dataset = self.dataset.map(
            self.create_train_features,
            batched=True,
            batch_size=1000000,
            remove_columns=self.dataset[temp_key_train].column_names,
        )

        temp_key_test = list(self.test_dataset.keys())[0]
        self.intermediate_test_dataset = self.test_dataset.map(
            self.create_test_features,
            batched=True,
            batch_size=1000000,
            remove_columns=self.test_dataset[temp_key_test].column_names,
        )

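        # Second pass: tokenize the SQuAD-style examples into model-ready features.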
        self.tokenized_inputs = self.intermediate_dataset.map(
            self.prepare_train_features,
            batched=True,
            remove_columns=self.intermediate_dataset[temp_key_train].column_names,
        )
        self.test_tokenized_inputs = self.intermediate_test_dataset.map(
            self.prepare_test_features,
            batched=True,
            remove_columns=self.intermediate_test_dataset[temp_key_test].column_names,
        )

    def create_train_features(self, examples):
        """Convert raw "text"/"spans" rows into SQuAD-style training examples."""
        features = {
            "context": [],
            "id": [],
            "question": [],
            "title": [],
            "start_positions": [],
            "end_positions": [],
        }
        example_id = 0

        for row_number in range(len(examples["text"])):
            context = examples["text"][row_number]
            question = "offense"
            title = context.split(" ")[0]
            start_positions = []
            end_positions = []
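            # "spans" is stored in the CSV as a stringified list of toxic
            # character offsets; parse it and collapse it into contiguous ranges.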
            span = ast.literal_eval(examples["spans"][row_number])
            contiguous_spans = _contiguous_ranges(span)
            for lst in contiguous_spans:
                lst = list(lst)
                start_positions.append(lst[0])
                end_positions.append(lst[1])

            features["context"].append(context)
            features["id"].append(str(example_id))
            features["question"].append(question)
            features["title"].append(title)
            features["start_positions"].append(start_positions)
            features["end_positions"].append(end_positions)
            example_id += 1

        return features

    def create_test_features(self, examples):
        """Convert raw "text" rows into SQuAD-style examples without span labels."""
        features = {"context": [], "id": [], "question": [], "title": []}
        example_id = 0
        for row_number in range(len(examples["text"])):
            context = examples["text"][row_number]
            question = "offense"
            title = context.split(" ")[0]
            features["context"].append(context)
            features["id"].append(str(example_id))
            features["question"].append(question)
            features["title"].append(title)
            example_id += 1
        return features

    def prepare_train_features(self, examples):
        """Generate tokenized features from examples.

        Args:
            examples (dict): The examples to be tokenized.

        Returns:
            transformers.tokenization_utils_base.BatchEncoding:
                The tokenized features/examples after processing.
        """
        pad_on_right = self.tokenizer.padding_side == "right"
        print("### Batch Tokenizing Examples ###")
        tokenized_examples = self.tokenizer(
            examples["question" if pad_on_right else "context"],
            examples["context" if pad_on_right else "question"],
            **dict(self.config.tokenizer_params),
        )

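        # With return_overflowing_tokens, a long example can produce several
        # features; overflow_to_sample_mapping maps each feature back to the
        # example it came from.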
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

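        # offset_mapping gives, for each token, its character span in the
        # original text; it is used below to align character-level span labels
        # with tokens.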
        offset_mapping = tokenized_examples.pop("offset_mapping")

        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []

        for i, offsets in enumerate(offset_mapping):
            input_ids = tokenized_examples["input_ids"][i]
            sequence_ids = tokenized_examples.sequence_ids(i)

            sample_index = sample_mapping[i]
            start_positions = examples["start_positions"][sample_index]
            end_positions = examples["end_positions"][sample_index]

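            # Multi-hot targets over tokens: a 1 marks a token that starts
            # (respectively ends) one of this example's toxic spans.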
            start_positions_token_wise = [0 for x in range(len(input_ids))]
            end_positions_token_wise = [0 for x in range(len(input_ids))]

            if len(start_positions) != 0:
                for position in range(len(start_positions)):
                    start_char = start_positions[position]
                    end_char = end_positions[position] + 1

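                    # Locate the first and last tokens of the context segment
                    # in this (possibly overflowed) feature.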
                    token_start_index = 0
                    while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                        token_start_index += 1

                    token_end_index = len(input_ids) - 1
                    while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                        token_end_index -= 1

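                    # Skip spans that do not lie fully inside this feature's
                    # context window.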
                    if not (
                        offsets[token_start_index][0] <= start_char
                        and offsets[token_end_index][1] >= end_char
                    ):
                        continue
                    else:
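                        # Move token_start_index and token_end_index onto the
                        # first and last tokens of the character span.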
                        while (
                            token_start_index < len(offsets)
                            and offsets[token_start_index][0] <= start_char
                        ):
                            token_start_index += 1
                        start_positions_token_wise[token_start_index - 1] = 1
                        while offsets[token_end_index][1] >= end_char:
                            token_end_index -= 1
                        end_positions_token_wise[token_end_index + 1] = 1
            tokenized_examples["start_positions"].append(start_positions_token_wise)
            tokenized_examples["end_positions"].append(end_positions_token_wise)
        return tokenized_examples

    def prepare_test_features(self, examples):
        """Generate tokenized validation features from examples.

        Args:
            examples (dict): The validation examples to be tokenized.

        Returns:
            transformers.tokenization_utils_base.BatchEncoding:
                The tokenized features/examples for validation set after processing.
        """
        print("### Tokenizing Validation Examples")
        pad_on_right = self.tokenizer.padding_side == "right"
        tokenized_examples = self.tokenizer(
            examples["question" if pad_on_right else "context"],
            examples["context" if pad_on_right else "question"],
            **dict(self.config.tokenizer_params),
        )

        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        tokenized_examples["example_id"] = []

        for i in range(len(tokenized_examples["input_ids"])):
            sequence_ids = tokenized_examples.sequence_ids(i)
            context_index = 1 if pad_on_right else 0

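            # Remember which original example this feature came from so that
            # per-feature predictions can be regrouped by example later on.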
            sample_index = sample_mapping[i]
            tokenized_examples["example_id"].append(str(examples["id"][sample_index]))

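            # Mask out offsets of tokens that are not part of the context so
            # they can be distinguished from context tokens downstream.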
tokenized_examples["offset_mapping"][i] = [ |
|
(o if sequence_ids[k] == context_index else None) |
|
for k, o in enumerate(tokenized_examples["offset_mapping"][i]) |
|
] |
|
|
|
return tokenized_examples |
|
|
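

# Illustrative usage sketch (not part of the original module): assuming configs are
# attribute-style objects (e.g. OmegaConf) registered through `configmapper`, the
# dataset could be built roughly as follows. All values below are hypothetical
# placeholders; tokenizer_params must request overflowing tokens and offset mappings,
# since prepare_train_features/prepare_test_features pop "overflow_to_sample_mapping"
# and "offset_mapping" from the tokenizer output.
#
#   from omegaconf import OmegaConf
#
#   config = OmegaConf.create(
#       {
#           "model_checkpoint_name": "bert-base-uncased",
#           "train_files": {"train": "data/tsd_train.csv"},
#           "eval_files": {"test": "data/tsd_test.csv"},
#           "tokenizer_params": {
#               "truncation": "only_second",
#               "max_length": 384,
#               "stride": 128,
#               "return_overflowing_tokens": True,
#               "return_offsets_mapping": True,
#               "padding": "max_length",
#           },
#       }
#   )
#   dataset = ToxicSpansMultiSpansDataset(config)
#   train_features = dataset.tokenized_inputs
#   test_features = dataset.test_tokenized_inputs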