diff --git a/src/datasets/__init__.py b/src/datasets/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..446a3d278b7f86d608e6ff6c85dae19b2ebbe222 --- /dev/null +++ b/src/datasets/__init__.py @@ -0,0 +1,7 @@ +from src.datasets.toxic_spans_tokens import * +from src.datasets.toxic_spans_tokens_3cls import * +from src.datasets.toxic_spans_spans import * +from src.datasets.toxic_spans_tokens_spans import * +from src.datasets.toxic_spans_multi_spans import * +from src.datasets.toxic_spans_crf_tokens import * +from src.datasets.toxic_spans_crf_3cls_tokens import * \ No newline at end of file diff --git a/src/datasets/__pycache__/__init__.cpython-38.pyc b/src/datasets/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..02e06f5d9e62aa916dc0e261dc691618b2f66182 Binary files /dev/null and b/src/datasets/__pycache__/__init__.cpython-38.pyc differ diff --git a/src/datasets/__pycache__/toxic_spans_crf_3cls_tokens.cpython-38.pyc b/src/datasets/__pycache__/toxic_spans_crf_3cls_tokens.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..370e8050d79e33e517e81571457655bc88984f49 Binary files /dev/null and b/src/datasets/__pycache__/toxic_spans_crf_3cls_tokens.cpython-38.pyc differ diff --git a/src/datasets/__pycache__/toxic_spans_crf_tokens.cpython-38.pyc b/src/datasets/__pycache__/toxic_spans_crf_tokens.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8a81958897e7fbad60c3e7865e91a342bc6d5511 Binary files /dev/null and b/src/datasets/__pycache__/toxic_spans_crf_tokens.cpython-38.pyc differ diff --git a/src/datasets/__pycache__/toxic_spans_multi_spans.cpython-38.pyc b/src/datasets/__pycache__/toxic_spans_multi_spans.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..997f999d2c6507e501cd7240f5896ebb8e6d44e7 Binary files /dev/null and b/src/datasets/__pycache__/toxic_spans_multi_spans.cpython-38.pyc differ diff --git a/src/datasets/__pycache__/toxic_spans_spans.cpython-38.pyc b/src/datasets/__pycache__/toxic_spans_spans.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ebf1e3e783a8f886b92c6a51d27a2b1c0327ee38 Binary files /dev/null and b/src/datasets/__pycache__/toxic_spans_spans.cpython-38.pyc differ diff --git a/src/datasets/__pycache__/toxic_spans_tokens.cpython-38.pyc b/src/datasets/__pycache__/toxic_spans_tokens.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..109c2cc9ce22c4ca60c4602054d3fbd6c50fcccb Binary files /dev/null and b/src/datasets/__pycache__/toxic_spans_tokens.cpython-38.pyc differ diff --git a/src/datasets/__pycache__/toxic_spans_tokens_3cls.cpython-38.pyc b/src/datasets/__pycache__/toxic_spans_tokens_3cls.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..755457bea7152b0255933132d414fb8498957795 Binary files /dev/null and b/src/datasets/__pycache__/toxic_spans_tokens_3cls.cpython-38.pyc differ diff --git a/src/datasets/__pycache__/toxic_spans_tokens_spans.cpython-38.pyc b/src/datasets/__pycache__/toxic_spans_tokens_spans.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a66bc618223b8d580d6692b2175cd4f447bb287 Binary files /dev/null and b/src/datasets/__pycache__/toxic_spans_tokens_spans.cpython-38.pyc differ diff --git a/src/datasets/toxic_spans_crf_3cls_tokens.py b/src/datasets/toxic_spans_crf_3cls_tokens.py new file mode 100644 index 
0000000000000000000000000000000000000000..81442d2a76926154bd4df106a84ddb7e17b18c4f --- /dev/null +++ b/src/datasets/toxic_spans_crf_3cls_tokens.py @@ -0,0 +1,132 @@ +from src.utils.mapper import configmapper +from transformers import AutoTokenizer +from datasets import load_dataset +import numpy as np + + +@configmapper.map("datasets", "toxic_spans_crf_3cls_tokens") +class ToxicSpansCRF3ClsTokenDataset: + def __init__(self, config): + self.config = config + self.tokenizer = AutoTokenizer.from_pretrained( + self.config.model_checkpoint_name + ) + self.dataset = load_dataset("csv", data_files=dict(self.config.train_files)) + self.test_dataset = load_dataset("csv", data_files=dict(self.config.eval_files)) + + self.tokenized_inputs = self.dataset.map( + self.tokenize_and_align_labels_for_train, batched=True + ) + self.test_tokenized_inputs = self.test_dataset.map( + self.tokenize_for_test, batched=True + ) + + def tokenize_and_align_labels_for_train(self, examples): + tokenized_inputs = self.tokenizer( + examples["text"], **self.config.tokenizer_params + ) + + # tokenized_inputs["text"] = examples["text"] + example_spans = [] + labels = [] + prediction_mask = np.zeros_like(np.array(tokenized_inputs["input_ids"])) + offsets_mapping = tokenized_inputs["offset_mapping"] + + ## Wrong Code + # for i, offset_mapping in enumerate(offsets_mapping): + # j = 0 + # while j < len(offset_mapping): # [tok1, tok2, tok3] [(0,5),(1,4),(5,7)] + # if tokenized_inputs["input_ids"][i][j] in [ + # self.tokenizer.sep_token_id, + # self.tokenizer.pad_token_id, + # self.tokenizer.cls_token_id, + # ]: + # j = j + 1 + # continue + # else: + # k = j + 1 + # while self.tokenizer.convert_ids_to_tokens( + # tokenized_inputs["input_ids"][i][k] + # ).startswith("##"): + # offset_mapping[i][j][1] = offset_mapping[i][k][1] + # j = k + + for i, offset_mapping in enumerate(offsets_mapping): + labels.append([]) + + spans = eval(examples["spans"][i]) + Bs = eval(examples["Bs"][i]) + Is = eval(examples["Is"][i]) + + example_spans.append(spans) + # cls_label = 2 ## DUMMY LABEL + cls_label = 3 ## DUMMY LABEL + for j, offsets in enumerate(offset_mapping): + if tokenized_inputs["input_ids"][i][j] in [ + self.tokenizer.sep_token_id, + self.tokenizer.pad_token_id, + ]: + tokenized_inputs["attention_mask"][i][j] = 0 + + if tokenized_inputs["input_ids"][i][j] == self.tokenizer.cls_token_id: + labels[-1].append(cls_label) + prediction_mask[i][j] = 1 + + elif offsets[0] == offsets[1] and offsets[0] == 0: + # labels[-1].append(2) ## DUMMY + labels[-1].append(cls_label) ## DUMMY + + else: + # toxic_offsets = [x in spans for x in range(offsets[0], offsets[1])] + # ## If any part of the the token is in span, mark it as Toxic + # if ( + # len(toxic_offsets) > 0 + # and sum(toxic_offsets) / len(toxic_offsets) > 0.0 + # ): + # labels[-1].append(1) + # else: + # labels[-1].append(0) + # prediction_mask[i][j] = 1 + + b_off = [x in Bs for x in range(offsets[0], offsets[1])] + b_off = sum(b_off) + i_off = [x in Is for x in range(offsets[0], offsets[1])] + i_off = sum(i_off) + # if len(b_off) == len(i_off) and len(i_off) == 0: + if b_off == 0 and i_off == 0: + labels[-1].append(0) + # elif len(b_off) >= len(i_off) == 1: + elif b_off >= i_off: + labels[-1].append(1) + # print(b_off) + # print(i_off) + # print(j) + else: + labels[-1].append(2) + + tokenized_inputs["labels"] = labels + tokenized_inputs["prediction_mask"] = prediction_mask + return tokenized_inputs + + def tokenize_for_test(self, examples): + tokenized_inputs = self.tokenizer( + 
examples["text"], **self.config.tokenizer_params + ) + prediction_mask = np.zeros_like(np.array(tokenized_inputs["input_ids"])) + labels = np.zeros_like(np.array(tokenized_inputs["input_ids"])) + + offsets_mapping = tokenized_inputs["offset_mapping"] + + for i, offset_mapping in enumerate(offsets_mapping): + for j, offsets in enumerate(offset_mapping): + if tokenized_inputs["input_ids"][i][j] in [ + self.tokenizer.sep_token_id, + self.tokenizer.pad_token_id, + ]: + tokenized_inputs["attention_mask"][i][j] = 0 + else: + prediction_mask[i][j] = 1 + + tokenized_inputs["prediction_mask"] = prediction_mask + tokenized_inputs["labels"] = labels + return tokenized_inputs diff --git a/src/datasets/toxic_spans_crf_tokens.py b/src/datasets/toxic_spans_crf_tokens.py new file mode 100644 index 0000000000000000000000000000000000000000..ef62e2ee52aee96725e832bcfef52bd57d60612c --- /dev/null +++ b/src/datasets/toxic_spans_crf_tokens.py @@ -0,0 +1,111 @@ +from src.utils.mapper import configmapper +from transformers import AutoTokenizer +from datasets import load_dataset +import numpy as np + + +@configmapper.map("datasets", "toxic_spans_crf_tokens") +class ToxicSpansCRFTokenDataset: + def __init__(self, config): + self.config = config + self.tokenizer = AutoTokenizer.from_pretrained( + self.config.model_checkpoint_name + ) + self.dataset = load_dataset("csv", data_files=dict(self.config.train_files)) + self.test_dataset = load_dataset("csv", data_files=dict(self.config.eval_files)) + + self.tokenized_inputs = self.dataset.map( + self.tokenize_and_align_labels_for_train, batched=True + ) + self.test_tokenized_inputs = self.test_dataset.map( + self.tokenize_for_test, batched=True + ) + + def tokenize_and_align_labels_for_train(self, examples): + tokenized_inputs = self.tokenizer( + examples["text"], **self.config.tokenizer_params + ) + + # tokenized_inputs["text"] = examples["text"] + example_spans = [] + labels = [] + prediction_mask = np.zeros_like(np.array(tokenized_inputs["input_ids"])) + offsets_mapping = tokenized_inputs["offset_mapping"] + + ## Wrong Code + # for i, offset_mapping in enumerate(offsets_mapping): + # j = 0 + # while j < len(offset_mapping): # [tok1, tok2, tok3] [(0,5),(1,4),(5,7)] + # if tokenized_inputs["input_ids"][i][j] in [ + # self.tokenizer.sep_token_id, + # self.tokenizer.pad_token_id, + # self.tokenizer.cls_token_id, + # ]: + # j = j + 1 + # continue + # else: + # k = j + 1 + # while self.tokenizer.convert_ids_to_tokens( + # tokenized_inputs["input_ids"][i][k] + # ).startswith("##"): + # offset_mapping[i][j][1] = offset_mapping[i][k][1] + # j = k + + for i, offset_mapping in enumerate(offsets_mapping): + labels.append([]) + + spans = eval(examples["spans"][i]) + example_spans.append(spans) + cls_label = 2 ## DUMMY LABEL + for j, offsets in enumerate(offset_mapping): + if tokenized_inputs["input_ids"][i][j] in [ + self.tokenizer.sep_token_id, + self.tokenizer.pad_token_id, + ]: + tokenized_inputs["attention_mask"][i][j] = 0 + + if tokenized_inputs["input_ids"][i][j] == self.tokenizer.cls_token_id: + labels[-1].append(cls_label) + prediction_mask[i][j] = 1 + + elif offsets[0] == offsets[1] and offsets[0] == 0: + labels[-1].append(2) ## DUMMY + + else: + toxic_offsets = [x in spans for x in range(offsets[0], offsets[1])] + ## If any part of the the token is in span, mark it as Toxic + if ( + len(toxic_offsets) > 0 + and sum(toxic_offsets) / len(toxic_offsets) > 0.0 + ): + labels[-1].append(1) + else: + labels[-1].append(0) + prediction_mask[i][j] = 1 + + tokenized_inputs["labels"] 
= labels + tokenized_inputs["prediction_mask"] = prediction_mask + return tokenized_inputs + + def tokenize_for_test(self, examples): + tokenized_inputs = self.tokenizer( + examples["text"], **self.config.tokenizer_params + ) + prediction_mask = np.zeros_like(np.array(tokenized_inputs["input_ids"])) + labels = np.zeros_like(np.array(tokenized_inputs["input_ids"])) + + offsets_mapping = tokenized_inputs["offset_mapping"] + + for i, offset_mapping in enumerate(offsets_mapping): + for j, offsets in enumerate(offset_mapping): + if tokenized_inputs["input_ids"][i][j] in [ + self.tokenizer.sep_token_id, + self.tokenizer.pad_token_id, + ]: + tokenized_inputs["attention_mask"][i][j] = 0 + else: + prediction_mask[i][j] = 1 + + tokenized_inputs["prediction_mask"] = prediction_mask + tokenized_inputs["labels"] = labels + return tokenized_inputs diff --git a/src/datasets/toxic_spans_multi_spans.py b/src/datasets/toxic_spans_multi_spans.py new file mode 100644 index 0000000000000000000000000000000000000000..445e9f66316a037c59cec136ff5c08b017e78d7a --- /dev/null +++ b/src/datasets/toxic_spans_multi_spans.py @@ -0,0 +1,237 @@ +from src.utils.mapper import configmapper +from transformers import AutoTokenizer +import pandas as pd +from datasets import load_dataset, Dataset +from evaluation.fix_spans import _contiguous_ranges + + +@configmapper.map("datasets", "toxic_spans_multi_spans") +class ToxicSpansMultiSpansDataset: + def __init__(self, config): + self.config = config + self.tokenizer = AutoTokenizer.from_pretrained( + self.config.model_checkpoint_name + ) + + self.dataset = load_dataset("csv", data_files=dict(self.config.train_files)) + self.test_dataset = load_dataset("csv", data_files=dict(self.config.eval_files)) + + temp_key_train = list(self.dataset.keys())[0] + self.intermediate_dataset = self.dataset.map( + self.create_train_features, + batched=True, + batch_size=1000000, ##Unusually Large Batch Size ## Needed For Correct ID mapping + remove_columns=self.dataset[temp_key_train].column_names, + ) + + temp_key_test = list(self.test_dataset.keys())[0] + self.intermediate_test_dataset = self.test_dataset.map( + self.create_test_features, + batched=True, + batch_size=1000000, ##Unusually Large Batch Size ## Needed For Correct ID mapping + remove_columns=self.test_dataset[temp_key_test].column_names, + ) + + self.tokenized_inputs = self.intermediate_dataset.map( + self.prepare_train_features, + batched=True, + remove_columns=self.intermediate_dataset[temp_key_train].column_names, + ) + self.test_tokenized_inputs = self.intermediate_test_dataset.map( + self.prepare_test_features, + batched=True, + remove_columns=self.intermediate_test_dataset[temp_key_test].column_names, + ) + + def create_train_features(self, examples): + features = { + "context": [], + "id": [], + "question": [], + "title": [], + "start_positions": [], + "end_positions": [], + } + id = 0 + # print(examples) + for row_number in range(len(examples["text"])): + context = examples["text"][row_number] + question = "offense" + title = context.split(" ")[0] + start_positions = [] + end_positions = [] + span = eval(examples["spans"][row_number]) + contiguous_spans = _contiguous_ranges(span) + for lst in contiguous_spans: + lst = list(lst) + dict_to_write = {} + + start_positions.append(lst[0]) + end_positions.append(lst[1]) + + features["context"].append(context) + features["id"].append(str(id)) + features["question"].append(question) + features["title"].append(title) + features["start_positions"].append(start_positions) + 
features["end_positions"].append(end_positions) + id += 1 + + return features + + def create_test_features(self, examples): + features = {"context": [], "id": [], "question": [], "title": []} + id = 0 + for row_number in range(len(examples["text"])): + context = examples["text"][row_number] + question = "offense" + title = context.split(" ")[0] + features["context"].append(context) + features["id"].append(str(id)) + features["question"].append(question) + features["title"].append(title) + id += 1 + return features + + def prepare_train_features(self, examples): + """Generate tokenized features from examples. + + Args: + examples (dict): The examples to be tokenized. + + Returns: + transformers.tokenization_utils_base.BatchEncoding: + The tokenized features/examples after processing. + """ + # Tokenize our examples with truncation and padding, but keep the + # overflows using a stride. This results in one example possible + # giving several features when a context is long, each of those + # features having a context that overlaps a bit the context + # of the previous feature. + pad_on_right = self.tokenizer.padding_side == "right" + print("### Batch Tokenizing Examples ###") + tokenized_examples = self.tokenizer( + examples["question" if pad_on_right else "context"], + examples["context" if pad_on_right else "question"], + **dict(self.config.tokenizer_params), + ) + + # Since one example might give us several features if it has + # a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + # The offset mappings will give us a map from token to + # character position in the original context. This will + # help us compute the start_positions and end_positions. + offset_mapping = tokenized_examples.pop("offset_mapping") + + # Let's label those examples! + tokenized_examples["start_positions"] = [] + tokenized_examples["end_positions"] = [] + + for i, offsets in enumerate(offset_mapping): + # We will label impossible answers with the index of the CLS token. + input_ids = tokenized_examples["input_ids"][i] + + # Grab the sequence corresponding to that example + # (to know what is the context and what is the question). + sequence_ids = tokenized_examples.sequence_ids(i) + + # One example can give several spans, this is the index of + # the example containing this span of text. + sample_index = sample_mapping[i] + start_positions = examples["start_positions"][sample_index] + end_positions = examples["end_positions"][sample_index] + + start_positions_token_wise = [0 for x in range(len(input_ids))] + end_positions_token_wise = [0 for x in range(len(input_ids))] + # If no answers are given, set the cls_index as answer. + if len(start_positions) != 0: + for position in range(len(start_positions)): + start_char = start_positions[position] + end_char = end_positions[position] + 1 + + # Start token index of the current span in the text. + token_start_index = 0 + while sequence_ids[token_start_index] != (1 if pad_on_right else 0): + token_start_index += 1 + + # End token index of the current span in the text. + token_end_index = len(input_ids) - 1 + while sequence_ids[token_end_index] != (1 if pad_on_right else 0): + token_end_index -= 1 + + # Detect if the answer is out of the span (in which case we continue). 
+ if not ( + offsets[token_start_index][0] <= start_char + and offsets[token_end_index][1] >= end_char + ): + continue + else: + # Otherwise move the token_start_index and token_end_index to the two ends of the answer. + # Note: we could go after the last offset if the answer is the last word (edge case). + while ( + token_start_index < len(offsets) + and offsets[token_start_index][0] <= start_char + ): + token_start_index += 1 + start_positions_token_wise[token_start_index - 1] = 1 + while offsets[token_end_index][1] >= end_char: + token_end_index -= 1 + end_positions_token_wise[token_end_index + 1] = 1 + tokenized_examples["start_positions"].append(start_positions_token_wise) + tokenized_examples["end_positions"].append(end_positions_token_wise) + return tokenized_examples + + def prepare_test_features(self, examples): + + """Generate tokenized validation features from examples. + + Args: + examples (dict): The validation examples to be tokenized. + + Returns: + transformers.tokenization_utils_base.BatchEncoding: + The tokenized features/examples for validation set after processing. + """ + + # Tokenize our examples with truncation and maybe + # padding, but keep the overflows using a stride. + # This results in one example possible giving several features + # when a context is long, each of those features having a + # context that overlaps a bit the context of the previous feature. + print("### Tokenizing Validation Examples") + pad_on_right = self.tokenizer.padding_side == "right" + tokenized_examples = self.tokenizer( + examples["question" if pad_on_right else "context"], + examples["context" if pad_on_right else "question"], + **dict(self.config.tokenizer_params), + ) + + # Since one example might give us several features if it has a long context, + # we need a map from a feature to its corresponding example. This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + + # We keep the example_id that gave us this feature and we will store the offset mappings. + tokenized_examples["example_id"] = [] + + for i in range(len(tokenized_examples["input_ids"])): + # Grab the sequence corresponding to that example + # (to know what is the context and what is the question). + sequence_ids = tokenized_examples.sequence_ids(i) + context_index = 1 if pad_on_right else 0 + + # One example can give several spans, + # this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + tokenized_examples["example_id"].append(str(examples["id"][sample_index])) + + # Set to None the offset_mapping that are not part + # of the context so it's easy to determine if a token + # position is part of the context or not.
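+ # Question and special tokens keep None here, which downstream post-processing can presumably use to restrict span reconstruction to context tokens.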
+ tokenized_examples["offset_mapping"][i] = [ + (o if sequence_ids[k] == context_index else None) + for k, o in enumerate(tokenized_examples["offset_mapping"][i]) + ] + + return tokenized_examples diff --git a/src/datasets/toxic_spans_spans.py b/src/datasets/toxic_spans_spans.py new file mode 100644 index 0000000000000000000000000000000000000000..9f034ee428a56569eaec41b84a8d7078a442dd03 --- /dev/null +++ b/src/datasets/toxic_spans_spans.py @@ -0,0 +1,238 @@ +from src.utils.mapper import configmapper +from transformers import AutoTokenizer +import pandas as pd +from datasets import load_dataset, Dataset +from evaluation.fix_spans import _contiguous_ranges + + +@configmapper.map("datasets", "toxic_spans_spans") +class ToxicSpansSpansDataset: + def __init__(self, config): + # print("### ToxicSpansSpansDataset ###"); exit() + self.config = config + self.tokenizer = AutoTokenizer.from_pretrained( + self.config.model_checkpoint_name + ) + + self.dataset = load_dataset("csv", data_files=dict(self.config.train_files)) + self.test_dataset = load_dataset("csv", data_files=dict(self.config.eval_files)) + + temp_key_train = list(self.dataset.keys())[0] + self.intermediate_dataset = self.dataset.map( + self.create_train_features, + batched=True, + batch_size=1000000, ##Unusually Large Batch Size ## Needed For Correct ID mapping + remove_columns=self.dataset[temp_key_train].column_names, + ) + + temp_key_test = list(self.test_dataset.keys())[0] + self.intermediate_test_dataset = self.test_dataset.map( + self.create_test_features, + batched=True, + batch_size=1000000, ##Unusually Large Batch Size ## Needed For Correct ID mapping + remove_columns=self.test_dataset[temp_key_test].column_names, + ) + + self.tokenized_inputs = self.intermediate_dataset.map( + self.prepare_train_features, + batched=True, + remove_columns=self.intermediate_dataset[temp_key_train].column_names, + ) + self.test_tokenized_inputs = self.intermediate_test_dataset.map( + self.prepare_test_features, + batched=True, + remove_columns=self.intermediate_test_dataset[temp_key_test].column_names, + ) + + def create_train_features(self, examples): + features = {"context": [], "id": [], "question": [], "title": []} + id = 0 + # print(examples) + for row_number in range(len(examples["text"])): + context = examples["text"][row_number] + # question = "offense" + question = "ভুল" + title = context.split(" ")[0] + span = eval(examples["spans"][row_number]) + contiguous_spans = _contiguous_ranges(span) + for lst in contiguous_spans: + lst = list(lst) + dict_to_write = {} + + dict_to_write["answer_start"] = [lst[0]] + dict_to_write["text"] = [context[lst[0] : lst[-1] + 1]] + # print(dict_to_write) + if "answers" in features.keys(): + features["answers"].append(dict_to_write) + else: + features["answers"] = [ + dict_to_write, + ] + features["context"].append(context) + features["id"].append(str(id)) + features["question"].append(question) + features["title"].append(title) + id += 1 + + return features + + def create_test_features(self, examples): + features = {"context": [], "id": [], "question": [], "title": []} + id = 0 + for row_number in range(len(examples["text"])): + context = examples["text"][row_number] + # question = "offense" + question = "ভুল" + title = context.split(" ")[0] + features["context"].append(context) + features["id"].append(str(id)) + features["question"].append(question) + features["title"].append(title) + id += 1 + return features + + def prepare_train_features(self, examples): + """Generate tokenized features from examples. 
+ + Args: + examples (dict): The examples to be tokenized. + + Returns: + transformers.tokenization_utils_base.BatchEncoding: + The tokenized features/examples after processing. + """ + # Tokenize our examples with truncation and padding, but keep the + # overflows using a stride. This results in one example possible + # giving several features when a context is long, each of those + # features having a context that overlaps a bit the context + # of the previous feature. + pad_on_right = self.tokenizer.padding_side == "right" + print("### Batch Tokenizing Examples ###") + tokenized_examples = self.tokenizer( + examples["question" if pad_on_right else "context"], + examples["context" if pad_on_right else "question"], + **dict(self.config.tokenizer_params), + ) + + # Since one example might give us several features if it has + # a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + # The offset mappings will give us a map from token to + # character position in the original context. This will + # help us compute the start_positions and end_positions. + offset_mapping = tokenized_examples.pop("offset_mapping") + + # Let's label those examples! + tokenized_examples["start_positions"] = [] + tokenized_examples["end_positions"] = [] + + for i, offsets in enumerate(offset_mapping): + # We will label impossible answers with the index of the CLS token. + input_ids = tokenized_examples["input_ids"][i] + cls_index = input_ids.index(self.tokenizer.cls_token_id) + + # Grab the sequence corresponding to that example + # (to know what is the context and what is the question). + sequence_ids = tokenized_examples.sequence_ids(i) + + # One example can give several spans, this is the index of + # the example containing this span of text. + sample_index = sample_mapping[i] + answers = examples["answers"][sample_index] + # If no answers are given, set the cls_index as answer. + if len(answers["answer_start"]) == 0: + tokenized_examples["start_positions"].append(cls_index) + tokenized_examples["end_positions"].append(cls_index) + else: + # Start/end character index of the answer in the text. + start_char = answers["answer_start"][0] + end_char = start_char + len(answers["text"][0]) + + # Start token index of the current span in the text. + token_start_index = 0 + while sequence_ids[token_start_index] != (1 if pad_on_right else 0): + token_start_index += 1 + + # End token index of the current span in the text. + token_end_index = len(input_ids) - 1 + while sequence_ids[token_end_index] != (1 if pad_on_right else 0): + token_end_index -= 1 + + # Detect if the answer is out of the span + # (in which case this feature is labeled with the CLS index). + if not ( + offsets[token_start_index][0] <= start_char + and offsets[token_end_index][1] >= end_char + ): + tokenized_examples["start_positions"].append(cls_index) + tokenized_examples["end_positions"].append(cls_index) + else: + # Otherwise move the token_start_index and + # stoken_end_index to the two ends of the answer. + # Note: we could go after the last offset + # if the answer is the last word (edge case). 
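+ # The two loops below advance past the answer and then step back one token, so the recorded start/end indices land on the first and last tokens that overlap the answer characters.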
+ while ( + token_start_index < len(offsets) + and offsets[token_start_index][0] <= start_char + ): + token_start_index += 1 + tokenized_examples["start_positions"].append(token_start_index - 1) + while offsets[token_end_index][1] >= end_char: + token_end_index -= 1 + tokenized_examples["end_positions"].append(token_end_index + 1) + + return tokenized_examples + + def prepare_test_features(self, examples): + + """Generate tokenized validation features from examples. + + Args: + examples (dict): The validation examples to be tokenized. + + Returns: + transformers.tokenization_utils_base.BatchEncoding: + The tokenized features/examples for validation set after processing. + """ + + # Tokenize our examples with truncation and maybe + # padding, but keep the overflows using a stride. + # This results in one example possible giving several features + # when a context is long, each of those features having a + # context that overlaps a bit the context of the previous feature. + print("### Tokenizing Validation Examples") + pad_on_right = self.tokenizer.padding_side == "right" + tokenized_examples = self.tokenizer( + examples["question" if pad_on_right else "context"], + examples["context" if pad_on_right else "question"], + **dict(self.config.tokenizer_params), + ) + + # Since one example might give us several features if it has a long context, + # we need a map from a feature to its corresponding example. This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + + # We keep the example_id that gave us this feature and we will store the offset mappings. + tokenized_examples["example_id"] = [] + + for i in range(len(tokenized_examples["input_ids"])): + # Grab the sequence corresponding to that example + # (to know what is the context and what is the question). + sequence_ids = tokenized_examples.sequence_ids(i) + context_index = 1 if pad_on_right else 0 + + # One example can give several spans, + # this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + tokenized_examples["example_id"].append(str(examples["id"][sample_index])) + + # Set to None the offset_mapping that are not part + # of the context so it's easy to determine if a token + # position is part of the context or not. 
+ tokenized_examples["offset_mapping"][i] = [ + (o if sequence_ids[k] == context_index else None) + for k, o in enumerate(tokenized_examples["offset_mapping"][i]) + ] + + return tokenized_examples diff --git a/src/datasets/toxic_spans_tokens.py b/src/datasets/toxic_spans_tokens.py new file mode 100644 index 0000000000000000000000000000000000000000..d48c2b7d66e57ca27bb33da80373ac86c9856c15 --- /dev/null +++ b/src/datasets/toxic_spans_tokens.py @@ -0,0 +1,81 @@ +from src.utils.mapper import configmapper +from transformers import AutoTokenizer +from datasets import load_dataset + +# import pdb + +@configmapper.map("datasets", "toxic_spans_tokens") +class ToxicSpansTokenDataset: + def __init__(self, config): + # print("### ToxicSpansTokenDataset ###"); exit() + self.config = config + self.tokenizer = AutoTokenizer.from_pretrained( + self.config.model_checkpoint_name + ) + # if self.config.model_checkpoint_name == "sberbank-ai/mGPT": + # self.tokenizer.add_special_tokens({'pad_token': '[PAD]'}) + self.dataset = load_dataset("csv", data_files=dict(self.config.train_files)) + self.test_dataset = load_dataset("csv", data_files=dict(self.config.eval_files)) + + self.tokenized_inputs = self.dataset.map( + self.tokenize_and_align_labels_for_train, batched=True + ) + self.test_tokenized_inputs = self.test_dataset.map( + self.tokenize_for_test, batched=True + ) + + def tokenize_and_align_labels_for_train(self, examples): + + tokenized_inputs = self.tokenizer( + examples["text"], **self.config.tokenizer_params + ) + + # tokenized_inputs["text"] = examples["text"] + example_spans = [] + labels = [] + + offsets_mapping = tokenized_inputs["offset_mapping"] + # pdb.set_trace() + for i, offset_mapping in enumerate(offsets_mapping): + labels.append([]) + + spans = eval(examples["spans"][i]) + example_spans.append(spans) + if self.config.label_cls: + cls_label = ( + 1 + if ( + len(examples["text"][i]) > 0 + and len(spans) / len(examples["text"][i]) + > self.config.cls_threshold + ) + else 0 + ) ## Make class label based on threshold + else: + cls_label = -100 + for j, offsets in enumerate(offset_mapping): + if tokenized_inputs["input_ids"][i][j] == self.tokenizer.cls_token_id: + labels[-1].append(cls_label) + elif offsets[0] == offsets[1] and offsets[0] == 0: # All zero + labels[-1].append(-100) ## SPECIAL TOKEN + else: + toxic_offsets = [x in spans for x in range(offsets[0], offsets[1])] + ## If any part of the the token is in span, mark it as Toxic + if ( + len(toxic_offsets) > 0 + and sum(toxic_offsets) / len(toxic_offsets) + > self.config.token_threshold + ): + labels[-1].append(1) + else: + labels[-1].append(0) + + tokenized_inputs["labels"] = labels + # print("tokenized_inputs", tokenized_inputs); exit() + return tokenized_inputs + + def tokenize_for_test(self, examples): + tokenized_inputs = self.tokenizer( + examples["text"], **self.config.tokenizer_params + ) + return tokenized_inputs diff --git a/src/datasets/toxic_spans_tokens_3cls.py b/src/datasets/toxic_spans_tokens_3cls.py new file mode 100644 index 0000000000000000000000000000000000000000..7bdb664aca1834d9514e5f98dc7effd1564560f5 --- /dev/null +++ b/src/datasets/toxic_spans_tokens_3cls.py @@ -0,0 +1,102 @@ +from src.utils.mapper import configmapper +from transformers import AutoTokenizer +from datasets import load_dataset + +import pdb + +@configmapper.map("datasets", "toxic_spans_tokens_3cls") +class ToxicSpansToken3CLSDataset: + def __init__(self, config): + # print("### ToxicSpansTokenDataset ###"); exit() + self.config = config + 
self.tokenizer = AutoTokenizer.from_pretrained( + self.config.model_checkpoint_name + ) + # if self.config.model_checkpoint_name == "sberbank-ai/mGPT": + # self.tokenizer.add_special_tokens({'pad_token': '[PAD]'}) + self.dataset = load_dataset("csv", data_files=dict(self.config.train_files)) + self.test_dataset = load_dataset("csv", data_files=dict(self.config.eval_files)) + + self.tokenized_inputs = self.dataset.map( + self.tokenize_and_align_labels_for_train, batched=True + ) + self.test_tokenized_inputs = self.test_dataset.map( + self.tokenize_for_test, batched=True + ) + + def tokenize_and_align_labels_for_train(self, examples): + + tokenized_inputs = self.tokenizer( + examples["text"], **self.config.tokenizer_params + ) + + # tokenized_inputs["text"] = examples["text"] + example_spans = [] + labels = [] + + offsets_mapping = tokenized_inputs["offset_mapping"] + # pdb.set_trace() + for i, offset_mapping in enumerate(offsets_mapping): + labels.append([]) + + spans = eval(examples["spans"][i]) + Bs = eval(examples["Bs"][i]) + Is = eval(examples["Is"][i]) + example_spans.append(spans) + if self.config.label_cls: + cls_label = ( + 1 + if ( + len(examples["text"][i]) > 0 + and len(spans) / len(examples["text"][i]) + > self.config.cls_threshold + ) + else 0 + ) ## Make class label based on threshold + else: + cls_label = -100 + for j, offsets in enumerate(offset_mapping): + if tokenized_inputs["input_ids"][i][j] == self.tokenizer.cls_token_id: + labels[-1].append(cls_label) + elif offsets[0] == offsets[1] and offsets[0] == 0: # All zero + labels[-1].append(-100) ## SPECIAL TOKEN + else: + # toxic_offsets = [x in spans for x in range(offsets[0], offsets[1])] + ## If any part of the the token is in span, mark it as Toxic + # if ( + # len(toxic_offsets) > 0 + # and sum(toxic_offsets) / len(toxic_offsets) + # > self.config.token_threshold + # ): + # labels[-1].append(1) + # else: + # labels[-1].append(0) + b_off = [x in Bs for x in range(offsets[0], offsets[1])] + b_off = sum(b_off) + i_off = [x in Is for x in range(offsets[0], offsets[1])] + i_off = sum(i_off) + # if len(b_off) == len(i_off) and len(i_off) == 0: + if b_off == 0 and i_off == 0: + labels[-1].append(0) + # elif len(b_off) >= len(i_off) == 1: + elif b_off >= i_off: + labels[-1].append(1) + # print(b_off) + # print(i_off) + # print(j) + else: + labels[-1].append(2) + + # pdb.set_trace() + + + + tokenized_inputs["labels"] = labels + # print("tokenized_inputs", tokenized_inputs); exit() + return tokenized_inputs + + def tokenize_for_test(self, examples): + tokenized_inputs = self.tokenizer( + examples["text"], **self.config.tokenizer_params + ) + return tokenized_inputs diff --git a/src/datasets/toxic_spans_tokens_spans.py b/src/datasets/toxic_spans_tokens_spans.py new file mode 100644 index 0000000000000000000000000000000000000000..bd1b6e68d3b115b1cd0be0b417e34da31ab4ee0c --- /dev/null +++ b/src/datasets/toxic_spans_tokens_spans.py @@ -0,0 +1,269 @@ +from src.utils.mapper import configmapper +from transformers import AutoTokenizer +import pandas as pd +from datasets import load_dataset, Dataset +from evaluation.fix_spans import _contiguous_ranges + + +@configmapper.map("datasets", "toxic_spans_tokens_spans") +class ToxicSpansTokensSpansDataset: + def __init__(self, config): + self.config = config + self.tokenizer = AutoTokenizer.from_pretrained( + self.config.model_checkpoint_name + ) + + self.dataset = load_dataset("csv", data_files=dict(self.config.train_files)) + self.test_dataset = load_dataset("csv", 
data_files=dict(self.config.eval_files)) + + temp_key_train = list(self.dataset.keys())[0] + self.intermediate_dataset = self.dataset.map( + self.create_train_features, + batched=True, + batch_size=1000000, ##Unusually Large Batch Size ## Needed For Correct ID mapping + remove_columns=self.dataset[temp_key_train].column_names, + ) + + temp_key_test = list(self.test_dataset.keys())[0] + self.intermediate_test_dataset = self.test_dataset.map( + self.create_test_features, + batched=True, + batch_size=1000000, ##Unusually Large Batch Size ## Needed For Correct ID mapping + remove_columns=self.test_dataset[temp_key_test].column_names, + ) + + self.tokenized_inputs = self.intermediate_dataset.map( + self.prepare_train_features, + batched=True, + remove_columns=self.intermediate_dataset[temp_key_train].column_names, + ) + self.test_tokenized_inputs = self.intermediate_test_dataset.map( + self.prepare_test_features, + batched=True, + remove_columns=self.intermediate_test_dataset[temp_key_test].column_names, + ) + + def create_train_features(self, examples): + features = {"context": [], "id": [], "question": [], "title": [], "spans": []} + id = 0 + # print(examples) + for row_number in range(len(examples["text"])): + context = examples["text"][row_number] + question = "offense" + title = context.split(" ")[0] + span = eval(examples["spans"][row_number]) + contiguous_spans = _contiguous_ranges(span) + for lst in contiguous_spans: + lst = list(lst) + dict_to_write = {} + + dict_to_write["answer_start"] = [lst[0]] + dict_to_write["text"] = [context[lst[0] : lst[-1] + 1]] + # print(dict_to_write) + if "answers" in features.keys(): + features["answers"].append(dict_to_write) + else: + features["answers"] = [ + dict_to_write, + ] + features["context"].append(context) + features["id"].append(str(id)) + features["question"].append(question) + features["title"].append(title) + features["spans"].append(span) + id += 1 + + return features + + def create_test_features(self, examples): + features = {"context": [], "id": [], "question": [], "title": []} + id = 0 + for row_number in range(len(examples["text"])): + context = examples["text"][row_number] + question = "offense" + title = context.split(" ")[0] + features["context"].append(context) + features["id"].append(str(id)) + features["question"].append(question) + features["title"].append(title) + id += 1 + return features + + def prepare_train_features(self, examples): + """Generate tokenized features from examples. + + Args: + examples (dict): The examples to be tokenized. + + Returns: + transformers.tokenization_utils_base.BatchEncoding: + The tokenized features/examples after processing. + """ + # Tokenize our examples with truncation and padding, but keep the + # overflows using a stride. This results in one example possible + # giving several features when a context is long, each of those + # features having a context that overlaps a bit the context + # of the previous feature. + pad_on_right = self.tokenizer.padding_side == "right" + print("### Batch Tokenizing Examples ###") + tokenized_examples = self.tokenizer( + examples["question" if pad_on_right else "context"], + examples["context" if pad_on_right else "question"], + **dict(self.config.tokenizer_params), + ) + + # Since one example might give us several features if it has + # a long context, we need a map from a feature to + # its corresponding example. This key gives us just that. 
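+ # (overflow_to_sample_mapping and offset_mapping are returned only when tokenizer_params sets return_overflowing_tokens=True and return_offsets_mapping=True, which this config presumably does)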
+ sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + # The offset mappings will give us a map from token to + # character position in the original context. This will + # help us compute the start_positions and end_positions. + offset_mapping = tokenized_examples.pop("offset_mapping") + + # Let's label those examples! + token_labels = [] + tokenized_examples["start_positions"] = [] + tokenized_examples["end_positions"] = [] + + for i, offsets in enumerate(offset_mapping): + # We will label impossible answers with the index of the CLS token. + + token_labels.append([]) + input_ids = tokenized_examples["input_ids"][i] + spans = examples["spans"][i] + if self.config.label_cls: + cls_label = ( + 1 + if ( + len(examples["context"][i]) > 0 + and len(spans) / len(examples["context"][i]) + > self.config.cls_threshold + ) + else 0 + ) ## Make class label based on threshold + else: + cls_label = -100 + for j, offset in enumerate(offsets): + if tokenized_examples["input_ids"][i][j] == self.tokenizer.cls_token_id: + token_labels[-1].append(cls_label) + elif offset[0] == offset[1] and offset[0] == 0: + token_labels[-1].append(-100) ## SPECIAL TOKEN + else: + toxic_offsets = [x in spans for x in range(offset[0], offset[1])] + ## If any part of the the token is in span, mark it as Toxic + if ( + len(toxic_offsets) > 0 + and sum(toxic_offsets) / len(toxic_offsets) + > self.config.token_threshold + ): + token_labels[-1].append(1) + else: + token_labels[-1].append(0) + + cls_index = input_ids.index(self.tokenizer.cls_token_id) + + # Grab the sequence corresponding to that example + # (to know what is the context and what is the question). + sequence_ids = tokenized_examples.sequence_ids(i) + + # One example can give several spans, this is the index of + # the example containing this span of text. + sample_index = sample_mapping[i] + answers = examples["answers"][sample_index] + # If no answers are given, set the cls_index as answer. + if len(answers["answer_start"]) == 0: + tokenized_examples["start_positions"].append(cls_index) + tokenized_examples["end_positions"].append(cls_index) + else: + # Start/end character index of the answer in the text. + start_char = answers["answer_start"][0] + end_char = start_char + len(answers["text"][0]) + + # Start token index of the current span in the text. + token_start_index = 0 + while sequence_ids[token_start_index] != (1 if pad_on_right else 0): + token_start_index += 1 + + # End token index of the current span in the text. + token_end_index = len(input_ids) - 1 + while sequence_ids[token_end_index] != (1 if pad_on_right else 0): + token_end_index -= 1 + + # Detect if the answer is out of the span + # (in which case this feature is labeled with the CLS index). + if not ( + offsets[token_start_index][0] <= start_char + and offsets[token_end_index][1] >= end_char + ): + tokenized_examples["start_positions"].append(cls_index) + tokenized_examples["end_positions"].append(cls_index) + else: + # Otherwise move the token_start_index and + # stoken_end_index to the two ends of the answer. + # Note: we could go after the last offset + # if the answer is the last word (edge case). 
+ while ( + token_start_index < len(offsets) + and offsets[token_start_index][0] <= start_char + ): + token_start_index += 1 + tokenized_examples["start_positions"].append(token_start_index - 1) + while offsets[token_end_index][1] >= end_char: + token_end_index -= 1 + tokenized_examples["end_positions"].append(token_end_index + 1) + tokenized_examples["labels"] = token_labels + return tokenized_examples + + def prepare_test_features(self, examples): + + """Generate tokenized validation features from examples. + + Args: + examples (dict): The validation examples to be tokenized. + + Returns: + transformers.tokenization_utils_base.BatchEncoding: + The tokenized features/examples for validation set after processing. + """ + + # Tokenize our examples with truncation and maybe + # padding, but keep the overflows using a stride. + # This results in one example possible giving several features + # when a context is long, each of those features having a + # context that overlaps a bit the context of the previous feature. + print("### Tokenizing Validation Examples") + pad_on_right = self.tokenizer.padding_side == "right" + tokenized_examples = self.tokenizer( + examples["question" if pad_on_right else "context"], + examples["context" if pad_on_right else "question"], + **dict(self.config.tokenizer_params), + ) + + # Since one example might give us several features if it has a long context, + # we need a map from a feature to its corresponding example. This key gives us just that. + sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping") + + # We keep the example_id that gave us this feature and we will store the offset mappings. + tokenized_examples["example_id"] = [] + + for i in range(len(tokenized_examples["input_ids"])): + # Grab the sequence corresponding to that example + # (to know what is the context and what is the question). + sequence_ids = tokenized_examples.sequence_ids(i) + context_index = 1 if pad_on_right else 0 + + # One example can give several spans, + # this is the index of the example containing this span of text. + sample_index = sample_mapping[i] + tokenized_examples["example_id"].append(str(examples["id"][sample_index])) + + # Set to None the offset_mapping that are not part + # of the context so it's easy to determine if a token + # position is part of the context or not. 
+ tokenized_examples["offset_mapping"][i] = [ + (o if sequence_ids[k] == context_index else None) + for k, o in enumerate(tokenized_examples["offset_mapping"][i]) + ] + + return tokenized_examples diff --git a/src/models/__init__.py b/src/models/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..aa5c0b1e176859f857aff494aa67e27206039c84 --- /dev/null +++ b/src/models/__init__.py @@ -0,0 +1,7 @@ +from src.models.auto_models import * +from src.models.bert_token_spans import * +from src.models.roberta_token_spans import * +from src.models.bert_multi_spans import * +from src.models.roberta_multi_spans import * +from src.models.bert_crf_token import * +from src.models.roberta_crf_token import * \ No newline at end of file diff --git a/src/models/__pycache__/__init__.cpython-38.pyc b/src/models/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cd2ee719d2d18a32aaa262804ca6a88c398036b2 Binary files /dev/null and b/src/models/__pycache__/__init__.cpython-38.pyc differ diff --git a/src/models/__pycache__/auto_models.cpython-38.pyc b/src/models/__pycache__/auto_models.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0ead5a4736f87412e204c35e68cfc6a2b319b4d0 Binary files /dev/null and b/src/models/__pycache__/auto_models.cpython-38.pyc differ diff --git a/src/models/__pycache__/bert_crf_token.cpython-38.pyc b/src/models/__pycache__/bert_crf_token.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0c4200b2d3bb4040bf4684bd71b4f10183827148 Binary files /dev/null and b/src/models/__pycache__/bert_crf_token.cpython-38.pyc differ diff --git a/src/models/__pycache__/bert_multi_spans.cpython-38.pyc b/src/models/__pycache__/bert_multi_spans.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..961f794199c3c98733b67c464015ec260459c767 Binary files /dev/null and b/src/models/__pycache__/bert_multi_spans.cpython-38.pyc differ diff --git a/src/models/__pycache__/bert_token_spans.cpython-38.pyc b/src/models/__pycache__/bert_token_spans.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4347408c463b2e9f55cc51f13265e14c841eb995 Binary files /dev/null and b/src/models/__pycache__/bert_token_spans.cpython-38.pyc differ diff --git a/src/models/__pycache__/roberta_crf_token.cpython-38.pyc b/src/models/__pycache__/roberta_crf_token.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e0dba29bbbb76c516cd9cb12227ec9d1da6937c Binary files /dev/null and b/src/models/__pycache__/roberta_crf_token.cpython-38.pyc differ diff --git a/src/models/__pycache__/roberta_multi_spans.cpython-38.pyc b/src/models/__pycache__/roberta_multi_spans.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ac653a17f849575090449f0eb40c9240ec6182c7 Binary files /dev/null and b/src/models/__pycache__/roberta_multi_spans.cpython-38.pyc differ diff --git a/src/models/__pycache__/roberta_token_spans.cpython-38.pyc b/src/models/__pycache__/roberta_token_spans.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f18266dedc1620d040c529b989143ee88d6d74a9 Binary files /dev/null and b/src/models/__pycache__/roberta_token_spans.cpython-38.pyc differ diff --git a/src/models/auto_models.py b/src/models/auto_models.py new file mode 100644 index 0000000000000000000000000000000000000000..df89ff6548111df9bca5a070a039814767aad8de --- /dev/null +++ 
b/src/models/auto_models.py @@ -0,0 +1,6 @@ +from transformers import AutoModelForTokenClassification, AutoModelForQuestionAnswering +from src.utils.mapper import configmapper + +configmapper.map("models", "autotoken")(AutoModelForTokenClassification) +configmapper.map("models", "autotoken_3cls")(AutoModelForTokenClassification) +configmapper.map("models", "autospans")(AutoModelForQuestionAnswering) diff --git a/src/models/bert_crf_token.py b/src/models/bert_crf_token.py new file mode 100644 index 0000000000000000000000000000000000000000..71466dcb2da1a5aaedd31faa761e3343f7c0ed5c --- /dev/null +++ b/src/models/bert_crf_token.py @@ -0,0 +1,72 @@ +import torch +# from transformers import BertForTokenClassification +from transformers import ElectraForTokenClassification +from torchcrf import CRF +from src.utils.mapper import configmapper +# import pdb + + +@configmapper.map("models", "bert_crf_token") +# class BertLSTMCRF(BertForTokenClassification): +class BertLSTMCRF(ElectraForTokenClassification): + def __init__(self, config, lstm_hidden_size, lstm_layers): + super().__init__(config) + # ipdb.set_trace() + self.lstm = torch.nn.LSTM( + input_size=config.hidden_size, + hidden_size=lstm_hidden_size, + num_layers=lstm_layers, + dropout=0.2, + batch_first=True, + bidirectional=True, + ) + self.crf = CRF(config.num_labels, batch_first=True) + + del self.classifier + self.classifier = torch.nn.Linear(2 * lstm_hidden_size, config.num_labels) + + def forward( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + labels=None, + prediction_mask=None, + ): + # pdb.set_trace() + + # outputs = self.bert( + outputs = self.electra( + input_ids, + attention_mask, + token_type_ids, + output_hidden_states=True, + return_dict=False, + ) + # seq_output, all_hidden_states, all_self_attntions, all_cross_attentions + + sequence_output = outputs[0] # outputs[1] is pooled output which is none. 
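+ # The contextual embeddings go through dropout, a bidirectional LSTM, and a linear layer to produce per-token emission scores, which the CRF consumes together with prediction_mask.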
+ + sequence_output = self.dropout(sequence_output) + + lstm_out, *_ = self.lstm(sequence_output) + sequence_output = self.dropout(lstm_out) + + logits = self.classifier(sequence_output) + + ## CRF + mask = prediction_mask + mask = mask[:, : logits.size(1)].contiguous() + + # print(logits) + + if labels is not None: + labels = labels[:, : logits.size(1)].contiguous() + loss = -self.crf(logits, labels, mask=mask.bool(), reduction="token_mean") + + tags = self.crf.decode(logits, mask.bool()) + # print(tags) + if labels is not None: + return (loss, logits, tags) + else: + return (logits, tags) diff --git a/src/models/bert_multi_spans.py b/src/models/bert_multi_spans.py new file mode 100644 index 0000000000000000000000000000000000000000..4d9c92803af525958bf453735880db1681743afc --- /dev/null +++ b/src/models/bert_multi_spans.py @@ -0,0 +1,84 @@ +import torch.nn as nn +from torch.nn import BCEWithLogitsLoss +# from transformers import BertModel, BertPreTrainedModel +from transformers import ElectraPreTrainedModel, ElectraModel +from src.utils.mapper import configmapper + + +@configmapper.map("models", "bert_multi_spans") +# class BertForMultiSpans(BertPreTrainedModel): +class BertForMultiSpans(ElectraPreTrainedModel): + def __init__(self, config): + super(BertForMultiSpans, self).__init__(config) + # self.bert = BertModel(config) + self.bert = ElectraModel(config) + self.num_labels = config.num_labels + + # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version + # self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + self.init_weights() + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + ): + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=None, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) # batch_size + # print(start_logits.shape, end_logits.shape, start_positions.shape, end_positions.shape) + + total_loss = None + if ( + start_positions is not None and end_positions is not None + ): # [batch_size/seq_length] + # # If we are on multi-GPU, split add a dimension + # if len(start_positions.size()) > 1: + # start_positions = start_positions.squeeze(-1) + # if len(end_positions.size()) > 1: + # end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + # ignored_index = start_logits.size(1) + # start_positions.clamp_(0, ignored_index) + # end_positions.clamp_(0, ignored_index) + + # start_positions = start_logits.view() + + loss_fct = BCEWithLogitsLoss() + + start_loss = loss = loss_fct( + start_logits, + start_positions.float(), + ) + end_loss = loss = loss_fct( + end_logits, + end_positions.float(), + ) + total_loss = (start_loss + end_loss) / 2 + + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output \ No newline at end of file diff 
--git a/src/models/bert_token_spans.py b/src/models/bert_token_spans.py new file mode 100644 index 0000000000000000000000000000000000000000..b2d2d6fc4f2e12c557a5fe2a30e41a14085acdd8 --- /dev/null +++ b/src/models/bert_token_spans.py @@ -0,0 +1,100 @@ +import torch.nn as nn +import torch +from torch.nn import CrossEntropyLoss +# from transformers import BertPreTrainedModel, BertModel +from transformers import ElectraPreTrainedModel, ElectraModel +from src.utils.mapper import configmapper + + +@configmapper.map("models", "bert_token_spans") +# class BertModelForTokenAndSpans(BertPreTrainedModel): +class BertModelForTokenAndSpans(ElectraPreTrainedModel): + def __init__(self, config, num_token_labels=2, num_qa_labels=2): + super(BertModelForTokenAndSpans, self).__init__(config) + # self.bert = BertModel(config) + self.bert = ElectraModel(config) + self.num_token_labels = num_token_labels + self.num_qa_labels = num_qa_labels + # print("Number of Token Labels: ", num_token_labels); exit() + + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, num_token_labels) + self.qa_outputs = nn.Linear(config.hidden_size, num_qa_labels) + self.init_weights() + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + labels=None, # Token Wise Labels + output_attentions=None, + output_hidden_states=None, + ): + + outputs = self.bert( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=None, + ) + + sequence_output = outputs[0] + + qa_logits = self.qa_outputs(sequence_output) + start_logits, end_logits = qa_logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + sequence_output = self.dropout(sequence_output) + token_logits = self.classifier(sequence_output) + + total_loss = None + if ( + start_positions is not None + and end_positions is not None + and labels is not None + ): + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + + ignored_index = start_logits.size(1) + start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + + loss_fct = CrossEntropyLoss() + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = token_logits.view(-1, self.num_token_labels) + active_labels = torch.where( + active_loss, + labels.view(-1), + torch.tensor(loss_fct.ignore_index).type_as(labels), + ) + token_loss = loss_fct(active_logits, active_labels) + else: + token_loss = loss_fct( + token_logits.view(-1, self.num_token_labels), labels.view(-1) + ) + + total_loss = (start_loss + end_loss) / 2 + token_loss + + output = (start_logits, end_logits, token_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output \ No newline at end of file diff --git a/src/models/roberta_crf_token.py b/src/models/roberta_crf_token.py new file mode 100644 index 
0000000000000000000000000000000000000000..1dc34e96dd9d5798eda2634f18c10d98a05514b6 --- /dev/null +++ b/src/models/roberta_crf_token.py @@ -0,0 +1,66 @@ +import torch +from transformers import RobertaForTokenClassification +from torchcrf import CRF +from src.utils.mapper import configmapper + + +@configmapper.map("models", "roberta_crf_token") +class RobertaLSTMCRF(RobertaForTokenClassification): + def __init__(self, config, lstm_hidden_size, lstm_layers): + super().__init__(config) + self.lstm = torch.nn.LSTM( + input_size=config.hidden_size, + hidden_size=lstm_hidden_size, + num_layers=lstm_layers, + dropout=0.2, + batch_first=True, + bidirectional=True, + ) + self.crf = CRF(config.num_labels, batch_first=True) + + del self.classifier + self.classifier = torch.nn.Linear(2 * lstm_hidden_size, config.num_labels) + + def forward( + self, + input_ids, + attention_mask=None, + token_type_ids=None, + labels=None, + prediction_mask=None, + ): + + outputs = self.roberta( + input_ids, + attention_mask, + token_type_ids, + output_hidden_states=True, + return_dict=False, + ) + # seq_output, all_hidden_states, all_self_attntions, all_cross_attentions + + sequence_output = outputs[0] # outputs[1] is pooled output which is none. + + sequence_output = self.dropout(sequence_output) + + lstm_out, *_ = self.lstm(sequence_output) + sequence_output = self.dropout(lstm_out) + + logits = self.classifier(sequence_output) + + ## CRF + mask = prediction_mask + mask = mask[:, : logits.size(1)].contiguous() + + # print(logits) + + if labels is not None: + labels = labels[:, : logits.size(1)].contiguous() + loss = -self.crf(logits, labels, mask=mask.bool(), reduction="token_mean") + + tags = self.crf.decode(logits, mask.bool()) + # print(tags) + if labels is not None: + return (loss, logits, tags) + else: + return (logits, tags) diff --git a/src/models/roberta_multi_spans.py b/src/models/roberta_multi_spans.py new file mode 100644 index 0000000000000000000000000000000000000000..a3514552138df7ceb518a7c99c148c37659f5178 --- /dev/null +++ b/src/models/roberta_multi_spans.py @@ -0,0 +1,82 @@ +import torch.nn as nn +from torch.nn import BCEWithLogitsLoss +from transformers import RobertaModel +from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel +from src.utils.mapper import configmapper + + +@configmapper.map("models", "roberta_multi_spans") +class RobertaForMultiSpans(RobertaPreTrainedModel): + def __init__(self, config): + super(RobertaForMultiSpans, self).__init__(config) + self.roberta = RobertaModel(config) + self.num_labels = config.num_labels + + # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version + # self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels) + self.init_weights() + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + output_attentions=None, + output_hidden_states=None, + ): + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=None, + ) + + sequence_output = outputs[0] + + logits = self.qa_outputs(sequence_output) + start_logits, end_logits = logits.split(1, dim=-1) + start_logits 
= start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) # batch_size + # print(start_logits.shape, end_logits.shape, start_positions.shape, end_positions.shape) + + total_loss = None + if ( + start_positions is not None and end_positions is not None + ): # [batch_size/seq_length] + # # If we are on multi-GPU, split add a dimension + # if len(start_positions.size()) > 1: + # start_positions = start_positions.squeeze(-1) + # if len(end_positions.size()) > 1: + # end_positions = end_positions.squeeze(-1) + # sometimes the start/end positions are outside our model inputs, we ignore these terms + # ignored_index = start_logits.size(1) + # start_positions.clamp_(0, ignored_index) + # end_positions.clamp_(0, ignored_index) + + # start_positions = start_logits.view() + + loss_fct = BCEWithLogitsLoss() + + start_loss = loss = loss_fct( + start_logits, + start_positions.float(), + ) + end_loss = loss = loss_fct( + end_logits, + end_positions.float(), + ) + total_loss = (start_loss + end_loss) / 2 + + output = (start_logits, end_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output \ No newline at end of file diff --git a/src/models/roberta_token_spans.py b/src/models/roberta_token_spans.py new file mode 100644 index 0000000000000000000000000000000000000000..8e010a42b1a006cbcbf691315bbe64c46dd450f5 --- /dev/null +++ b/src/models/roberta_token_spans.py @@ -0,0 +1,97 @@ +import torch.nn as nn +import torch +from torch.nn import CrossEntropyLoss +from transformers import RobertaModel +from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel +from src.utils.mapper import configmapper + + +@configmapper.map("models", "roberta_token_spans") +class RobertaModelForTokenAndSpans(RobertaPreTrainedModel): + def __init__(self, config, num_token_labels=2, num_qa_labels=2): + super(RobertaModelForTokenAndSpans, self).__init__(config) + self.roberta = RobertaModel(config) + self.num_token_labels = num_token_labels + self.num_qa_labels = num_qa_labels + + self.dropout = nn.Dropout(config.hidden_dropout_prob) + self.classifier = nn.Linear(config.hidden_size, num_token_labels) + self.qa_outputs = nn.Linear(config.hidden_size, num_qa_labels) + self.init_weights() + + def forward( + self, + input_ids=None, + attention_mask=None, + token_type_ids=None, + position_ids=None, + head_mask=None, + inputs_embeds=None, + start_positions=None, + end_positions=None, + labels=None, # Token Wise Labels + output_attentions=None, + output_hidden_states=None, + ): + + outputs = self.roberta( + input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=None, + ) + + sequence_output = outputs[0] + + qa_logits = self.qa_outputs(sequence_output) + start_logits, end_logits = qa_logits.split(1, dim=-1) + start_logits = start_logits.squeeze(-1) + end_logits = end_logits.squeeze(-1) + + sequence_output = self.dropout(sequence_output) + token_logits = self.classifier(sequence_output) + + total_loss = None + if ( + start_positions is not None + and end_positions is not None + and labels is not None + ): + # If we are on multi-GPU, split add a dimension + if len(start_positions.size()) > 1: + start_positions = start_positions.squeeze(-1) + if len(end_positions.size()) > 1: + end_positions = end_positions.squeeze(-1) + + ignored_index = start_logits.size(1) + 
start_positions.clamp_(0, ignored_index) + end_positions.clamp_(0, ignored_index) + + loss_fct = CrossEntropyLoss(ignore_index=ignored_index) + start_loss = loss_fct(start_logits, start_positions) + end_loss = loss_fct(end_logits, end_positions) + + loss_fct = CrossEntropyLoss() + if attention_mask is not None: + active_loss = attention_mask.view(-1) == 1 + active_logits = token_logits.view(-1, self.num_token_labels) + active_labels = torch.where( + active_loss, + labels.view(-1), + torch.tensor(loss_fct.ignore_index).type_as(labels), + ) + token_loss = loss_fct(active_logits, active_labels) + else: + token_loss = loss_fct( + token_logits.view(-1, self.num_token_labels), labels.view(-1) + ) + + total_loss = (start_loss + end_loss) / 2 + token_loss + + output = (start_logits, end_logits, token_logits) + outputs[2:] + return ((total_loss,) + output) if total_loss is not None else output \ No newline at end of file diff --git a/src/models/two_layer_nn.py b/src/models/two_layer_nn.py new file mode 100644 index 0000000000000000000000000000000000000000..969f22490cdca2eb483a3b650d0be9f9a996a433 --- /dev/null +++ b/src/models/two_layer_nn.py @@ -0,0 +1,46 @@ +"""Implements a two layer Neural Network.""" + +from torch.nn import Module, Linear, ReLU +from src.utils.mapper import configmapper + + +@configmapper.map("models", "two_layer_nn") +class TwoLayerNN(Module): + """Implements two layer neural network. + + Methods: + forward(x_input): Returns the output of the neural network. + """ + + def __init__(self, embedding, dims): + """Construct the two layer Neural Network. + + This method is used to initialize the two layer neural network, + with a given embedding type and corresponding arguments. + + Args: + embedding (torch.nn.Module): The embedding layer for the model. + dims (list): List of dimensions for the neural network, input to output. + """ + super(TwoLayerNN, self).__init__() + + self.embedding = embedding + self.linear1 = Linear(dims[0], dims[1]) + self.relu = ReLU() + self.linear2 = Linear(dims[1], dims[2]) + + def forward(self, x_input): + """ + Return the output of the neural network for an input. + + Args: + x_input (torch.Tensor): The input tensor to the neural network. + + Returns: + x_output (torch.Tensor): The output tensor for the neural network. 
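+
+        Example (an illustrative sketch; the dimensions are assumptions, and
+        dims[0] must equal the flattened output size of the embedding layer):
+            model = TwoLayerNN(embedding, dims=[1200, 128, 2])
+            logits = model(token_ids)  # token_ids: LongTensor of shape (batch, seq_len)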
+ """ + output = self.embedding(x_input) + output = self.linear1(output) + output = self.relu(output) + x_output = self.linear2(output) + return x_output diff --git a/src/modules/__init__.py b/src/modules/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/modules/__pycache__/__init__.cpython-38.pyc b/src/modules/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5290fbff4f68909ec67e34b430d32141ca8c6daa Binary files /dev/null and b/src/modules/__pycache__/__init__.cpython-38.pyc differ diff --git a/src/modules/__pycache__/embeddings.cpython-38.pyc b/src/modules/__pycache__/embeddings.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..16d8441cca8e5ee5156823a8b4eabe8a3865be80 Binary files /dev/null and b/src/modules/__pycache__/embeddings.cpython-38.pyc differ diff --git a/src/modules/__pycache__/preprocessors.cpython-38.pyc b/src/modules/__pycache__/preprocessors.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1a33a8d44f2f41672102d52a9afd86ace0056684 Binary files /dev/null and b/src/modules/__pycache__/preprocessors.cpython-38.pyc differ diff --git a/src/modules/__pycache__/tokenizers.cpython-38.pyc b/src/modules/__pycache__/tokenizers.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8613f355e4892f3ef3e4f89789fdd7420d8f3052 Binary files /dev/null and b/src/modules/__pycache__/tokenizers.cpython-38.pyc differ diff --git a/src/modules/activations.py b/src/modules/activations.py new file mode 100644 index 0000000000000000000000000000000000000000..34bc248e43817ca9ab0b674611580076a78fb066 --- /dev/null +++ b/src/modules/activations.py @@ -0,0 +1,6 @@ +import torch.nn as nn +from src.utils.mapper import configmapper + +configmapper.map("activations", "relu")(nn.ReLU) +configmapper.map("activations", "logsoftmax")(nn.LogSoftmax) +configmapper.map("activations", "softmax")(nn.Softmax) diff --git a/src/modules/embeddings.py b/src/modules/embeddings.py new file mode 100644 index 0000000000000000000000000000000000000000..6799ba751625036f5ca49d861a397a2bb4267bf8 --- /dev/null +++ b/src/modules/embeddings.py @@ -0,0 +1,37 @@ +"""Contains various kinds of embeddings like Glove, BERT, etc.""" + +from torch.nn import Module, Embedding, Flatten +from src.utils.mapper import configmapper + + +@configmapper.map("embeddings", "glove") +class GloveEmbedding(Module): + """Implement Glove based Word Embedding.""" + + def __init__(self, embedding_matrix, padding_idx, static=True): + """Construct GloveEmbedding. + + Args: + embedding_matrix (torch.Tensor): The matrix contrainining the embedding weights + padding_idx (int): The padding index in the tokenizer. + static (bool): Whether or not to freeze embeddings. + """ + super(GloveEmbedding, self).__init__() + self.embedding = Embedding.from_pretrained(embedding_matrix) + self.embedding.padding_idx = padding_idx + if static: + self.embedding.weight.required_grad = False + self.flatten = Flatten(start_dim=1) + + def forward(self, x_input): + """Pass the input through the embedding. 
+ + Args: + x_input (torch.Tensor): The numericalized tokenized input + + Returns: + x_output (torch.Tensor): The output from the embedding + """ + x_output = self.embedding(x_input) + x_output = self.flatten(x_output) + return x_output diff --git a/src/modules/losses.py b/src/modules/losses.py new file mode 100644 index 0000000000000000000000000000000000000000..0b470ac132400279cbd847ff3db04dd4aae97f31 --- /dev/null +++ b/src/modules/losses.py @@ -0,0 +1,6 @@ +"All criterion functions." +from torch.nn import MSELoss, CrossEntropyLoss +from src.utils.mapper import configmapper + +configmapper.map("losses", "mse")(MSELoss) +configmapper.map("losses", "CrossEntropyLoss")(CrossEntropyLoss) diff --git a/src/modules/metrics.py b/src/modules/metrics.py new file mode 100644 index 0000000000000000000000000000000000000000..24db6ab3a3905a1de2df85c91f848b4dfa889df0 --- /dev/null +++ b/src/modules/metrics.py @@ -0,0 +1,17 @@ +"""Metrics.""" +from sklearn.metrics import ( + mean_squared_error, + f1_score, + precision_score, + recall_score, + roc_auc_score, + accuracy_score, +) +from src.utils.mapper import configmapper + +configmapper.map("metrics", "sklearn_f1")(f1_score) +configmapper.map("metrics", "sklearn_p")(precision_score) +configmapper.map("metrics", "sklearn_r")(recall_score) +configmapper.map("metrics", "sklearn_roc")(roc_auc_score) +configmapper.map("metrics", "sklearn_acc")(accuracy_score) +configmapper.map("metrics", "sklearn_mse")(mean_squared_error) diff --git a/src/modules/optimizers.py b/src/modules/optimizers.py new file mode 100644 index 0000000000000000000000000000000000000000..ac2fd7365666b9386304f2d27c4f783f456e7cbb --- /dev/null +++ b/src/modules/optimizers.py @@ -0,0 +1,7 @@ +" Method containing activation functions" +from torch.optim import Adam, AdamW, SGD +from src.utils.mapper import configmapper + +configmapper.map("optimizers", "adam")(Adam) +configmapper.map("optimizers", "adam_w")(AdamW) +configmapper.map("optimizers", "sgd")(SGD) diff --git a/src/modules/preprocessors.py b/src/modules/preprocessors.py new file mode 100644 index 0000000000000000000000000000000000000000..d14359cc10417b857b9cf92c200ae061dc8fac71 --- /dev/null +++ b/src/modules/preprocessors.py @@ -0,0 +1,112 @@ +from src.modules.tokenizers import * +from src.modules.embeddings import * +from src.utils.mapper import configmapper + + +class Preprocessor: + def preprocess(self): + pass + + +@configmapper.map("preprocessors", "glove") +class GlovePreprocessor(Preprocessor): + """GlovePreprocessor.""" + + def __init__(self, config): + """ + Args: + config (src.utils.module.Config): configuration for preprocessor + """ + super(GlovePreprocessor, self).__init__() + self.config = config + self.tokenizer = configmapper.get_object( + "tokenizers", self.config.main.preprocessor.tokenizer.name + )(**self.config.main.preprocessor.tokenizer.init_params.as_dict()) + self.tokenizer_params = ( + self.config.main.preprocessor.tokenizer.init_vector_params.as_dict() + ) + + self.tokenizer.initialize_vectors(**self.tokenizer_params) + self.embeddings = configmapper.get_object( + "embeddings", self.config.main.preprocessor.embedding.name + )( + self.tokenizer.text_field.vocab.vectors, + self.tokenizer.text_field.vocab.stoi[self.tokenizer.text_field.pad_token], + ) + + def preprocess(self, model_config, data_config): + train_dataset = configmapper.get_object("datasets", data_config.main.name)( + data_config.train, self.tokenizer + ) + val_dataset = configmapper.get_object("datasets", data_config.main.name)( + data_config.val, 
self.tokenizer + ) + model = configmapper.get_object("models", model_config.name)( + self.embeddings, **model_config.params.as_dict() + ) + + return model, train_dataset, val_dataset + + +@configmapper.map("preprocessors", "clozePreprocessor") +class ClozePreprocessor(Preprocessor): + """GlovePreprocessor.""" + + def __init__(self, config): + """ + Args: + config (src.utils.module.Config): configuration for preprocessor + """ + super(ClozePreprocessor, self).__init__() + self.config = config + self.tokenizer = configmapper.get_object( + "tokenizers", self.config.main.preprocessor.tokenizer.name + ).from_pretrained( + **self.config.main.preprocessor.tokenizer.init_params.as_dict() + ) + + def preprocess(self, model_config, data_config): + train_dataset = configmapper.get_object("datasets", data_config.main.name)( + data_config.train, self.tokenizer + ) + val_dataset = configmapper.get_object("datasets", data_config.main.name)( + data_config.val, self.tokenizer + ) + model = configmapper.get_object("models", model_config.name).from_pretrained( + **model_config.params.as_dict() + ) + + return model, train_dataset, val_dataset + + +@configmapper.map("preprocessors", "transformersConcretenessPreprocessor") +class TransformersConcretenessPreprocessor(Preprocessor): + """BertConcretenessPreprocessor.""" + + def __init__(self, config): + """ + Args: + config (src.utils.module.Config): configuration for preprocessor + """ + super(TransformersConcretenessPreprocessor, self).__init__() + self.config = config + self.tokenizer = configmapper.get_object( + "tokenizers", self.config.main.preprocessor.tokenizer.name + ).from_pretrained( + **self.config.main.preprocessor.tokenizer.init_params.as_dict() + ) + + def preprocess(self, model_config, data_config): + + train_dataset = configmapper.get_object("datasets", data_config.main.name)( + data_config.train, self.tokenizer + ) + val_dataset = configmapper.get_object("datasets", data_config.main.name)( + data_config.val, self.tokenizer + ) + + model = configmapper.get_object("models", model_config.name)( + **model_config.params.as_dict() + ) + + return model, train_dataset, val_dataset diff --git a/src/modules/schedulers.py b/src/modules/schedulers.py new file mode 100644 index 0000000000000000000000000000000000000000..a41bb049aae0d00349c5493d3a17fcc017b76063 --- /dev/null +++ b/src/modules/schedulers.py @@ -0,0 +1,14 @@ +from torch.optim.lr_scheduler import ( + StepLR, + CosineAnnealingLR, + ReduceLROnPlateau, + CyclicLR, + CosineAnnealingWarmRestarts, +) +from src.utils.mapper import configmapper + +configmapper.map("schedulers", "step")(StepLR) +configmapper.map("schedulers", "cosineanneal")(CosineAnnealingLR) +configmapper.map("schedulers", "reduceplateau")(ReduceLROnPlateau) +configmapper.map("schedulers", "cyclic")(CyclicLR) +configmapper.map("schedulers", "cosineannealrestart")(CosineAnnealingWarmRestarts) diff --git a/src/modules/tokenizers.py b/src/modules/tokenizers.py new file mode 100644 index 0000000000000000000000000000000000000000..b7009261fbce92dad162591c8045cb53be268e0a --- /dev/null +++ b/src/modules/tokenizers.py @@ -0,0 +1,107 @@ +"""Contains tokenizers like GloveTokenizers and BERT Tokenizer.""" + +import torch +# from torchtext.vocab import GloVe +# from torchtext.data import Field, TabularDataset +from src.utils.mapper import configmapper +from transformers import AutoTokenizer + + +class Tokenizer: + """Abstract Class for Tokenizers.""" + + def tokenize(self): + """Abstract Method for tokenization.""" + + 
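+# NOTE: GloveTokenizer below relies on GloVe, Field and TabularDataset from
+# torchtext, whose imports are commented out at the top of this file; they must
+# be re-enabled (or supplied some other way) before this class can be used.
+# Illustrative usage (a sketch only; the file path and field name are placeholders):
+#     tokenizer = GloveTokenizer(name="840B", dim="300", cache="../embeddings/")
+#     tokenizer.initialize_vectors(fix_length=4, tokenize="spacy",
+#                                  tokenizer_file_paths=["../data/train.tsv"],
+#                                  file_format="tsv", fields=["text"])
+#     ids = tokenizer.tokenize("an example sentence")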
+@configmapper.map("tokenizers", "glove") +class GloveTokenizer(Tokenizer): + """Implement GloveTokenizer for tokenizing text for Glove Embeddings. + + Attributes: + embeddings (torchtext.vocab.Vectors): Loaded pre-trained embeddings. + text_field (torchtext.data.Field): Text_field for vector creation. + + Methods: + __init__(self, name='840B', dim='300', cache='../embeddings/') : Constructor method + initialize_vectors(fix_length=4, tokenize='spacy', file_path="../data/imperceptibility + /Concreteness Ratings/train/forty.csv", + file_format='tsv', fields=None): Initialize vocab vectors based on data. + + tokenize(x_input, **initializer_params): Tokenize given input and return the output. + """ + + def __init__(self, name="840B", dim="300", cache="../embeddings/"): + """Construct GloveTokenizer. + + Args: + name (str): Name of the GloVe embedding file + dim (str): Dimensions of the Glove embedding file + cache (str): Path to the embeddings directory + """ + super(GloveTokenizer, self).__init__() + self.embeddings = GloVe(name=name, dim=dim, cache=cache) + self.text_field = None + + def initialize_vectors( + self, + fix_length=4, + tokenize="spacy", + tokenizer_file_paths=None, + file_format="tsv", + fields=None, + ): + """Initialize words/sequences based on GloVe embedding. + + Args: + fields (list): The list containing the fields to be taken + and processed from the file (see documentation for + torchtext.data.TabularDataset) + fix_length (int): The length of the tokenized text, + padding or cropping is done accordingly + tokenize (function or string): Method to tokenize the data. + If 'spacy' uses spacy tokenizer, + else the specified method. + tokenizer_file_paths (list of str): The paths of the files containing the data + format (str): The format of the file : 'csv', 'tsv' or 'json' + """ + text_field = Field(batch_first=True, fix_length=fix_length, tokenize=tokenize) + tab_dats = [ + TabularDataset( + i, format=file_format, fields={k: (k, text_field) for k in fields} + ) + for i in tokenizer_file_paths + ] + text_field.build_vocab(*tab_dats) + text_field.vocab.load_vectors(self.embeddings) + self.text_field = text_field + + def tokenize(self, x_input, **init_vector__params): + """Tokenize given input based on initialized vectors. + + Initialize the vectors with given parameters if not already initialized. 
+ + Args: + x_input (str): Unprocessed input text to be tokenized + **initializer_params (Keyword arguments): Parameters to initialize vectors + + Returns: + x_output (str): Processed and tokenized text + """ + if self.text_field is None: + self.initialize_vectors(**init_vector__params) + try: + x_output = torch.squeeze( + self.text_field.process([self.text_field.preprocess(x_input)]) + ) + except Exception as e: + print(x_input) + print(self.text_field.preprocess(x_input)) + print(e) + return x_output + + +@configmapper.map("tokenizers", "AutoTokenizer") +class AutoTokenizer(AutoTokenizer): + def __init__(self, *args): + super(AutoTokenizer, self).__init__() diff --git a/src/trainers/__init__.py b/src/trainers/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/trainers/base_trainer.py b/src/trainers/base_trainer.py new file mode 100644 index 0000000000000000000000000000000000000000..a3867b5ad6a440f2c8f033f57ee8e58f5e731e85 --- /dev/null +++ b/src/trainers/base_trainer.py @@ -0,0 +1,563 @@ +import math +import os +import torch +from src.modules.optimizers import * +from src.modules.embeddings import * +from src.modules.schedulers import * +from src.modules.tokenizers import * +from src.modules.metrics import * +from src.modules.losses import * +from src.utils.misc import * +from src.utils.logger import Logger +from src.utils.mapper import configmapper +from src.utils.configuration import Config + +from torch.utils.data import DataLoader +from tqdm import tqdm + + +@configmapper.map("trainers", "base") +class BaseTrainer: + def __init__(self, config): + self._config = config + self.metrics = { + configmapper.get_object("metrics", metric["type"]): metric["params"] + for metric in self._config.main_config.metrics + } + self.train_config = self._config.train + self.val_config = self._config.val + self.log_label = self.train_config.log.log_label + if self.train_config.log_and_val_interval is not None: + self.val_log_together = True + print("Logging with label: ", self.log_label) + + def train(self, model, train_dataset, val_dataset=None, logger=None): + device = torch.device(self._config.main_config.device.name) + model.to(device) + optim_params = self.train_config.optimizer.params + if optim_params: + optimizer = configmapper.get_object( + "optimizers", self.train_config.optimizer.type + )(model.parameters(), **optim_params.as_dict()) + else: + optimizer = configmapper.get_object( + "optimizers", self.train_config.optimizer.type + )(model.parameters()) + + if self.train_config.scheduler is not None: + scheduler_params = self.train_config.scheduler.params + if scheduler_params: + scheduler = configmapper.get_object( + "schedulers", self.train_config.scheduler.type + )(optimizer, **scheduler_params.as_dict()) + else: + scheduler = configmapper.get_object( + "schedulers", self.train_config.scheduler.type + )(optimizer) + + criterion_params = self.train_config.criterion.params + if criterion_params: + criterion = configmapper.get_object( + "losses", self.train_config.criterion.type + )(**criterion_params.as_dict()) + else: + criterion = configmapper.get_object( + "losses", self.train_config.criterion.type + )() + if "custom_collate_fn" in dir(train_dataset): + train_loader = DataLoader( + dataset=train_dataset, + collate_fn=train_dataset.custom_collate_fn, + **self.train_config.loader_params.as_dict(), + ) + else: + train_loader = DataLoader( + dataset=train_dataset, **self.train_config.loader_params.as_dict() 
+ ) + # train_logger = Logger(**self.train_config.log.logger_params.as_dict()) + + max_epochs = self.train_config.max_epochs + batch_size = self.train_config.loader_params.batch_size + + if self.val_log_together: + val_interval = self.train_config.log_and_val_interval + log_interval = val_interval + else: + val_interval = self.train_config.val_interval + log_interval = self.train_config.log.log_interval + + if logger is None: + train_logger = Logger(**self.train_config.log.logger_params.as_dict()) + else: + train_logger = logger + + train_log_values = self.train_config.log.values.as_dict() + + best_score = ( + -math.inf if self.train_config.save_on.desired == "max" else math.inf + ) + save_on_score = self.train_config.save_on.score + best_step = -1 + best_model = None + + best_hparam_list = None + best_hparam_name_list = None + best_metrics_list = None + best_metrics_name_list = None + + # print("\nTraining\n") + # print(max_steps) + + global_step = 0 + for epoch in range(1, max_epochs + 1): + print( + "Epoch: {}/{}, Global Step: {}".format(epoch, max_epochs, global_step) + ) + train_loss = 0 + val_loss = 0 + + if(self.train_config.label_type=='float'): + all_labels = torch.FloatTensor().to(device) + else: + all_labels = torch.LongTensor().to(device) + + all_outputs = torch.Tensor().to(device) + + train_scores = None + val_scores = None + + pbar = tqdm(total=math.ceil(len(train_dataset) / batch_size)) + pbar.set_description("Epoch " + str(epoch)) + + val_counter = 0 + + for step, batch in enumerate(train_loader): + model.train() + optimizer.zero_grad() + inputs, labels = batch + + if(self.train_config.label_type=='float'): ##Specific to Float Type + labels = labels.float() + + for key in inputs: + inputs[key] = inputs[key].to(device) + labels = labels.to(device) + outputs = model(inputs) + loss = criterion(torch.squeeze(outputs), labels) + loss.backward() + + all_labels = torch.cat((all_labels, labels), 0) + + if (self.train_config.label_type=='float'): + all_outputs = torch.cat((all_outputs, outputs), 0) + else: + all_outputs = torch.cat((all_outputs, torch.argmax(outputs, axis=1)), 0) + + + train_loss += loss.item() + optimizer.step() + + if self.train_config.scheduler is not None: + if isinstance(scheduler, ReduceLROnPlateau): + scheduler.step(train_loss / (step + 1)) + else: + scheduler.step() + + # print(train_loss) + # print(step+1) + + pbar.set_postfix_str(f"Train Loss: {train_loss /(step+1)}") + pbar.update(1) + + global_step += 1 + + # Need to check if we want global_step or local_step + + if val_dataset is not None and (global_step - 1) % val_interval == 0: + # print("\nEvaluating\n") + val_scores = self.val( + model, + val_dataset, + criterion, + device, + global_step, + train_logger, + train_log_values, + ) + + #save_flag = 0 + if self.train_config.save_on is not None: + + ## BEST SCORES UPDATING + + train_scores = self.get_scores( + train_loss, + global_step, + self.train_config.criterion.type, + all_outputs, + all_labels, + ) + + best_score, best_step, save_flag = self.check_best( + val_scores, save_on_score, best_score, global_step + ) + + store_dict = { + "model_state_dict": model.state_dict(), + "best_step": best_step, + "best_score": best_score, + "save_on_score": save_on_score, + } + + path = self.train_config.save_on.best_path.format( + self.log_label + ) + + self.save(store_dict, path, save_flag) + + if save_flag and train_log_values["hparams"] is not None: + ( + best_hparam_list, + best_hparam_name_list, + best_metrics_list, + best_metrics_name_list, + ) = 
self.update_hparams( + train_scores, val_scores, desc="best_val" + ) + # pbar.close() + if (global_step - 1) % log_interval == 0: + # print("\nLogging\n") + train_loss_name = self.train_config.criterion.type + metric_list = [ + metric(all_labels.cpu(), all_outputs.detach().cpu(), **self.metrics[metric]) + for metric in self.metrics + ] + metric_name_list = [ + metric['type'] for metric in self._config.main_config.metrics + ] + + train_scores = self.log( + train_loss / (step + 1), + train_loss_name, + metric_list, + metric_name_list, + train_logger, + train_log_values, + global_step, + append_text=self.train_config.append_text, + ) + pbar.close() + if not os.path.exists(self.train_config.checkpoint.checkpoint_dir): + os.makedirs(self.train_config.checkpoint.checkpoint_dir) + + if self.train_config.save_after_epoch: + store_dict = { + "model_state_dict": model.state_dict(), + } + + path = f"{self.train_config.checkpoint.checkpoint_dir}_{str(self.train_config.log.log_label)}_{str(epoch)}.pth" + + self.save(store_dict, path, save_flag=1) + + if epoch == max_epochs: + # print("\nEvaluating\n") + val_scores = self.val( + model, + val_dataset, + criterion, + device, + global_step, + train_logger, + train_log_values, + ) + + # print("\nLogging\n") + train_loss_name = self.train_config.criterion.type + metric_list = [ + metric(all_labels.cpu(), all_outputs.detach().cpu(),**self.metrics[metric]) + for metric in self.metrics + ] + metric_name_list = [metric['type'] for metric in self._config.main_config.metrics] + + train_scores = self.log( + train_loss / len(train_loader), + train_loss_name, + metric_list, + metric_name_list, + train_logger, + train_log_values, + global_step, + append_text=self.train_config.append_text, + ) + + if self.train_config.save_on is not None: + + ## BEST SCORES UPDATING + + train_scores = self.get_scores( + train_loss, + len(train_loader), + self.train_config.criterion.type, + all_outputs, + all_labels, + ) + + best_score, best_step, save_flag = self.check_best( + val_scores, save_on_score, best_score, global_step + ) + + store_dict = { + "model_state_dict": model.state_dict(), + "best_step": best_step, + "best_score": best_score, + "save_on_score": save_on_score, + } + + path = self.train_config.save_on.best_path.format(self.log_label) + + self.save(store_dict, path, save_flag) + + if save_flag and train_log_values["hparams"] is not None: + ( + best_hparam_list, + best_hparam_name_list, + best_metrics_list, + best_metrics_name_list, + ) = self.update_hparams(train_scores, val_scores, desc="best_val") + + ## FINAL SCORES UPDATING + STORING + train_scores = self.get_scores( + train_loss, + len(train_loader), + self.train_config.criterion.type, + all_outputs, + all_labels, + ) + + store_dict = { + "model_state_dict": model.state_dict(), + "final_step": global_step, + "final_score": train_scores[save_on_score], + "save_on_score": save_on_score, + } + + path = self.train_config.save_on.final_path.format(self.log_label) + + self.save(store_dict, path, save_flag=1) + if train_log_values["hparams"] is not None: + ( + final_hparam_list, + final_hparam_name_list, + final_metrics_list, + final_metrics_name_list, + ) = self.update_hparams(train_scores, val_scores, desc="final") + train_logger.save_hyperparams( + best_hparam_list, + best_hparam_name_list, + [int(self.log_label),] + best_metrics_list + final_metrics_list, + ["hparams/log_label",] + + best_metrics_name_list + + final_metrics_name_list, + ) + # + + ## Need to check if we want same loggers of different loggers for train 
and eval + ## Evaluate + + def get_scores(self, loss, divisor, loss_name, all_outputs, all_labels): + + avg_loss = loss / divisor + + metric_list = [ + metric(all_labels.cpu(), all_outputs.detach().cpu(), **self.metrics[metric]) + for metric in self.metrics + ] + metric_name_list = [metric['type'] for metric in self._config.main_config.metrics] + + return dict(zip([loss_name,] + metric_name_list, [avg_loss,] + metric_list,)) + + def check_best(self, val_scores, save_on_score, best_score, global_step): + save_flag = 0 + best_step = global_step + if self.train_config.save_on.desired == "min": + if val_scores[save_on_score] < best_score: + save_flag = 1 + best_score = val_scores[save_on_score] + best_step = global_step + else: + if val_scores[save_on_score] > best_score: + save_flag = 1 + best_score = val_scores[save_on_score] + best_step = global_step + return best_score, best_step, save_flag + + def update_hparams(self, train_scores, val_scores, desc): + hparam_list = [] + hparam_name_list = [] + for hparam in self.train_config.log.values.hparams: + hparam_list.append(get_item_in_config(self._config, hparam["path"])) + if isinstance(hparam_list[-1], Config): + hparam_list[-1] = hparam_list[-1].as_dict() + hparam_name_list.append(hparam["name"]) + + val_keys, val_values = zip(*val_scores.items()) + train_keys, train_values = zip(*train_scores.items()) + val_keys = list(val_keys) + train_keys = list(train_keys) + val_values = list(val_values) + train_values = list(train_values) + for i, key in enumerate(val_keys): + val_keys[i] = f"hparams/{desc}_val_" + val_keys[i] + for i, key in enumerate(train_keys): + train_keys[i] = f"hparams/{desc}_train_" + train_keys[i] + # train_logger.save_hyperparams(hparam_list, hparam_name_list,train_values+val_values,train_keys+val_keys, ) + return ( + hparam_list, + hparam_name_list, + train_values + val_values, + train_keys + val_keys, + ) + + def save(self, store_dict, path, save_flag=0): + if save_flag: + dirs = "/".join(path.split("/")[:-1]) + if not os.path.exists(dirs): + os.makedirs(dirs) + torch.save(store_dict, path) + + def log( + self, + loss, + loss_name, + metric_list, + metric_name_list, + logger, + log_values, + global_step, + append_text, + ): + + return_dic = dict(zip([loss_name,] + metric_name_list, [loss,] + metric_list,)) + + loss_name = f"{append_text}_{self.log_label}_{loss_name}" + if log_values["loss"]: + logger.save_params( + [loss], + [loss_name], + combine=True, + combine_name="losses", + global_step=global_step, + ) + + for i in range(len(metric_name_list)): + metric_name_list[ + i + ] = f"{append_text}_{self.log_label}_{metric_name_list[i]}" + if log_values["metrics"]: + logger.save_params( + metric_list, + metric_name_list, + combine=True, + combine_name="metrics", + global_step=global_step, + ) + # print(hparams_list) + # print(hparam_name_list) + + # for k,v in dict(zip([loss_name],[loss])).items(): + # print(f"{k}:{v}") + # for k,v in dict(zip(metric_name_list,metric_list)).items(): + # print(f"{k}:{v}") + return return_dic + + def val( + self, + model, + dataset, + criterion, + device, + global_step, + train_logger=None, + train_log_values=None, + log=True, + ): + append_text = self.val_config.append_text + if train_logger is not None: + val_logger = train_logger + else: + val_logger = Logger(**self.val_config.log.logger_params.as_dict()) + + if train_log_values is not None: + val_log_values = train_log_values + else: + val_log_values = self.val_config.log.values.as_dict() + if "custom_collate_fn" in dir(dataset): + val_loader 
= DataLoader( + dataset=dataset, + collate_fn=dataset.custom_collate_fn, + **self.val_config.loader_params.as_dict(), + ) + else: + val_loader = DataLoader( + dataset=dataset, **self.val_config.loader_params.as_dict() + ) + + all_outputs = torch.Tensor().to(device) + if(self.train_config.label_type=='float'): + all_labels = torch.FloatTensor().to(device) + else: + all_labels = torch.LongTensor().to(device) + + batch_size = self.val_config.loader_params.batch_size + + with torch.no_grad(): + model.eval() + val_loss = 0 + for j, batch in enumerate(val_loader): + + inputs, labels = batch + + if(self.train_config.label_type=='float'): + labels = labels.float() + + for key in inputs: + inputs[key] = inputs[key].to(device) + labels = labels.to(device) + + outputs = model(inputs) + loss = criterion(torch.squeeze(outputs), labels) + val_loss += loss.item() + + all_labels = torch.cat((all_labels, labels), 0) + + if (self.train_config.label_type=='float'): + all_outputs = torch.cat((all_outputs, outputs), 0) + else: + all_outputs = torch.cat((all_outputs, torch.argmax(outputs, axis=1)), 0) + + val_loss = val_loss / len(val_loader) + + val_loss_name = self.train_config.criterion.type + + # print(all_outputs, all_labels) + metric_list = [ + metric(all_labels.cpu(), all_outputs.detach().cpu(), **self.metrics[metric]) + for metric in self.metrics + ] + metric_name_list = [metric['type'] for metric in self._config.main_config.metrics] + return_dic = dict( + zip([val_loss_name,] + metric_name_list, [val_loss,] + metric_list,) + ) + if log: + val_scores = self.log( + val_loss, + val_loss_name, + metric_list, + metric_name_list, + val_logger, + val_log_values, + global_step, + append_text, + ) + return val_scores + return return_dic diff --git a/src/utils/__init__.py b/src/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/utils/__pycache__/__init__.cpython-38.pyc b/src/utils/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5cf23705da07e60e9505befbf9d3e82ee8960aaf Binary files /dev/null and b/src/utils/__pycache__/__init__.cpython-38.pyc differ diff --git a/src/utils/__pycache__/configuration.cpython-38.pyc b/src/utils/__pycache__/configuration.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..83c333b3ea2455fc6e8f8736cff709eac97799e6 Binary files /dev/null and b/src/utils/__pycache__/configuration.cpython-38.pyc differ diff --git a/src/utils/__pycache__/mapper.cpython-38.pyc b/src/utils/__pycache__/mapper.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6ad78799089eba998401b9c2c9c78f678c355f0e Binary files /dev/null and b/src/utils/__pycache__/mapper.cpython-38.pyc differ diff --git a/src/utils/__pycache__/postprocess_predictions.cpython-38.pyc b/src/utils/__pycache__/postprocess_predictions.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e4dc51a4c1c3ab7fe4fa3dc9d0aadfde4d79a539 Binary files /dev/null and b/src/utils/__pycache__/postprocess_predictions.cpython-38.pyc differ diff --git a/src/utils/combine_preds.py b/src/utils/combine_preds.py new file mode 100644 index 0000000000000000000000000000000000000000..e7a910422f044e919baa2ff65561e7a5f047526d --- /dev/null +++ b/src/utils/combine_preds.py @@ -0,0 +1,60 @@ +import os +import argparse +from omegaconf import OmegaConf + + +def binary_intersection(lst1, lst2): + lst3 = list(set([value for value in lst1 
if value in lst2])) + return lst3 + + +def binary_union(lst1, lst2): + lst3 = list(set(lst1 + lst2)) + return lst3 + + +def combine(files, type="union"): + text = {} + if type == "union": + fn = binary_union + else: + fn = binary_intersection + for fil in files: + with open(fil, "r") as f: + for line in f: + line_split = line.split("\t") + if int(line_split[0]) in text: + text[int(line_split[0])] = fn( + text[int(line_split[0])], eval(line_split[1]) + ) + else: + text[int(line_split[0])] = eval(line_split[1]) + return text + + +def write_dict_to_file(text, path): + with open(path, "w") as f: + for id, spans in text.items(): + if id != len(text) - 1: + f.write(f"{id}\t{str(spans)}\n") + else: + f.write(f"{id}\t{str(spans)}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="combine_preds.py", description="Combine span predictions." + ) + parser.add_argument( + "--config", + type=str, + action="store", + help="The configuration for combining predictions.", + ) + args = parser.parse_args() + combine_config = OmegaConf.load(args.config) + text = combine(combine_config.files, combine_config.type) + dir = "/".join(combine_config.path.split("/")[:-1]) + if not os.path.exists(dir): + os.makedirs(dir) + write_dict_to_file(text, combine_config.path) diff --git a/src/utils/combine_preds_3cls.py b/src/utils/combine_preds_3cls.py new file mode 100644 index 0000000000000000000000000000000000000000..3d1d09458fc925f6812228dba03c303c20135e7c --- /dev/null +++ b/src/utils/combine_preds_3cls.py @@ -0,0 +1,82 @@ +import os +import argparse +from omegaconf import OmegaConf + + +def binary_intersection(lst1, lst2): + lst3 = list(set([value for value in lst1 if value in lst2])) + return lst3 + + +def binary_union(lst1, lst2): + lst3 = list(set(lst1 + lst2)) + return lst3 + + +def combine(files, type="union"): + text = {} + if type == "union": + fn = binary_union + else: + fn = binary_intersection + for fil in files: + with open(fil, "r") as f: + for line in f: + line_split = line.split("\t") + if int(line_split[0]) in text: + text[int(line_split[0])] = fn( + text[int(line_split[0])], eval(line_split[1]) + ) + else: + text[int(line_split[0])] = eval(line_split[1]) + return text + + +def combine_I(files, type="union"): + text = {} + if type == "union": + fn = binary_union + else: + fn = binary_intersection + for fil in files: + with open(fil, "r") as f: + for line in f: + line_split = line.split("\t") + if int(line_split[0]) in text: + text[int(line_split[0])] = fn( + text[int(line_split[0])], eval(line_split[2]) + ) + else: + text[int(line_split[0])] = eval(line_split[2]) + return text + + +def write_dict_to_file(text, text_I, path): + with open(path, "w") as f: + for id, spans in text.items(): + # if id != len(text) - 1: + if 1: + f.write(f"{id}\t{str(spans)}\t{str(text_I[id])}\n") + # else: + # f.write(f"{id}\t{str(spans)}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + prog="combine_preds.py", description="Combine span predictions." 
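+        # The OmegaConf YAML passed via --config is expected to define `files`
+        # (the prediction files to merge), `type` ("union", else intersection)
+        # and `path` (the output file).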
+ ) + parser.add_argument( + "--config", + type=str, + action="store", + help="The configuration for combining predictions.", + ) + args = parser.parse_args() + combine_config = OmegaConf.load(args.config) + text = combine(combine_config.files, combine_config.type) + text_I = combine_I(combine_config.files, combine_config.type) + + dir = "/".join(combine_config.path.split("/")[:-1]) + if not os.path.exists(dir): + os.makedirs(dir) + write_dict_to_file(text, text_I, combine_config.path) diff --git a/src/utils/configuration.py b/src/utils/configuration.py new file mode 100644 index 0000000000000000000000000000000000000000..7213e723414897f112a6dcbbc5a7500ea42d8797 --- /dev/null +++ b/src/utils/configuration.py @@ -0,0 +1,148 @@ +import yaml +import copy +from src.utils.mapper import configmapper + + +def load_yaml(path): + """ + Function to load a yaml file and + return the collected dict(s) + + Parameters + ---------- + path : str + The path to the yaml config file + + Returns + ------- + result : dict + The dictionary from the config file + """ + + assert isinstance(path, str), "Provided path is not a string" + try: + f = open(path, "r") + result = yaml.load(f, Loader=yaml.Loader) + except FileNotFoundError as e: + # Adding this for future functionality + raise e + return result + + +def convert_params_to_dict(params): + dic = {} + for k, v in params.as_dict(): + try: + obj = configmapper.get_object("params", v) + dic[k] = v + except: + print( + f"Undefined {v} for the given key: {k} in mapper ,storing original value" + ) + dic[k] = v + return value + + +class Config: + """Config Class to be used with YAML configuration files + + This class can be used to address keys as attributes. + Ensure that there are no spaces between the keys. + Only objects of type dict can be converted to config. + + Attributes + ---------- + _config : dict, + The dictionary which is formed from the + yaml file or custom dictionary + + Methods + ------- + as_dict(), + Return the config object as dictionary + + Possible update: + ## Can be converted using __getattr__ to use **kwargs + ## with the Config object directly. + + set_value(attr,value) + Set the value of a particular attribute. + """ + + def __init__(self, *, path=None, dic=None): + """ + Initializer for the Config class + + Needs either path or the dict object to create the config + + Parameters + ---------- + path: str, optional + The path to the config YAML file. + Default value is None. + dic : dict, optional + The dictionary containing the configuration. + Default value is None. + """ + if path: + self._config = load_yaml(path) + elif dict: + self._config = dic + else: + raise Exception("Need either path or dict object to instantiate object.") + # self.keys = self._config.keys() + + def __getattr__(self, attr): + """ + Get method for Config class. Helps get keys as attributes. + + Parameters + ---------- + attr: The attribute name passed as .attr + + Returns + ------- + self._config[attr]: object or Config object + The value of the given key if it exists. + If the value is a dict object, + a Config object of that dict is returned. + Otherwise, the exact value is returned. + + Raises + ------ + + KeyError() if the given key is not defined. + """ + if attr in self._config: + if isinstance(self._config[attr], dict): + return Config(dic=self._config[attr]) + else: + return self._config[attr] + else: + raise KeyError(f"Key:{attr} not defined.") + + def set_value(self, attr, value): + """ + Set method for Config class. Helps set keys in the _config. 
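+
+        For example, config.set_value("batch_size", 16) makes the value
+        available afterwards as config.batch_size.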
+ + Parameters + ---------- + attr: The attribute name passed as .attr + value: The value to be stored as the attr. + """ + self._config[attr] = value + + def __str__(self): + """Function to print the dictionary + contained in the object.""" + return self._config.__str__() + + def __repr__(self): + return f"Config(dic={self._config})" + + def __deepcopy__(self, memo): + return Config(dic=copy.deepcopy(self._config)) + + def as_dict(self): + """Function to get the config as dictionary object""" + return dict(self._config) diff --git a/src/utils/logger.py b/src/utils/logger.py new file mode 100644 index 0000000000000000000000000000000000000000..b03c303e2e4d701d27c924d6cc95ed8196c0f1b8 --- /dev/null +++ b/src/utils/logger.py @@ -0,0 +1,135 @@ +import os +import json +import torch +from torch.autograd import Variable +from torch.utils.tensorboard import SummaryWriter + +# from torchvision.utils import make_grid +# from torchviz import make_dot + + +class Logger: + """""" + + def __init__(self, model, trainer, log_dir, comment=None): + """Initializer for Logger Class + Args: + + """ + self.model_path = os.path.join(log_dir, model, trainer) + self.writer = SummaryWriter(log_dir=self.model_path, comment=comment) + try: + if not os.path.exists(log_dir): + os.makedirs(log_dir) + if not (os.path.exists(self.model_path)): + os.makedirs(self.model_path) + else: + pass + # print("Directory Already Exists.") + except Exception as e: + print(e) + print("Failed to Create Log Directory.") + + def save_params( + self, + param_list, + param_name_list, + epoch=None, + batch_size=None, + batch=None, + combine=False, + combine_name=None, + global_step=None, + ): + if combine == False: + for i in range(len(param_list)): + if isinstance(param_list[i], Variable): + param_list[i] = param_list[i].data.cpu().numpy() + + if global_step is None: + self.writer.add_scalar( + param_name_list[i], + param_list[i], + Logger._global_step(epoch, batch_size, batch), + ) + else: + self.writer.add_scalar( + param_name_list[i], param_list[i], global_step + ) + + else: + scalar_dict = dict(zip(param_name_list, param_list)) + if global_step is None: + self.writer.add_scalars( + combine_name, + scalar_dict, + Logger._global_step(epoch, batch_size, batch), + ) + else: + self.writer.add_scalars(combine_name, scalar_dict, global_step) + + def save_batch_images( + self, image_name, image_batch, epoch, batch_size, batch=None, dataformats="CHW" + ): + self.writer.add_images( + image_name, + image_batch, + Logger._global_step(epoch, batch_size, batch), + dataformats=dataformats, + ) + + def save_prcurve(self, labels, preds, epoch, batch_size, batch=None): + self.writer.add_pr_curve( + "pr_curve", labels, preds, Logger._global_step(epoch, batch_size, batch) + ) + + def save_hyperparams( + self, hparam_list, hparam_name_list, metric_list, metric_name_list + ): + + for i in range(len(hparam_list)): + if isinstance(hparam_list[i], list): + hparam_list[i] = ",".join(list(map(str, hparam_list[i]))) + if isinstance(hparam_list[i], dict): + hparam_list[i] = json.dumps(hparam_list[i]) + if hparam_list[i] is None: + hparam_list[i] = "None" + print(hparam_list, hparam_name_list, metric_list, metric_name_list) + self.writer.add_hparams( + dict(zip(hparam_name_list, hparam_list)), + dict(zip(metric_name_list, metric_list)), + ) + + def save_models(self, model_list, model_names_list, epoch): + for model_name, model in zip(model_names_list, model_list): + torch.save(model.state_dict(), os.path.join(self.model_path, model_name)) + + def save_fig(self, fig, 
fig_name, epoch, batch_size, batch=None): + self.writer.add_figure( + fig_name, fig, Logger._global_step(epoch, batch_size, batch) + ) + + # def display_params(self, + # params_list, params_name_list, epoch, num_epochs, batch_size, batch + # ): + # for i in range(len(params_list)): + # if isinstance(params_list[i], Variable): + # params_list[i] = params_list[i].data.cpu().numpy() + # print("Epoch: {}/{}, Batch: {}/{}".format(epoch, num_epochs, batch, batch_size)) + # for i in range(len(params_list)): + # print("{}:{}".format(params_name_list[i], params_list[i])) + # + # def draw_model_architecture(self,model, output, input, input_name, save_name): + # make_dot( + # output, params=dict(list(model.named_parameters())) + [(input_name, input)] + # ) + + def close(self): + self.writer.close() + + @staticmethod + def _global_step(epoch, batch_size, batch): + if batch: + return epoch * batch_size + batch + else: + return epoch diff --git a/src/utils/mapper.py b/src/utils/mapper.py new file mode 100644 index 0000000000000000000000000000000000000000..eebcebd3a17cba63df5c2889b1f6664c4732a791 --- /dev/null +++ b/src/utils/mapper.py @@ -0,0 +1,64 @@ +class ConfigMapper: + """Class for creating ConfigMapper objects. + + This class can be used to create custom configuration names using YAML files. + For each class or object instantiated in any modules, + the ConfigMapper object can be used either with the functions, + or as a decorator to store the mapping in the function. + + Attributes + ---------- + + Methods + ------- + + """ + + dicts = { + "models": {}, + "trainers": {}, + "metrics": {}, + "losses": {}, + "optimizers": {}, + "schedulers": {}, + "devices": {}, + "embeddings": {}, + "params": {}, + "datasets": {}, + "preprocessors": {}, + "tokenizers": {}, + } + + @classmethod + def map(cls, key, name): + """ + Map a particular name to an object, in the specified key + + Parameters + ---------- + name : str + The name of the object which will be used. + key : str + The key of the mapper to be used. + """ + + def wrap(obj): + if key in cls.dicts.keys(): + cls.dicts[key][name] = obj + else: + cls.dicts[key] = {} + cls.dicts[key][name] = obj + return obj + + return wrap + + @classmethod + def get_object(cls, key, name): + """""" + try: + return cls.dicts[key][name] + except: + raise NotImplementedError("Key:{name} Undefined".format(name=name)) + + +configmapper = ConfigMapper() diff --git a/src/utils/misc.py b/src/utils/misc.py new file mode 100644 index 0000000000000000000000000000000000000000..f80cfff5f1e8612ecfe54ac2da8da213d952c42f --- /dev/null +++ b/src/utils/misc.py @@ -0,0 +1,154 @@ +"""Miscellaneous utility functions.""" + +import random +import numpy as np +import torch +import copy +import itertools + + +def seed(value=42): + """Set random seed for everything. 
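+
+    Seeds random, numpy and torch, and puts cuDNN into deterministic,
+    non-benchmarking mode so that runs are reproducible.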
+ + Args: + value (int): Seed + """ + np.random.seed(value) + torch.manual_seed(value) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + random.seed(value) + + +def map_dict_to_obj(dic): + result_dic = {} + if dic is not None: + for k, v in dic.items(): + if isinstance(v, dict): + result_dic[k] = map_dict_to_obj(v) + else: + try: + obj = configmapper.get_object("params", v) + result_dic[k] = obj + except: + result_dic[k] = v + return result_dic + + +def get_item_in_config(config, path): + ## config is a dictionary + curr = config + if isinstance(config, dict): + for step in path: + curr = curr[step] + if curr is None: + break + else: + for step in path: + curr = curr.__getattr__(step) + if curr is None: + break + return curr + + +# init = train_config.grid_search +# curr = get_item_in_config(init,['hyperparams','loader_params']) +# curr.set_value('batch_size',1) +# print(train_config.grid_search) + + +def generate_grid_search_configs(main_config, grid_config, root="hyperparams"): + ## DFS + locations_values_pair = {} + init = grid_config.as_dict() + # print(init) + stack = [root] + visited = [stack[-1]] + + log_label_path = None + hparams_path = None + + # root = init[stack[-1]] + while len(stack) != 0: + root = get_item_in_config(init, stack) + flag = 0 + # print(visited) + # print(stack) + if ( + not isinstance(root, dict) and "hparams" not in stack + ): ## Meaning it is a leaf node + # print(stack) + if isinstance(root, list): + locations_values_pair[ + tuple(copy.deepcopy(stack)) + ] = root ## Append the current stack, and the list values + else: + locations_values_pair[tuple(copy.deepcopy(stack))] = [ + root, + ] ## Append the current stack, and the list values + + _ = stack.pop() ## Pop this root because we don't need it. 
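+            # Leaf value recorded above; backtrack and continue the DFS from the parent.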
+ else: + if isinstance(root, list) and "hparams" in stack: + hparams_path = copy.deepcopy(stack) + visited.append(".".join(stack)) + stack.pop() + continue + + if "log_label" in root.keys(): + log_label_path = copy.deepcopy( + stack + + [ + "log_label", + ] + ) + + if "log_label" in root.keys(): + log_label_path = copy.deepcopy( + stack + + [ + "log_label", + ] + ) + parent = root ## Otherwise it has children + + for key in parent.keys(): ## For the children + if ( + ".".join( + stack + + [ + key, + ] + ) + not in visited + ): ## Check if I have visited these children + flag = 1 ## If not, we need to repeat the process for this key + stack.append(key) ## Append this key to the stack + visited.append(".".join(stack)) + break + if flag == 0: + stack.pop() + + paths = list(locations_values_pair.keys()) + values = itertools.product(*list(locations_values_pair.values())) + + result_configs = [] + for value in values: + for item_index in range(len(value)): + curr_path = paths[item_index] + curr_item = value[item_index] + + curr_config_item = get_item_in_config(main_config, curr_path[1:-1]) + curr_config_item.set_value(curr_path[-1], curr_item) + + log_item = get_item_in_config(main_config, log_label_path[1:-1]) + log_item.set_value(log_label_path[-1], str(len(result_configs) + 1)) + + hparam_item = get_item_in_config(main_config, hparams_path[1:-1]) + hparam_item.set_value( + hparams_path[-1], + get_item_in_config(grid_config.hyperparams, hparams_path[1:]), + ) + + result_configs.append(copy.deepcopy(main_config)) + return result_configs diff --git a/src/utils/postprocess_predictions.py b/src/utils/postprocess_predictions.py new file mode 100644 index 0000000000000000000000000000000000000000..fec93fb1581116b2f8979bb3e4cfbec30d7dbf20 --- /dev/null +++ b/src/utils/postprocess_predictions.py @@ -0,0 +1,230 @@ +import collections +import numpy as np +from tqdm.auto import tqdm + + +def postprocess_token_span_predictions( + features, + examples, + raw_predictions, + tokenizer, + n_best_size=20, + max_answer_length=30, + squad_v2=False, +): + all_start_logits, all_end_logits, token_logits = raw_predictions + # Build a map example to its corresponding features. + example_id_to_index = {k: i for i, k in enumerate(list(examples["id"]))} + features_per_example = collections.defaultdict(list) + for i, feature in enumerate(features): + features_per_example[example_id_to_index[feature["example_id"]]].append(i) + + # The dictionaries we have to fill. + predictions = collections.OrderedDict() + + # Logging. + print( + f"Post-processing {len(examples)} example predictions split into {len(features)} features." + ) + + # Let's loop over all the examples! + for example_index in tqdm(range(len(examples))): + # Those are the indices of the features associated to the current example. + feature_indices = features_per_example[example_index] + + min_null_score = None # Only used if squad_v2 is True. + valid_answers = [] + + context = examples[example_index]["context"] + # Looping through all the features associated to the current example. + for feature_index in feature_indices: + # We grab the predictions of the model for this feature. + start_logits = all_start_logits[feature_index] + end_logits = all_end_logits[feature_index] + # This is what will allow us to map some the positions in our logits to span of texts in the original + # context. + offset_mapping = features[feature_index]["offset_mapping"] + + # Update minimum null prediction. 
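+            # The CLS-token start+end score acts as the "no span" baseline that
+            # candidate answers are compared against below.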
+ cls_index = features[feature_index]["input_ids"].index( + tokenizer.cls_token_id + ) + feature_null_score = start_logits[cls_index] + end_logits[cls_index] + if min_null_score is None or min_null_score < feature_null_score: + min_null_score = feature_null_score + + # Go through all possibilities for the `n_best_size` greater start and end logits. + start_indexes = np.argsort(start_logits)[ + -1 : -n_best_size - 1 : -1 + ].tolist() + end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist() + for start_index in start_indexes: + for end_index in end_indexes: + # Don't consider out-of-scope answers, either because the indices are out of bounds or correspond + # to part of the input_ids that are not in the context. + if ( + start_index >= len(offset_mapping) + or end_index >= len(offset_mapping) + or offset_mapping[start_index] is None + or offset_mapping[end_index] is None + ): + continue + # Don't consider answers with a length that is either < 0 or > max_answer_length. + if ( + end_index < start_index + or end_index - start_index + 1 > max_answer_length + ): + continue + + start_char = offset_mapping[start_index][0] + end_char = offset_mapping[end_index][1] + valid_answers.append( + { + "qa_score": ( + start_logits[start_index] + end_logits[end_index] + ) + / 2, + "token_score": np.mean( + [ + token_logits[example_index][token_index][1] + for token_index in range(start_index, end_index + 1) + ] + ), + "score": (start_logits[start_index] + end_logits[end_index]) + / 2 + + np.mean( + [ + token_logits[example_index][token_index][1] + for token_index in range(start_index, end_index + 1) + ] + ), + "text": context[start_char:end_char], + "start": start_char, + "end": end_char, + } + ) + + if len(valid_answers) > 0: + sorted_answers = sorted( + valid_answers, key=lambda x: x["score"], reverse=True + ) + else: + # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid + # failure. + sorted_answers = [{"text": "", "score": 0.0, "start": None, "end": None}] + # Let's pick our final answer: the best one or the null answer (only for squad_v2) + if sorted_answers[0]["score"] <= min_null_score: + sorted_answers = [ + {"text": "", "score": min_null_score, "start": None, "end": None}, + ] + sorted_answers + predictions[examples[example_index]["id"]] = sorted_answers + + return predictions + + +def postprocess_multi_span_predictions( + features, + examples, + raw_predictions, + tokenizer, + n_best_size=20, + max_answer_length=30, + squad_v2=False, +): + + all_start_logits, all_end_logits = raw_predictions + # Build a map example to its corresponding features. + example_id_to_index = {k: i for i, k in enumerate(list(examples["id"]))} + features_per_example = collections.defaultdict(list) + for i, feature in enumerate(features): + features_per_example[example_id_to_index[feature["example_id"]]].append(i) + + # The dictionaries we have to fill. + predictions = collections.OrderedDict() + + # Logging. + print( + f"Post-processing {len(examples)} example predictions split into {len(features)} features." + ) + + # Let's loop over all the examples! + for example_index in tqdm(range(len(examples))): + # Those are the indices of the features associated to the current example. + feature_indices = features_per_example[example_index] + + min_null_score = None # Only used if squad_v2 is True. + valid_answers = [] + + context = examples[example_index]["context"] + # Looping through all the features associated to the current example. 
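+        # Unlike postprocess_token_span_predictions above, this variant scores candidate
+        # spans with the raw start + end logits only; no token-level toxicity term is
+        # mixed into the ranking score.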
+ for feature_index in feature_indices: + # We grab the predictions of the model for this feature. + start_logits = all_start_logits[feature_index] + end_logits = all_end_logits[feature_index] + # This is what will allow us to map some the positions + # in our logits to span of texts in the original context. + offset_mapping = features[feature_index]["offset_mapping"] + + # Update minimum null prediction. + cls_index = features[feature_index]["input_ids"].index( + tokenizer.cls_token_id + ) + feature_null_score = start_logits[cls_index] + end_logits[cls_index] + if min_null_score is None or min_null_score < feature_null_score: + min_null_score = feature_null_score + + # Go through all possibilities for the `n_best_size` greater start and end logits. + start_indexes = np.argsort(start_logits)[ + -1 : -n_best_size - 1 : -1 + ].tolist() + end_indexes = np.argsort(end_logits)[-1 : -n_best_size - 1 : -1].tolist() + for start_index in start_indexes: + for end_index in end_indexes: + # Don't consider out-of-scope answers, + # either because the indices are out of bounds or correspond + # to part of the input_ids that are not in the context. + if ( + start_index >= len(offset_mapping) + or end_index >= len(offset_mapping) + or offset_mapping[start_index] is None + or offset_mapping[end_index] is None + ): + continue + # Don't consider answers with a length that + # is either < 0 or > max_answer_length. + if ( + end_index < start_index + or end_index - start_index + 1 > max_answer_length + ): + continue + + start_char = offset_mapping[start_index][0] + end_char = offset_mapping[end_index][1] + valid_answers.append( + { + "score": start_logits[start_index] + end_logits[end_index], + "text": context[start_char:end_char], + "start": start_char, + "end": end_char, + } + ) + + if len(valid_answers) > 0: + sorted_answers = sorted( + valid_answers, key=lambda x: x["score"], reverse=True + ) + else: + # In the very rare edge case we have not a single non-null prediction, + # we create a fake prediction to avoid failure. + sorted_answers = [{"text": "", "score": 0.0, "start": None, "end": None}] + + # Let's pick our final answer: the best one or the null answer (only for squad_v2) + + if sorted_answers[0]["score"] <= min_null_score: + sorted_answers = [ + {"text": "", "score": min_null_score, "start": None, "end": None}, + ] + sorted_answers + + predictions[examples[example_index]["id"]] = sorted_answers + + return predictions \ No newline at end of file diff --git a/src/utils/viz.py b/src/utils/viz.py new file mode 100644 index 0000000000000000000000000000000000000000..c0857de2ff5634c489db2285806ee560c98fcec2 --- /dev/null +++ b/src/utils/viz.py @@ -0,0 +1,102 @@ +"""Visualization utils.""" + +import numpy as np +from IPython.core.display import HTML, display + + +def _get_color(attr): + # clip values to prevent CSS errors (Values should be from [-1,1]) + attr = max(-1, min(1, attr)) + if attr > 0: + hue = 220 + sat = 100 + lig = 100 - int(90 * attr) + else: + hue = 220 + sat = 100 + lig = 100 - int(-125 * attr) + return "hsl({}, {}%, {}%)".format(hue, sat, lig) + + +def format_special_tokens(token): + """Convert <> to # if there are any HTML syntax tags. + + Example: '' will be converted to '#Hello' to avoid confusion + with HTML tags. + + Args: + token (str): The token to be formatted. + Returns: + (str): The formatted token. 
+ """ + if token.startswith("<") and token.endswith(">"): + return "#" + token.strip("<>") + return token + + +def format_word_importances( + words, + importances, + ground_text_spans, + predicted_text_spans, +): + if np.isnan(importances[0]): + importances = np.zeros_like(importances) + + assert len(words) <= len(importances) + tags = ["
Text: "] + + for word_index, (word, importance) in enumerate( + zip(words, importances[: len(words)]) + ): + word = format_special_tokens(word) + for character in word: ## Printing Weird Words + if ord(character) >= 128: + print(word) + break + color = _get_color(importance) + + unwrapped_tag = f' {word}\ + ' + tags.append(unwrapped_tag) + tags.append("
") + tags.append("
Ground Spans: [ ") + for i, span in enumerate(ground_text_spans): + if i != len(ground_text_spans) - 1: + tags.append(f"'{span}',") + else: + tags.append(f"'{span}'") + tags.append(" ]") + tags.append("
Predicted Spans: [ ") + for i, span in enumerate(predicted_text_spans): + if i != len(predicted_text_spans) - 1: + tags.append(f"'{span}',") + else: + tags.append(f"'{span}'") + tags.append(" ]") + return HTML("".join(tags)) + + +def format_word_colors(words, colors): + assert len(words) == len(colors) + tags = ["
"] + for word, color in zip(words, colors): + word = format_special_tokens(word) + unwrapped_tag = ' {word}\ + '.format( + color=color, word=word + ) + tags.append(unwrapped_tag) + tags.append("
") + return HTML("".join(tags)) + + +def display_html(html): + display(html) + + +def save_to_file(html, path): + with open(path, "w") as f: + f.write(html.data)