Upload 61 files
#8 by shayekh - opened
This view is limited to 50 files because it contains too many changes. See the raw diff here.
- src/datasets/__init__.py +7 -0
- src/datasets/__pycache__/__init__.cpython-38.pyc +0 -0
- src/datasets/__pycache__/toxic_spans_crf_3cls_tokens.cpython-38.pyc +0 -0
- src/datasets/__pycache__/toxic_spans_crf_tokens.cpython-38.pyc +0 -0
- src/datasets/__pycache__/toxic_spans_multi_spans.cpython-38.pyc +0 -0
- src/datasets/__pycache__/toxic_spans_spans.cpython-38.pyc +0 -0
- src/datasets/__pycache__/toxic_spans_tokens.cpython-38.pyc +0 -0
- src/datasets/__pycache__/toxic_spans_tokens_3cls.cpython-38.pyc +0 -0
- src/datasets/__pycache__/toxic_spans_tokens_spans.cpython-38.pyc +0 -0
- src/datasets/toxic_spans_crf_3cls_tokens.py +132 -0
- src/datasets/toxic_spans_crf_tokens.py +111 -0
- src/datasets/toxic_spans_multi_spans.py +237 -0
- src/datasets/toxic_spans_spans.py +238 -0
- src/datasets/toxic_spans_tokens.py +81 -0
- src/datasets/toxic_spans_tokens_3cls.py +102 -0
- src/datasets/toxic_spans_tokens_spans.py +269 -0
- src/models/__init__.py +7 -0
- src/models/__pycache__/__init__.cpython-38.pyc +0 -0
- src/models/__pycache__/auto_models.cpython-38.pyc +0 -0
- src/models/__pycache__/bert_crf_token.cpython-38.pyc +0 -0
- src/models/__pycache__/bert_multi_spans.cpython-38.pyc +0 -0
- src/models/__pycache__/bert_token_spans.cpython-38.pyc +0 -0
- src/models/__pycache__/roberta_crf_token.cpython-38.pyc +0 -0
- src/models/__pycache__/roberta_multi_spans.cpython-38.pyc +0 -0
- src/models/__pycache__/roberta_token_spans.cpython-38.pyc +0 -0
- src/models/auto_models.py +6 -0
- src/models/bert_crf_token.py +72 -0
- src/models/bert_multi_spans.py +84 -0
- src/models/bert_token_spans.py +100 -0
- src/models/roberta_crf_token.py +66 -0
- src/models/roberta_multi_spans.py +82 -0
- src/models/roberta_token_spans.py +97 -0
- src/models/two_layer_nn.py +46 -0
- src/modules/__init__.py +0 -0
- src/modules/__pycache__/__init__.cpython-38.pyc +0 -0
- src/modules/__pycache__/embeddings.cpython-38.pyc +0 -0
- src/modules/__pycache__/preprocessors.cpython-38.pyc +0 -0
- src/modules/__pycache__/tokenizers.cpython-38.pyc +0 -0
- src/modules/activations.py +6 -0
- src/modules/embeddings.py +37 -0
- src/modules/losses.py +6 -0
- src/modules/metrics.py +17 -0
- src/modules/optimizers.py +7 -0
- src/modules/preprocessors.py +112 -0
- src/modules/schedulers.py +14 -0
- src/modules/tokenizers.py +107 -0
- src/trainers/__init__.py +0 -0
- src/trainers/base_trainer.py +563 -0
- src/utils/__init__.py +0 -0
- src/utils/__pycache__/__init__.cpython-38.pyc +0 -0
src/datasets/__init__.py
ADDED
@@ -0,0 +1,7 @@
from src.datasets.toxic_spans_tokens import *
from src.datasets.toxic_spans_tokens_3cls import *
from src.datasets.toxic_spans_spans import *
from src.datasets.toxic_spans_tokens_spans import *
from src.datasets.toxic_spans_multi_spans import *
from src.datasets.toxic_spans_crf_tokens import *
from src.datasets.toxic_spans_crf_3cls_tokens import *
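These star-imports exist so that importing src.datasets runs every @configmapper.map("datasets", ...) decorator in the modules below and registers each dataset class under the name used in the experiment configs. The following is a minimal, self-contained sketch of that decorator-registry pattern, not the project's actual configmapper implementation; any method beyond map (such as get) is an assumption for illustration.

# Minimal sketch of a decorator-based registry (illustrative stand-in only).
class ConfigMapper:
    def __init__(self):
        self._store = {}

    def map(self, kind, name):
        # Register the decorated object under (kind, name) and return it unchanged.
        def decorator(obj):
            self._store.setdefault(kind, {})[name] = obj
            return obj
        return decorator

    def get(self, kind, name):  # assumed accessor, for demonstration
        return self._store[kind][name]

configmapper = ConfigMapper()

@configmapper.map("datasets", "toxic_spans_tokens")
class DummyDataset:
    pass

assert configmapper.get("datasets", "toxic_spans_tokens") is DummyDataset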
src/datasets/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (503 Bytes).
src/datasets/__pycache__/toxic_spans_crf_3cls_tokens.cpython-38.pyc
ADDED
Binary file (2.99 kB).
src/datasets/__pycache__/toxic_spans_crf_tokens.cpython-38.pyc
ADDED
Binary file (2.76 kB).
src/datasets/__pycache__/toxic_spans_multi_spans.cpython-38.pyc
ADDED
Binary file (5.55 kB).
src/datasets/__pycache__/toxic_spans_spans.cpython-38.pyc
ADDED
Binary file (5.2 kB).
src/datasets/__pycache__/toxic_spans_tokens.cpython-38.pyc
ADDED
Binary file (2.35 kB).
src/datasets/__pycache__/toxic_spans_tokens_3cls.cpython-38.pyc
ADDED
Binary file (2.59 kB).
src/datasets/__pycache__/toxic_spans_tokens_spans.cpython-38.pyc
ADDED
Binary file (5.97 kB).
src/datasets/toxic_spans_crf_3cls_tokens.py
ADDED
@@ -0,0 +1,132 @@
from src.utils.mapper import configmapper
from transformers import AutoTokenizer
from datasets import load_dataset
import numpy as np


@configmapper.map("datasets", "toxic_spans_crf_3cls_tokens")
class ToxicSpansCRF3ClsTokenDataset:
    def __init__(self, config):
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.config.model_checkpoint_name
        )
        self.dataset = load_dataset("csv", data_files=dict(self.config.train_files))
        self.test_dataset = load_dataset("csv", data_files=dict(self.config.eval_files))

        self.tokenized_inputs = self.dataset.map(
            self.tokenize_and_align_labels_for_train, batched=True
        )
        self.test_tokenized_inputs = self.test_dataset.map(
            self.tokenize_for_test, batched=True
        )

    def tokenize_and_align_labels_for_train(self, examples):
        tokenized_inputs = self.tokenizer(
            examples["text"], **self.config.tokenizer_params
        )

        # tokenized_inputs["text"] = examples["text"]
        example_spans = []
        labels = []
        prediction_mask = np.zeros_like(np.array(tokenized_inputs["input_ids"]))
        offsets_mapping = tokenized_inputs["offset_mapping"]

        ## Wrong Code
        # for i, offset_mapping in enumerate(offsets_mapping):
        #     j = 0
        #     while j < len(offset_mapping):  # [tok1, tok2, tok3] [(0,5),(1,4),(5,7)]
        #         if tokenized_inputs["input_ids"][i][j] in [
        #             self.tokenizer.sep_token_id,
        #             self.tokenizer.pad_token_id,
        #             self.tokenizer.cls_token_id,
        #         ]:
        #             j = j + 1
        #             continue
        #         else:
        #             k = j + 1
        #             while self.tokenizer.convert_ids_to_tokens(
        #                 tokenized_inputs["input_ids"][i][k]
        #             ).startswith("##"):
        #                 offset_mapping[i][j][1] = offset_mapping[i][k][1]
        #             j = k

        for i, offset_mapping in enumerate(offsets_mapping):
            labels.append([])

            spans = eval(examples["spans"][i])
            Bs = eval(examples["Bs"][i])
            Is = eval(examples["Is"][i])

            example_spans.append(spans)
            # cls_label = 2 ## DUMMY LABEL
            cls_label = 3  ## DUMMY LABEL
            for j, offsets in enumerate(offset_mapping):
                if tokenized_inputs["input_ids"][i][j] in [
                    self.tokenizer.sep_token_id,
                    self.tokenizer.pad_token_id,
                ]:
                    tokenized_inputs["attention_mask"][i][j] = 0

                if tokenized_inputs["input_ids"][i][j] == self.tokenizer.cls_token_id:
                    labels[-1].append(cls_label)
                    prediction_mask[i][j] = 1

                elif offsets[0] == offsets[1] and offsets[0] == 0:
                    # labels[-1].append(2) ## DUMMY
                    labels[-1].append(cls_label)  ## DUMMY

                else:
                    # toxic_offsets = [x in spans for x in range(offsets[0], offsets[1])]
                    # ## If any part of the the token is in span, mark it as Toxic
                    # if (
                    #     len(toxic_offsets) > 0
                    #     and sum(toxic_offsets) / len(toxic_offsets) > 0.0
                    # ):
                    #     labels[-1].append(1)
                    # else:
                    #     labels[-1].append(0)
                    # prediction_mask[i][j] = 1

                    b_off = [x in Bs for x in range(offsets[0], offsets[1])]
                    b_off = sum(b_off)
                    i_off = [x in Is for x in range(offsets[0], offsets[1])]
                    i_off = sum(i_off)
                    # if len(b_off) == len(i_off) and len(i_off) == 0:
                    if b_off == 0 and i_off == 0:
                        labels[-1].append(0)
                    # elif len(b_off) >= len(i_off) == 1:
                    elif b_off >= i_off:
                        labels[-1].append(1)
                        # print(b_off)
                        # print(i_off)
                        # print(j)
                    else:
                        labels[-1].append(2)

        tokenized_inputs["labels"] = labels
        tokenized_inputs["prediction_mask"] = prediction_mask
        return tokenized_inputs

    def tokenize_for_test(self, examples):
        tokenized_inputs = self.tokenizer(
            examples["text"], **self.config.tokenizer_params
        )
        prediction_mask = np.zeros_like(np.array(tokenized_inputs["input_ids"]))
        labels = np.zeros_like(np.array(tokenized_inputs["input_ids"]))

        offsets_mapping = tokenized_inputs["offset_mapping"]

        for i, offset_mapping in enumerate(offsets_mapping):
            for j, offsets in enumerate(offset_mapping):
                if tokenized_inputs["input_ids"][i][j] in [
                    self.tokenizer.sep_token_id,
                    self.tokenizer.pad_token_id,
                ]:
                    tokenized_inputs["attention_mask"][i][j] = 0
                else:
                    prediction_mask[i][j] = 1

        tokenized_inputs["prediction_mask"] = prediction_mask
        tokenized_inputs["labels"] = labels
        return tokenized_inputs
src/datasets/toxic_spans_crf_tokens.py
ADDED
@@ -0,0 +1,111 @@
from src.utils.mapper import configmapper
from transformers import AutoTokenizer
from datasets import load_dataset
import numpy as np


@configmapper.map("datasets", "toxic_spans_crf_tokens")
class ToxicSpansCRFTokenDataset:
    def __init__(self, config):
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.config.model_checkpoint_name
        )
        self.dataset = load_dataset("csv", data_files=dict(self.config.train_files))
        self.test_dataset = load_dataset("csv", data_files=dict(self.config.eval_files))

        self.tokenized_inputs = self.dataset.map(
            self.tokenize_and_align_labels_for_train, batched=True
        )
        self.test_tokenized_inputs = self.test_dataset.map(
            self.tokenize_for_test, batched=True
        )

    def tokenize_and_align_labels_for_train(self, examples):
        tokenized_inputs = self.tokenizer(
            examples["text"], **self.config.tokenizer_params
        )

        # tokenized_inputs["text"] = examples["text"]
        example_spans = []
        labels = []
        prediction_mask = np.zeros_like(np.array(tokenized_inputs["input_ids"]))
        offsets_mapping = tokenized_inputs["offset_mapping"]

        ## Wrong Code
        # for i, offset_mapping in enumerate(offsets_mapping):
        #     j = 0
        #     while j < len(offset_mapping):  # [tok1, tok2, tok3] [(0,5),(1,4),(5,7)]
        #         if tokenized_inputs["input_ids"][i][j] in [
        #             self.tokenizer.sep_token_id,
        #             self.tokenizer.pad_token_id,
        #             self.tokenizer.cls_token_id,
        #         ]:
        #             j = j + 1
        #             continue
        #         else:
        #             k = j + 1
        #             while self.tokenizer.convert_ids_to_tokens(
        #                 tokenized_inputs["input_ids"][i][k]
        #             ).startswith("##"):
        #                 offset_mapping[i][j][1] = offset_mapping[i][k][1]
        #             j = k

        for i, offset_mapping in enumerate(offsets_mapping):
            labels.append([])

            spans = eval(examples["spans"][i])
            example_spans.append(spans)
            cls_label = 2  ## DUMMY LABEL
            for j, offsets in enumerate(offset_mapping):
                if tokenized_inputs["input_ids"][i][j] in [
                    self.tokenizer.sep_token_id,
                    self.tokenizer.pad_token_id,
                ]:
                    tokenized_inputs["attention_mask"][i][j] = 0

                if tokenized_inputs["input_ids"][i][j] == self.tokenizer.cls_token_id:
                    labels[-1].append(cls_label)
                    prediction_mask[i][j] = 1

                elif offsets[0] == offsets[1] and offsets[0] == 0:
                    labels[-1].append(2)  ## DUMMY

                else:
                    toxic_offsets = [x in spans for x in range(offsets[0], offsets[1])]
                    ## If any part of the the token is in span, mark it as Toxic
                    if (
                        len(toxic_offsets) > 0
                        and sum(toxic_offsets) / len(toxic_offsets) > 0.0
                    ):
                        labels[-1].append(1)
                    else:
                        labels[-1].append(0)
                    prediction_mask[i][j] = 1

        tokenized_inputs["labels"] = labels
        tokenized_inputs["prediction_mask"] = prediction_mask
        return tokenized_inputs

    def tokenize_for_test(self, examples):
        tokenized_inputs = self.tokenizer(
            examples["text"], **self.config.tokenizer_params
        )
        prediction_mask = np.zeros_like(np.array(tokenized_inputs["input_ids"]))
        labels = np.zeros_like(np.array(tokenized_inputs["input_ids"]))

        offsets_mapping = tokenized_inputs["offset_mapping"]

        for i, offset_mapping in enumerate(offsets_mapping):
            for j, offsets in enumerate(offset_mapping):
                if tokenized_inputs["input_ids"][i][j] in [
                    self.tokenizer.sep_token_id,
                    self.tokenizer.pad_token_id,
                ]:
                    tokenized_inputs["attention_mask"][i][j] = 0
                else:
                    prediction_mask[i][j] = 1

        tokenized_inputs["prediction_mask"] = prediction_mask
        tokenized_inputs["labels"] = labels
        return tokenized_inputs
src/datasets/toxic_spans_multi_spans.py
ADDED
@@ -0,0 +1,237 @@
from src.utils.mapper import configmapper
from transformers import AutoTokenizer
import pandas as pd
from datasets import load_dataset, Dataset
from evaluation.fix_spans import _contiguous_ranges


@configmapper.map("datasets", "toxic_spans_multi_spans")
class ToxicSpansMultiSpansDataset:
    def __init__(self, config):
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.config.model_checkpoint_name
        )

        self.dataset = load_dataset("csv", data_files=dict(self.config.train_files))
        self.test_dataset = load_dataset("csv", data_files=dict(self.config.eval_files))

        temp_key_train = list(self.dataset.keys())[0]
        self.intermediate_dataset = self.dataset.map(
            self.create_train_features,
            batched=True,
            batch_size=1000000,  ## Unusually Large Batch Size ## Needed For Correct ID mapping
            remove_columns=self.dataset[temp_key_train].column_names,
        )

        temp_key_test = list(self.test_dataset.keys())[0]
        self.intermediate_test_dataset = self.test_dataset.map(
            self.create_test_features,
            batched=True,
            batch_size=1000000,  ## Unusually Large Batch Size ## Needed For Correct ID mapping
            remove_columns=self.test_dataset[temp_key_test].column_names,
        )

        self.tokenized_inputs = self.intermediate_dataset.map(
            self.prepare_train_features,
            batched=True,
            remove_columns=self.intermediate_dataset[temp_key_train].column_names,
        )
        self.test_tokenized_inputs = self.intermediate_test_dataset.map(
            self.prepare_test_features,
            batched=True,
            remove_columns=self.intermediate_test_dataset[temp_key_test].column_names,
        )

    def create_train_features(self, examples):
        features = {
            "context": [],
            "id": [],
            "question": [],
            "title": [],
            "start_positions": [],
            "end_positions": [],
        }
        id = 0
        # print(examples)
        for row_number in range(len(examples["text"])):
            context = examples["text"][row_number]
            question = "offense"
            title = context.split(" ")[0]
            start_positions = []
            end_positions = []
            span = eval(examples["spans"][row_number])
            contiguous_spans = _contiguous_ranges(span)
            for lst in contiguous_spans:
                lst = list(lst)
                dict_to_write = {}

                start_positions.append(lst[0])
                end_positions.append(lst[1])

            features["context"].append(context)
            features["id"].append(str(id))
            features["question"].append(question)
            features["title"].append(title)
            features["start_positions"].append(start_positions)
            features["end_positions"].append(end_positions)
            id += 1

        return features

    def create_test_features(self, examples):
        features = {"context": [], "id": [], "question": [], "title": []}
        id = 0
        for row_number in range(len(examples["text"])):
            context = examples["text"][row_number]
            question = "offense"
            title = context.split(" ")[0]
            features["context"].append(context)
            features["id"].append(str(id))
            features["question"].append(question)
            features["title"].append(title)
            id += 1
        return features

    def prepare_train_features(self, examples):
        """Generate tokenized features from examples.

        Args:
            examples (dict): The examples to be tokenized.

        Returns:
            transformers.tokenization_utils_base.BatchEncoding:
                The tokenized features/examples after processing.
        """
        # Tokenize our examples with truncation and padding, but keep the
        # overflows using a stride. This results in one example possible
        # giving several features when a context is long, each of those
        # features having a context that overlaps a bit the context
        # of the previous feature.
        pad_on_right = self.tokenizer.padding_side == "right"
        print("### Batch Tokenizing Examples ###")
        tokenized_examples = self.tokenizer(
            examples["question" if pad_on_right else "context"],
            examples["context" if pad_on_right else "question"],
            **dict(self.config.tokenizer_params),
        )

        # Since one example might give us several features if it has
        # a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
        # The offset mappings will give us a map from token to
        # character position in the original context. This will
        # help us compute the start_positions and end_positions.
        offset_mapping = tokenized_examples.pop("offset_mapping")

        # Let's label those examples!
        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []

        for i, offsets in enumerate(offset_mapping):
            # We will label impossible answers with the index of the CLS token.
            input_ids = tokenized_examples["input_ids"][i]

            # Grab the sequence corresponding to that example
            # (to know what is the context and what is the question).
            sequence_ids = tokenized_examples.sequence_ids(i)

            # One example can give several spans, this is the index of
            # the example containing this span of text.
            sample_index = sample_mapping[i]
            start_positions = examples["start_positions"][sample_index]
            end_positions = examples["end_positions"][sample_index]

            start_positions_token_wise = [0 for x in range(len(input_ids))]
            end_positions_token_wise = [0 for x in range(len(input_ids))]
            # If no answers are given, set the cls_index as answer.
            if len(start_positions) != 0:
                for position in range(len(start_positions)):
                    start_char = start_positions[position]
                    end_char = end_positions[position] + 1

                    # Start token index of the current span in the text.
                    token_start_index = 0
                    while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                        token_start_index += 1

                    # End token index of the current span in the text.
                    token_end_index = len(input_ids) - 1
                    while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                        token_end_index -= 1

                    # Detect if the answer is out of the span (in which case we continue).
                    if not (
                        offsets[token_start_index][0] <= start_char
                        and offsets[token_end_index][1] >= end_char
                    ):
                        continue
                    else:
                        # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                        # Note: we could go after the last offset if the answer is the last word (edge case).
                        while (
                            token_start_index < len(offsets)
                            and offsets[token_start_index][0] <= start_char
                        ):
                            token_start_index += 1
                        start_positions_token_wise[token_start_index - 1] = 1
                        while offsets[token_end_index][1] >= end_char:
                            token_end_index -= 1
                        end_positions_token_wise[token_end_index + 1] = 1
            tokenized_examples["start_positions"].append(start_positions_token_wise)
            tokenized_examples["end_positions"].append(end_positions_token_wise)
        return tokenized_examples

    def prepare_test_features(self, examples):

        """Generate tokenized validation features from examples.

        Args:
            examples (dict): The validation examples to be tokenized.

        Returns:
            transformers.tokenization_utils_base.BatchEncoding:
                The tokenized features/examples for validation set after processing.
        """

        # Tokenize our examples with truncation and maybe
        # padding, but keep the overflows using a stride.
        # This results in one example possible giving several features
        # when a context is long, each of those features having a
        # context that overlaps a bit the context of the previous feature.
        print("### Tokenizing Validation Examples")
        pad_on_right = self.tokenizer.padding_side == "right"
        tokenized_examples = self.tokenizer(
            examples["question" if pad_on_right else "context"],
            examples["context" if pad_on_right else "question"],
            **dict(self.config.tokenizer_params),
        )

        # Since one example might give us several features if it has a long context,
        # we need a map from a feature to its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        # We keep the example_id that gave us this feature and we will store the offset mappings.
        tokenized_examples["example_id"] = []

        for i in range(len(tokenized_examples["input_ids"])):
            # Grab the sequence corresponding to that example
            # (to know what is the context and what is the question).
            sequence_ids = tokenized_examples.sequence_ids(i)
            context_index = 1 if pad_on_right else 0

            # One example can give several spans,
            # this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            tokenized_examples["example_id"].append(str(examples["id"][sample_index]))

            # Set to None the offset_mapping that are not part
            # of the context so it's easy to determine if a token
            # position is part of the context or not.
            tokenized_examples["offset_mapping"][i] = [
                (o if sequence_ids[k] == context_index else None)
                for k, o in enumerate(tokenized_examples["offset_mapping"][i])
            ]

        return tokenized_examples
src/datasets/toxic_spans_spans.py
ADDED
@@ -0,0 +1,238 @@
from src.utils.mapper import configmapper
from transformers import AutoTokenizer
import pandas as pd
from datasets import load_dataset, Dataset
from evaluation.fix_spans import _contiguous_ranges


@configmapper.map("datasets", "toxic_spans_spans")
class ToxicSpansSpansDataset:
    def __init__(self, config):
        # print("### ToxicSpansSpansDataset ###"); exit()
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.config.model_checkpoint_name
        )

        self.dataset = load_dataset("csv", data_files=dict(self.config.train_files))
        self.test_dataset = load_dataset("csv", data_files=dict(self.config.eval_files))

        temp_key_train = list(self.dataset.keys())[0]
        self.intermediate_dataset = self.dataset.map(
            self.create_train_features,
            batched=True,
            batch_size=1000000,  ## Unusually Large Batch Size ## Needed For Correct ID mapping
            remove_columns=self.dataset[temp_key_train].column_names,
        )

        temp_key_test = list(self.test_dataset.keys())[0]
        self.intermediate_test_dataset = self.test_dataset.map(
            self.create_test_features,
            batched=True,
            batch_size=1000000,  ## Unusually Large Batch Size ## Needed For Correct ID mapping
            remove_columns=self.test_dataset[temp_key_test].column_names,
        )

        self.tokenized_inputs = self.intermediate_dataset.map(
            self.prepare_train_features,
            batched=True,
            remove_columns=self.intermediate_dataset[temp_key_train].column_names,
        )
        self.test_tokenized_inputs = self.intermediate_test_dataset.map(
            self.prepare_test_features,
            batched=True,
            remove_columns=self.intermediate_test_dataset[temp_key_test].column_names,
        )

    def create_train_features(self, examples):
        features = {"context": [], "id": [], "question": [], "title": []}
        id = 0
        # print(examples)
        for row_number in range(len(examples["text"])):
            context = examples["text"][row_number]
            # question = "offense"
            question = "ভুল"
            title = context.split(" ")[0]
            span = eval(examples["spans"][row_number])
            contiguous_spans = _contiguous_ranges(span)
            for lst in contiguous_spans:
                lst = list(lst)
                dict_to_write = {}

                dict_to_write["answer_start"] = [lst[0]]
                dict_to_write["text"] = [context[lst[0] : lst[-1] + 1]]
                # print(dict_to_write)
                if "answers" in features.keys():
                    features["answers"].append(dict_to_write)
                else:
                    features["answers"] = [
                        dict_to_write,
                    ]
                features["context"].append(context)
                features["id"].append(str(id))
                features["question"].append(question)
                features["title"].append(title)
                id += 1

        return features

    def create_test_features(self, examples):
        features = {"context": [], "id": [], "question": [], "title": []}
        id = 0
        for row_number in range(len(examples["text"])):
            context = examples["text"][row_number]
            # question = "offense"
            question = "ভুল"
            title = context.split(" ")[0]
            features["context"].append(context)
            features["id"].append(str(id))
            features["question"].append(question)
            features["title"].append(title)
            id += 1
        return features

    def prepare_train_features(self, examples):
        """Generate tokenized features from examples.

        Args:
            examples (dict): The examples to be tokenized.

        Returns:
            transformers.tokenization_utils_base.BatchEncoding:
                The tokenized features/examples after processing.
        """
        # Tokenize our examples with truncation and padding, but keep the
        # overflows using a stride. This results in one example possible
        # giving several features when a context is long, each of those
        # features having a context that overlaps a bit the context
        # of the previous feature.
        pad_on_right = self.tokenizer.padding_side == "right"
        print("### Batch Tokenizing Examples ###")
        tokenized_examples = self.tokenizer(
            examples["question" if pad_on_right else "context"],
            examples["context" if pad_on_right else "question"],
            **dict(self.config.tokenizer_params),
        )

        # Since one example might give us several features if it has
        # a long context, we need a map from a feature to
        # its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
        # The offset mappings will give us a map from token to
        # character position in the original context. This will
        # help us compute the start_positions and end_positions.
        offset_mapping = tokenized_examples.pop("offset_mapping")

        # Let's label those examples!
        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []

        for i, offsets in enumerate(offset_mapping):
            # We will label impossible answers with the index of the CLS token.
            input_ids = tokenized_examples["input_ids"][i]
            cls_index = input_ids.index(self.tokenizer.cls_token_id)

            # Grab the sequence corresponding to that example
            # (to know what is the context and what is the question).
            sequence_ids = tokenized_examples.sequence_ids(i)

            # One example can give several spans, this is the index of
            # the example containing this span of text.
            sample_index = sample_mapping[i]
            answers = examples["answers"][sample_index]
            # If no answers are given, set the cls_index as answer.
            if len(answers["answer_start"]) == 0:
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Start/end character index of the answer in the text.
                start_char = answers["answer_start"][0]
                end_char = start_char + len(answers["text"][0])

                # Start token index of the current span in the text.
                token_start_index = 0
                while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                    token_start_index += 1

                # End token index of the current span in the text.
                token_end_index = len(input_ids) - 1
                while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                    token_end_index -= 1

                # Detect if the answer is out of the span
                # (in which case this feature is labeled with the CLS index).
                if not (
                    offsets[token_start_index][0] <= start_char
                    and offsets[token_end_index][1] >= end_char
                ):
                    tokenized_examples["start_positions"].append(cls_index)
                    tokenized_examples["end_positions"].append(cls_index)
                else:
                    # Otherwise move the token_start_index and
                    # stoken_end_index to the two ends of the answer.
                    # Note: we could go after the last offset
                    # if the answer is the last word (edge case).
                    while (
                        token_start_index < len(offsets)
                        and offsets[token_start_index][0] <= start_char
                    ):
                        token_start_index += 1
                    tokenized_examples["start_positions"].append(token_start_index - 1)
                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples["end_positions"].append(token_end_index + 1)

        return tokenized_examples

    def prepare_test_features(self, examples):

        """Generate tokenized validation features from examples.

        Args:
            examples (dict): The validation examples to be tokenized.

        Returns:
            transformers.tokenization_utils_base.BatchEncoding:
                The tokenized features/examples for validation set after processing.
        """

        # Tokenize our examples with truncation and maybe
        # padding, but keep the overflows using a stride.
        # This results in one example possible giving several features
        # when a context is long, each of those features having a
        # context that overlaps a bit the context of the previous feature.
        print("### Tokenizing Validation Examples")
        pad_on_right = self.tokenizer.padding_side == "right"
        tokenized_examples = self.tokenizer(
            examples["question" if pad_on_right else "context"],
            examples["context" if pad_on_right else "question"],
            **dict(self.config.tokenizer_params),
        )

        # Since one example might give us several features if it has a long context,
        # we need a map from a feature to its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        # We keep the example_id that gave us this feature and we will store the offset mappings.
        tokenized_examples["example_id"] = []

        for i in range(len(tokenized_examples["input_ids"])):
            # Grab the sequence corresponding to that example
            # (to know what is the context and what is the question).
            sequence_ids = tokenized_examples.sequence_ids(i)
            context_index = 1 if pad_on_right else 0

            # One example can give several spans,
            # this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            tokenized_examples["example_id"].append(str(examples["id"][sample_index]))

            # Set to None the offset_mapping that are not part
            # of the context so it's easy to determine if a token
            # position is part of the context or not.
            tokenized_examples["offset_mapping"][i] = [
                (o if sequence_ids[k] == context_index else None)
                for k, o in enumerate(tokenized_examples["offset_mapping"][i])
            ]

        return tokenized_examples
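The create_train_features method above turns each contiguous run of toxic character offsets into one SQuAD-style answer (an answer_start offset plus the answer text sliced out of the comment). Below is a small, self-contained illustration of that mapping; the contiguous_ranges helper here is only a stand-in for what evaluation.fix_spans._contiguous_ranges is assumed to do, not the project's implementation.

# Stand-in for _contiguous_ranges (assumed behaviour: group sorted character
# offsets into inclusive (start, end) runs).
def contiguous_ranges(span):
    ranges, start, prev = [], None, None
    for pos in sorted(span):
        if start is None:
            start = prev = pos
        elif pos == prev + 1:
            prev = pos
        else:
            ranges.append((start, prev))
            start = prev = pos
    if start is not None:
        ranges.append((start, prev))
    return ranges

text = "you are a fool and a liar"
spans = list(range(10, 14)) + list(range(21, 25))  # characters of "fool" and "liar"
for start, end in contiguous_ranges(spans):
    print({"answer_start": [start], "text": [text[start : end + 1]]})
# {'answer_start': [10], 'text': ['fool']}
# {'answer_start': [21], 'text': ['liar']}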
src/datasets/toxic_spans_tokens.py
ADDED
@@ -0,0 +1,81 @@
from src.utils.mapper import configmapper
from transformers import AutoTokenizer
from datasets import load_dataset

# import pdb

@configmapper.map("datasets", "toxic_spans_tokens")
class ToxicSpansTokenDataset:
    def __init__(self, config):
        # print("### ToxicSpansTokenDataset ###"); exit()
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.config.model_checkpoint_name
        )
        # if self.config.model_checkpoint_name == "sberbank-ai/mGPT":
        #     self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.dataset = load_dataset("csv", data_files=dict(self.config.train_files))
        self.test_dataset = load_dataset("csv", data_files=dict(self.config.eval_files))

        self.tokenized_inputs = self.dataset.map(
            self.tokenize_and_align_labels_for_train, batched=True
        )
        self.test_tokenized_inputs = self.test_dataset.map(
            self.tokenize_for_test, batched=True
        )

    def tokenize_and_align_labels_for_train(self, examples):

        tokenized_inputs = self.tokenizer(
            examples["text"], **self.config.tokenizer_params
        )

        # tokenized_inputs["text"] = examples["text"]
        example_spans = []
        labels = []

        offsets_mapping = tokenized_inputs["offset_mapping"]
        # pdb.set_trace()
        for i, offset_mapping in enumerate(offsets_mapping):
            labels.append([])

            spans = eval(examples["spans"][i])
            example_spans.append(spans)
            if self.config.label_cls:
                cls_label = (
                    1
                    if (
                        len(examples["text"][i]) > 0
                        and len(spans) / len(examples["text"][i])
                        > self.config.cls_threshold
                    )
                    else 0
                )  ## Make class label based on threshold
            else:
                cls_label = -100
            for j, offsets in enumerate(offset_mapping):
                if tokenized_inputs["input_ids"][i][j] == self.tokenizer.cls_token_id:
                    labels[-1].append(cls_label)
                elif offsets[0] == offsets[1] and offsets[0] == 0:  # All zero
                    labels[-1].append(-100)  ## SPECIAL TOKEN
                else:
                    toxic_offsets = [x in spans for x in range(offsets[0], offsets[1])]
                    ## If any part of the the token is in span, mark it as Toxic
                    if (
                        len(toxic_offsets) > 0
                        and sum(toxic_offsets) / len(toxic_offsets)
                        > self.config.token_threshold
                    ):
                        labels[-1].append(1)
                    else:
                        labels[-1].append(0)

        tokenized_inputs["labels"] = labels
        # print("tokenized_inputs", tokenized_inputs); exit()
        return tokenized_inputs

    def tokenize_for_test(self, examples):
        tokenized_inputs = self.tokenizer(
            examples["text"], **self.config.tokenizer_params
        )
        return tokenized_inputs
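For orientation, here is a minimal sketch of how ToxicSpansTokenDataset might be instantiated. The attribute names mirror what the class above reads from config; every concrete value (checkpoint name, CSV paths, tokenizer settings, thresholds) is an assumption rather than something this upload specifies. Note that tokenizer_params has to request offset mappings, since the labelling loop reads tokenized_inputs["offset_mapping"].

# Hypothetical configuration; values are placeholders, field names come from the class above.
from types import SimpleNamespace

config = SimpleNamespace(
    model_checkpoint_name="bert-base-cased",      # assumed checkpoint
    train_files={"train": "data/tsd_train.csv"},  # assumed CSVs with "text" and "spans" columns
    eval_files={"test": "data/tsd_test.csv"},
    tokenizer_params={
        "truncation": True,
        "padding": "max_length",
        "return_offsets_mapping": True,           # required: the code reads offset_mapping
    },
    label_cls=False,        # when True, the CLS token gets a sentence-level label
    cls_threshold=0.5,      # fraction of toxic characters needed for CLS label 1
    token_threshold=0.0,    # token labelled 1 if its toxic-character fraction exceeds this
)

dataset = ToxicSpansTokenDataset(config)
train_split = dataset.tokenized_inputs       # datasets.DatasetDict with token-level "labels"
test_split = dataset.test_tokenized_inputs   # tokenized test set without labels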
src/datasets/toxic_spans_tokens_3cls.py
ADDED
@@ -0,0 +1,102 @@
from src.utils.mapper import configmapper
from transformers import AutoTokenizer
from datasets import load_dataset

import pdb

@configmapper.map("datasets", "toxic_spans_tokens_3cls")
class ToxicSpansToken3CLSDataset:
    def __init__(self, config):
        # print("### ToxicSpansTokenDataset ###"); exit()
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.config.model_checkpoint_name
        )
        # if self.config.model_checkpoint_name == "sberbank-ai/mGPT":
        #     self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})
        self.dataset = load_dataset("csv", data_files=dict(self.config.train_files))
        self.test_dataset = load_dataset("csv", data_files=dict(self.config.eval_files))

        self.tokenized_inputs = self.dataset.map(
            self.tokenize_and_align_labels_for_train, batched=True
        )
        self.test_tokenized_inputs = self.test_dataset.map(
            self.tokenize_for_test, batched=True
        )

    def tokenize_and_align_labels_for_train(self, examples):

        tokenized_inputs = self.tokenizer(
            examples["text"], **self.config.tokenizer_params
        )

        # tokenized_inputs["text"] = examples["text"]
        example_spans = []
        labels = []

        offsets_mapping = tokenized_inputs["offset_mapping"]
        # pdb.set_trace()
        for i, offset_mapping in enumerate(offsets_mapping):
            labels.append([])

            spans = eval(examples["spans"][i])
            Bs = eval(examples["Bs"][i])
            Is = eval(examples["Is"][i])
            example_spans.append(spans)
            if self.config.label_cls:
                cls_label = (
                    1
                    if (
                        len(examples["text"][i]) > 0
                        and len(spans) / len(examples["text"][i])
                        > self.config.cls_threshold
                    )
                    else 0
                )  ## Make class label based on threshold
            else:
                cls_label = -100
            for j, offsets in enumerate(offset_mapping):
                if tokenized_inputs["input_ids"][i][j] == self.tokenizer.cls_token_id:
                    labels[-1].append(cls_label)
                elif offsets[0] == offsets[1] and offsets[0] == 0:  # All zero
                    labels[-1].append(-100)  ## SPECIAL TOKEN
                else:
                    # toxic_offsets = [x in spans for x in range(offsets[0], offsets[1])]
                    ## If any part of the the token is in span, mark it as Toxic
                    # if (
                    #     len(toxic_offsets) > 0
                    #     and sum(toxic_offsets) / len(toxic_offsets)
                    #     > self.config.token_threshold
                    # ):
                    #     labels[-1].append(1)
                    # else:
                    #     labels[-1].append(0)
                    b_off = [x in Bs for x in range(offsets[0], offsets[1])]
                    b_off = sum(b_off)
                    i_off = [x in Is for x in range(offsets[0], offsets[1])]
                    i_off = sum(i_off)
                    # if len(b_off) == len(i_off) and len(i_off) == 0:
                    if b_off == 0 and i_off == 0:
                        labels[-1].append(0)
                    # elif len(b_off) >= len(i_off) == 1:
                    elif b_off >= i_off:
                        labels[-1].append(1)
                        # print(b_off)
                        # print(i_off)
                        # print(j)
                    else:
                        labels[-1].append(2)

        # pdb.set_trace()

        tokenized_inputs["labels"] = labels
        # print("tokenized_inputs", tokenized_inputs); exit()
        return tokenized_inputs

    def tokenize_for_test(self, examples):
        tokenized_inputs = self.tokenizer(
            examples["text"], **self.config.tokenizer_params
        )
        return tokenized_inputs
src/datasets/toxic_spans_tokens_spans.py
ADDED
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from src.utils.mapper import configmapper
|
2 |
+
from transformers import AutoTokenizer
|
3 |
+
import pandas as pd
|
4 |
+
from datasets import load_dataset, Dataset
|
5 |
+
from evaluation.fix_spans import _contiguous_ranges
|
6 |
+
|
7 |
+
|
8 |
+
@configmapper.map("datasets", "toxic_spans_tokens_spans")
|
9 |
+
class ToxicSpansTokensSpansDataset:
|
10 |
+
def __init__(self, config):
|
11 |
+
self.config = config
|
12 |
+
self.tokenizer = AutoTokenizer.from_pretrained(
|
13 |
+
self.config.model_checkpoint_name
|
14 |
+
)
|
15 |
+
|
16 |
+
self.dataset = load_dataset("csv", data_files=dict(self.config.train_files))
|
17 |
+
self.test_dataset = load_dataset("csv", data_files=dict(self.config.eval_files))
|
18 |
+
|
19 |
+
temp_key_train = list(self.dataset.keys())[0]
|
20 |
+
self.intermediate_dataset = self.dataset.map(
|
21 |
+
self.create_train_features,
|
22 |
+
batched=True,
|
23 |
+
batch_size=1000000, ##Unusually Large Batch Size ## Needed For Correct ID mapping
|
24 |
+
remove_columns=self.dataset[temp_key_train].column_names,
|
25 |
+
)
|
26 |
+
|
27 |
+
temp_key_test = list(self.test_dataset.keys())[0]
|
28 |
+
self.intermediate_test_dataset = self.test_dataset.map(
|
29 |
+
self.create_test_features,
|
30 |
+
batched=True,
|
31 |
+
batch_size=1000000, ##Unusually Large Batch Size ## Needed For Correct ID mapping
|
32 |
+
remove_columns=self.test_dataset[temp_key_test].column_names,
|
33 |
+
)
|
34 |
+
|
35 |
+
self.tokenized_inputs = self.intermediate_dataset.map(
|
36 |
+
self.prepare_train_features,
|
37 |
+
batched=True,
|
38 |
+
remove_columns=self.intermediate_dataset[temp_key_train].column_names,
|
39 |
+
)
|
40 |
+
self.test_tokenized_inputs = self.intermediate_test_dataset.map(
|
41 |
+
self.prepare_test_features,
|
42 |
+
batched=True,
|
43 |
+
remove_columns=self.intermediate_test_dataset[temp_key_test].column_names,
|
44 |
+
)
|
45 |
+
|
46 |
+
def create_train_features(self, examples):
|
47 |
+
features = {"context": [], "id": [], "question": [], "title": [], "spans": []}
|
48 |
+
id = 0
|
49 |
+
# print(examples)
|
50 |
+
for row_number in range(len(examples["text"])):
|
51 |
+
context = examples["text"][row_number]
|
52 |
+
question = "offense"
|
53 |
+
title = context.split(" ")[0]
|
54 |
+
span = eval(examples["spans"][row_number])
|
55 |
+
contiguous_spans = _contiguous_ranges(span)
|
56 |
+
for lst in contiguous_spans:
|
57 |
+
lst = list(lst)
|
58 |
+
dict_to_write = {}
|
59 |
+
|
60 |
+
dict_to_write["answer_start"] = [lst[0]]
|
61 |
+
dict_to_write["text"] = [context[lst[0] : lst[-1] + 1]]
|
62 |
+
# print(dict_to_write)
|
63 |
+
if "answers" in features.keys():
|
64 |
+
features["answers"].append(dict_to_write)
|
65 |
+
else:
|
66 |
+
features["answers"] = [
|
67 |
+
dict_to_write,
|
68 |
+
]
|
69 |
+
features["context"].append(context)
|
70 |
+
features["id"].append(str(id))
|
71 |
+
features["question"].append(question)
|
72 |
+
features["title"].append(title)
|
73 |
+
features["spans"].append(span)
|
74 |
+
id += 1
|
75 |
+
|
76 |
+
return features
|
77 |
+
|
78 |
+
def create_test_features(self, examples):
|
79 |
+
features = {"context": [], "id": [], "question": [], "title": []}
|
80 |
+
id = 0
|
81 |
+
for row_number in range(len(examples["text"])):
|
82 |
+
context = examples["text"][row_number]
|
83 |
+
question = "offense"
|
84 |
+
title = context.split(" ")[0]
|
85 |
+
features["context"].append(context)
|
86 |
+
features["id"].append(str(id))
|
87 |
+
features["question"].append(question)
|
88 |
+
features["title"].append(title)
|
89 |
+
id += 1
|
90 |
+
return features
|
91 |
+
|
92 |
+
def prepare_train_features(self, examples):
|
93 |
+
"""Generate tokenized features from examples.
|
94 |
+
|
95 |
+
Args:
|
96 |
+
examples (dict): The examples to be tokenized.
|
97 |
+
|
98 |
+
Returns:
|
99 |
+
transformers.tokenization_utils_base.BatchEncoding:
|
100 |
+
The tokenized features/examples after processing.
|
101 |
+
"""
|
102 |
+
# Tokenize our examples with truncation and padding, but keep the
|
103 |
+
# overflows using a stride. This results in one example possible
|
104 |
+
# giving several features when a context is long, each of those
|
105 |
+
# features having a context that overlaps a bit the context
|
106 |
+
# of the previous feature.
|
107 |
+
pad_on_right = self.tokenizer.padding_side == "right"
|
108 |
+
print("### Batch Tokenizing Examples ###")
|
109 |
+
tokenized_examples = self.tokenizer(
|
110 |
+
examples["question" if pad_on_right else "context"],
|
111 |
+
examples["context" if pad_on_right else "question"],
|
112 |
+
**dict(self.config.tokenizer_params),
|
113 |
+
)
|
114 |
+
|
115 |
+
# Since one example might give us several features if it has
|
116 |
+
# a long context, we need a map from a feature to
|
117 |
+
# its corresponding example. This key gives us just that.
|
118 |
+
sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
|
119 |
+
# The offset mappings will give us a map from token to
|
120 |
+
# character position in the original context. This will
|
121 |
+
# help us compute the start_positions and end_positions.
|
122 |
+
offset_mapping = tokenized_examples.pop("offset_mapping")
|
123 |
+
|
124 |
+
# Let's label those examples!
|
125 |
+
token_labels = []
|
126 |
+
tokenized_examples["start_positions"] = []
|
127 |
+
tokenized_examples["end_positions"] = []
|
128 |
+
|
129 |
+
for i, offsets in enumerate(offset_mapping):
|
130 |
+
# We will label impossible answers with the index of the CLS token.
|
131 |
+
|
132 |
+
token_labels.append([])
|
133 |
+
input_ids = tokenized_examples["input_ids"][i]
|
134 |
+
spans = examples["spans"][i]
|
135 |
+
if self.config.label_cls:
|
136 |
+
cls_label = (
|
137 |
+
1
|
138 |
+
if (
|
139 |
+
len(examples["context"][i]) > 0
|
140 |
+
and len(spans) / len(examples["context"][i])
|
141 |
+
> self.config.cls_threshold
|
142 |
+
)
|
143 |
+
else 0
|
144 |
+
) ## Make class label based on threshold
|
145 |
+
else:
|
146 |
+
cls_label = -100
|
147 |
+
for j, offset in enumerate(offsets):
|
148 |
+
if tokenized_examples["input_ids"][i][j] == self.tokenizer.cls_token_id:
|
149 |
+
token_labels[-1].append(cls_label)
|
150 |
+
elif offset[0] == offset[1] and offset[0] == 0:
|
151 |
+
token_labels[-1].append(-100) ## SPECIAL TOKEN
|
152 |
+
else:
|
153 |
+
toxic_offsets = [x in spans for x in range(offset[0], offset[1])]
|
154 |
+
## If any part of the the token is in span, mark it as Toxic
|
155 |
+
if (
|
156 |
+
len(toxic_offsets) > 0
|
157 |
+
and sum(toxic_offsets) / len(toxic_offsets)
|
158 |
+
> self.config.token_threshold
|
159 |
+
):
|
160 |
+
token_labels[-1].append(1)
|
161 |
+
else:
|
162 |
+
token_labels[-1].append(0)
|
163 |
+
|
164 |
+
cls_index = input_ids.index(self.tokenizer.cls_token_id)
|
165 |
+
|
166 |
+
# Grab the sequence corresponding to that example
|
167 |
+
# (to know what is the context and what is the question).
|
168 |
+
sequence_ids = tokenized_examples.sequence_ids(i)
|
169 |
+
|
170 |
+
# One example can give several spans, this is the index of
|
171 |
+
# the example containing this span of text.
|
172 |
+
sample_index = sample_mapping[i]
|
173 |
+
            answers = examples["answers"][sample_index]
            # If no answers are given, set the cls_index as answer.
            if len(answers["answer_start"]) == 0:
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Start/end character index of the answer in the text.
                start_char = answers["answer_start"][0]
                end_char = start_char + len(answers["text"][0])

                # Start token index of the current span in the text.
                token_start_index = 0
                while sequence_ids[token_start_index] != (1 if pad_on_right else 0):
                    token_start_index += 1

                # End token index of the current span in the text.
                token_end_index = len(input_ids) - 1
                while sequence_ids[token_end_index] != (1 if pad_on_right else 0):
                    token_end_index -= 1

                # Detect if the answer is out of the span
                # (in which case this feature is labeled with the CLS index).
                if not (
                    offsets[token_start_index][0] <= start_char
                    and offsets[token_end_index][1] >= end_char
                ):
                    tokenized_examples["start_positions"].append(cls_index)
                    tokenized_examples["end_positions"].append(cls_index)
                else:
                    # Otherwise move the token_start_index and
                    # token_end_index to the two ends of the answer.
                    # Note: we could go after the last offset
                    # if the answer is the last word (edge case).
                    while (
                        token_start_index < len(offsets)
                        and offsets[token_start_index][0] <= start_char
                    ):
                        token_start_index += 1
                    tokenized_examples["start_positions"].append(token_start_index - 1)
                    while offsets[token_end_index][1] >= end_char:
                        token_end_index -= 1
                    tokenized_examples["end_positions"].append(token_end_index + 1)
        tokenized_examples["labels"] = token_labels
        return tokenized_examples

    def prepare_test_features(self, examples):
        """Generate tokenized validation features from examples.

        Args:
            examples (dict): The validation examples to be tokenized.

        Returns:
            transformers.tokenization_utils_base.BatchEncoding:
                The tokenized features/examples for validation set after processing.
        """
        # Tokenize our examples with truncation and maybe
        # padding, but keep the overflows using a stride.
        # This results in one example possibly giving several features
        # when a context is long, each of those features having a
        # context that overlaps a bit the context of the previous feature.
        print("### Tokenizing Validation Examples")
        pad_on_right = self.tokenizer.padding_side == "right"
        tokenized_examples = self.tokenizer(
            examples["question" if pad_on_right else "context"],
            examples["context" if pad_on_right else "question"],
            **dict(self.config.tokenizer_params),
        )

        # Since one example might give us several features if it has a long context,
        # we need a map from a feature to its corresponding example. This key gives us just that.
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

        # We keep the example_id that gave us this feature and we will store the offset mappings.
        tokenized_examples["example_id"] = []

        for i in range(len(tokenized_examples["input_ids"])):
            # Grab the sequence corresponding to that example
            # (to know what is the context and what is the question).
            sequence_ids = tokenized_examples.sequence_ids(i)
            context_index = 1 if pad_on_right else 0

            # One example can give several spans,
            # this is the index of the example containing this span of text.
            sample_index = sample_mapping[i]
            tokenized_examples["example_id"].append(str(examples["id"][sample_index]))

            # Set to None the offset_mapping that are not part
            # of the context so it's easy to determine if a token
            # position is part of the context or not.
            tokenized_examples["offset_mapping"][i] = [
                (o if sequence_ids[k] == context_index else None)
                for k, o in enumerate(tokenized_examples["offset_mapping"][i])
            ]

        return tokenized_examples
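The two feature builders above follow the standard Hugging Face question-answering preprocessing recipe, so they are typically applied with `datasets.Dataset.map` in batched mode. A hypothetical usage sketch, not part of the uploaded files: `spans_data` stands for an instance of the dataset class above, and `raw_train`/`raw_val` for `datasets.Dataset` splits carrying the "id", "question", "context" and "answers" columns these methods read.

# Hypothetical sketch; `spans_data`, `raw_train` and `raw_val` are assumptions.
train_features = raw_train.map(
    spans_data.prepare_train_features,
    batched=True,
    remove_columns=raw_train.column_names,
)
val_features = raw_val.map(
    spans_data.prepare_test_features,
    batched=True,
    remove_columns=raw_val.column_names,
)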
src/models/__init__.py
ADDED
@@ -0,0 +1,7 @@
from src.models.auto_models import *
from src.models.bert_token_spans import *
from src.models.roberta_token_spans import *
from src.models.bert_multi_spans import *
from src.models.roberta_multi_spans import *
from src.models.bert_crf_token import *
from src.models.roberta_crf_token import *
src/models/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (443 Bytes). View file
src/models/__pycache__/auto_models.cpython-38.pyc
ADDED
Binary file (436 Bytes). View file
src/models/__pycache__/bert_crf_token.cpython-38.pyc
ADDED
Binary file (1.68 kB). View file
src/models/__pycache__/bert_multi_spans.cpython-38.pyc
ADDED
Binary file (1.72 kB). View file
src/models/__pycache__/bert_token_spans.cpython-38.pyc
ADDED
Binary file (2.34 kB). View file
src/models/__pycache__/roberta_crf_token.cpython-38.pyc
ADDED
Binary file (1.69 kB). View file
src/models/__pycache__/roberta_multi_spans.cpython-38.pyc
ADDED
Binary file (1.79 kB). View file
src/models/__pycache__/roberta_token_spans.cpython-38.pyc
ADDED
Binary file (2.42 kB). View file
src/models/auto_models.py
ADDED
@@ -0,0 +1,6 @@
from transformers import AutoModelForTokenClassification, AutoModelForQuestionAnswering
from src.utils.mapper import configmapper

configmapper.map("models", "autotoken")(AutoModelForTokenClassification)
configmapper.map("models", "autotoken_3cls")(AutoModelForTokenClassification)
configmapper.map("models", "autospans")(AutoModelForQuestionAnswering)
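These mappings register stock Hugging Face classes under string keys so that other parts of the repo can resolve a model purely from configuration, the same way the preprocessors and trainer below call configmapper.get_object. A hypothetical lookup sketch; the checkpoint name and label count are assumptions, not taken from the upload.

# Hypothetical sketch of resolving a registered model class.
from src.utils.mapper import configmapper

model_cls = configmapper.get_object("models", "autotoken")  # AutoModelForTokenClassification
model = model_cls.from_pretrained("google/electra-base-discriminator", num_labels=2)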
src/models/bert_crf_token.py
ADDED
@@ -0,0 +1,72 @@
import torch

# from transformers import BertForTokenClassification
from transformers import ElectraForTokenClassification
from torchcrf import CRF
from src.utils.mapper import configmapper

# import pdb


@configmapper.map("models", "bert_crf_token")
# class BertLSTMCRF(BertForTokenClassification):
class BertLSTMCRF(ElectraForTokenClassification):
    def __init__(self, config, lstm_hidden_size, lstm_layers):
        super().__init__(config)
        # ipdb.set_trace()
        self.lstm = torch.nn.LSTM(
            input_size=config.hidden_size,
            hidden_size=lstm_hidden_size,
            num_layers=lstm_layers,
            dropout=0.2,
            batch_first=True,
            bidirectional=True,
        )
        self.crf = CRF(config.num_labels, batch_first=True)

        del self.classifier
        self.classifier = torch.nn.Linear(2 * lstm_hidden_size, config.num_labels)

    def forward(
        self,
        input_ids,
        attention_mask=None,
        token_type_ids=None,
        labels=None,
        prediction_mask=None,
    ):
        # pdb.set_trace()

        # outputs = self.bert(
        outputs = self.electra(
            input_ids,
            attention_mask,
            token_type_ids,
            output_hidden_states=True,
            return_dict=False,
        )
        # seq_output, all_hidden_states, all_self_attentions, all_cross_attentions

        sequence_output = outputs[0]  # outputs[1] is pooled output which is none.

        sequence_output = self.dropout(sequence_output)

        lstm_out, *_ = self.lstm(sequence_output)
        sequence_output = self.dropout(lstm_out)

        logits = self.classifier(sequence_output)

        ## CRF
        mask = prediction_mask
        mask = mask[:, : logits.size(1)].contiguous()

        # print(logits)

        if labels is not None:
            labels = labels[:, : logits.size(1)].contiguous()
            loss = -self.crf(logits, labels, mask=mask.bool(), reduction="token_mean")

        tags = self.crf.decode(logits, mask.bool())
        # print(tags)
        if labels is not None:
            return (loss, logits, tags)
        else:
            return (logits, tags)
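For readers unfamiliar with pytorch-crf: the negated self.crf(...) call above is the negative log-likelihood used as the training loss, and crf.decode returns one variable-length list of label ids per sequence, covering only the positions allowed by the mask. A minimal, self-contained shape sketch with random emissions, not tied to the model above; all sizes are made up.

# Self-contained sketch of the torchcrf calls used above (requires pytorch-crf).
import torch
from torchcrf import CRF

num_labels, batch_size, seq_len = 2, 2, 6
crf = CRF(num_labels, batch_first=True)
logits = torch.randn(batch_size, seq_len, num_labels)   # stand-in for classifier output
labels = torch.randint(0, num_labels, (batch_size, seq_len))
mask = torch.ones(batch_size, seq_len, dtype=torch.bool)
mask[1, 4:] = False  # pretend the second sequence is padded after 4 tokens

loss = -crf(logits, labels, mask=mask, reduction="token_mean")  # negative log-likelihood
tags = crf.decode(logits, mask)  # list of per-sequence label-id lists (lengths 6 and 4 here)
print(loss.item(), tags)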
src/models/bert_multi_spans.py
ADDED
@@ -0,0 +1,84 @@
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss

# from transformers import BertModel, BertPreTrainedModel
from transformers import ElectraPreTrainedModel, ElectraModel
from src.utils.mapper import configmapper


@configmapper.map("models", "bert_multi_spans")
# class BertForMultiSpans(BertPreTrainedModel):
class BertForMultiSpans(ElectraPreTrainedModel):
    def __init__(self, config):
        super(BertForMultiSpans, self).__init__(config)
        # self.bert = BertModel(config)
        self.bert = ElectraModel(config)
        self.num_labels = config.num_labels

        # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version
        # self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        start_positions=None,
        end_positions=None,
        output_attentions=None,
        output_hidden_states=None,
    ):
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=None,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)  # batch_size
        # print(start_logits.shape, end_logits.shape, start_positions.shape, end_positions.shape)

        total_loss = None
        if (
            start_positions is not None and end_positions is not None
        ):  # [batch_size/seq_length]
            # # If we are on multi-GPU, split add a dimension
            # if len(start_positions.size()) > 1:
            #     start_positions = start_positions.squeeze(-1)
            # if len(end_positions.size()) > 1:
            #     end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            # ignored_index = start_logits.size(1)
            # start_positions.clamp_(0, ignored_index)
            # end_positions.clamp_(0, ignored_index)

            # start_positions = start_logits.view()

            loss_fct = BCEWithLogitsLoss()

            start_loss = loss_fct(
                start_logits,
                start_positions.float(),
            )
            end_loss = loss_fct(
                end_logits,
                end_positions.float(),
            )
            total_loss = (start_loss + end_loss) / 2

        output = (start_logits, end_logits) + outputs[2:]
        return ((total_loss,) + output) if total_loss is not None else output
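Because the start/end heads here are trained with BCEWithLogitsLoss over the full sequence, the targets are multi-hot vectors (one bit per span boundary) rather than the single indices used by standard SQuAD heads. A self-contained sketch of that convention; the sizes and span offsets below are made up for illustration.

# Self-contained sketch of the multi-hot start/end target convention.
import torch
from torch.nn import BCEWithLogitsLoss

batch_size, seq_len = 2, 16
start_logits = torch.randn(batch_size, seq_len)  # stand-ins for the model's outputs
end_logits = torch.randn(batch_size, seq_len)

start_positions = torch.zeros(batch_size, seq_len)
end_positions = torch.zeros(batch_size, seq_len)
start_positions[0, [3, 9]] = 1.0  # example 0: two spans starting at tokens 3 and 9
end_positions[0, [5, 11]] = 1.0
start_positions[1, 7] = 1.0       # example 1: a single span
end_positions[1, 8] = 1.0

loss_fct = BCEWithLogitsLoss()
total_loss = (
    loss_fct(start_logits, start_positions) + loss_fct(end_logits, end_positions)
) / 2
print(total_loss.item())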
src/models/bert_token_spans.py
ADDED
@@ -0,0 +1,100 @@
import torch.nn as nn
import torch
from torch.nn import CrossEntropyLoss

# from transformers import BertPreTrainedModel, BertModel
from transformers import ElectraPreTrainedModel, ElectraModel
from src.utils.mapper import configmapper


@configmapper.map("models", "bert_token_spans")
# class BertModelForTokenAndSpans(BertPreTrainedModel):
class BertModelForTokenAndSpans(ElectraPreTrainedModel):
    def __init__(self, config, num_token_labels=2, num_qa_labels=2):
        super(BertModelForTokenAndSpans, self).__init__(config)
        # self.bert = BertModel(config)
        self.bert = ElectraModel(config)
        self.num_token_labels = num_token_labels
        self.num_qa_labels = num_qa_labels
        # print("Number of Token Labels: ", num_token_labels); exit()

        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_token_labels)
        self.qa_outputs = nn.Linear(config.hidden_size, num_qa_labels)
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        start_positions=None,
        end_positions=None,
        labels=None,  # Token Wise Labels
        output_attentions=None,
        output_hidden_states=None,
    ):

        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=None,
        )

        sequence_output = outputs[0]

        qa_logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = qa_logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        sequence_output = self.dropout(sequence_output)
        token_logits = self.classifier(sequence_output)

        total_loss = None
        if (
            start_positions is not None
            and end_positions is not None
            and labels is not None
        ):
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)

            ignored_index = start_logits.size(1)
            start_positions.clamp_(0, ignored_index)
            end_positions.clamp_(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)

            loss_fct = CrossEntropyLoss()
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = token_logits.view(-1, self.num_token_labels)
                active_labels = torch.where(
                    active_loss,
                    labels.view(-1),
                    torch.tensor(loss_fct.ignore_index).type_as(labels),
                )
                token_loss = loss_fct(active_logits, active_labels)
            else:
                token_loss = loss_fct(
                    token_logits.view(-1, self.num_token_labels), labels.view(-1)
                )

            total_loss = (start_loss + end_loss) / 2 + token_loss

        output = (start_logits, end_logits, token_logits) + outputs[2:]
        return ((total_loss,) + output) if total_loss is not None else output
src/models/roberta_crf_token.py
ADDED
@@ -0,0 +1,66 @@
import torch
from transformers import RobertaForTokenClassification
from torchcrf import CRF
from src.utils.mapper import configmapper


@configmapper.map("models", "roberta_crf_token")
class RobertaLSTMCRF(RobertaForTokenClassification):
    def __init__(self, config, lstm_hidden_size, lstm_layers):
        super().__init__(config)
        self.lstm = torch.nn.LSTM(
            input_size=config.hidden_size,
            hidden_size=lstm_hidden_size,
            num_layers=lstm_layers,
            dropout=0.2,
            batch_first=True,
            bidirectional=True,
        )
        self.crf = CRF(config.num_labels, batch_first=True)

        del self.classifier
        self.classifier = torch.nn.Linear(2 * lstm_hidden_size, config.num_labels)

    def forward(
        self,
        input_ids,
        attention_mask=None,
        token_type_ids=None,
        labels=None,
        prediction_mask=None,
    ):

        outputs = self.roberta(
            input_ids,
            attention_mask,
            token_type_ids,
            output_hidden_states=True,
            return_dict=False,
        )
        # seq_output, all_hidden_states, all_self_attentions, all_cross_attentions

        sequence_output = outputs[0]  # outputs[1] is pooled output which is none.

        sequence_output = self.dropout(sequence_output)

        lstm_out, *_ = self.lstm(sequence_output)
        sequence_output = self.dropout(lstm_out)

        logits = self.classifier(sequence_output)

        ## CRF
        mask = prediction_mask
        mask = mask[:, : logits.size(1)].contiguous()

        # print(logits)

        if labels is not None:
            labels = labels[:, : logits.size(1)].contiguous()
            loss = -self.crf(logits, labels, mask=mask.bool(), reduction="token_mean")

        tags = self.crf.decode(logits, mask.bool())
        # print(tags)
        if labels is not None:
            return (loss, logits, tags)
        else:
            return (logits, tags)
src/models/roberta_multi_spans.py
ADDED
@@ -0,0 +1,82 @@
import torch.nn as nn
from torch.nn import BCEWithLogitsLoss
from transformers import RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel
from src.utils.mapper import configmapper


@configmapper.map("models", "roberta_multi_spans")
class RobertaForMultiSpans(RobertaPreTrainedModel):
    def __init__(self, config):
        super(RobertaForMultiSpans, self).__init__(config)
        self.roberta = RobertaModel(config)
        self.num_labels = config.num_labels

        # TODO check with Google if it's normal there is no dropout on the token classifier of SQuAD in the TF version
        # self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        start_positions=None,
        end_positions=None,
        output_attentions=None,
        output_hidden_states=None,
    ):
        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=None,
        )

        sequence_output = outputs[0]

        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)  # batch_size
        # print(start_logits.shape, end_logits.shape, start_positions.shape, end_positions.shape)

        total_loss = None
        if (
            start_positions is not None and end_positions is not None
        ):  # [batch_size/seq_length]
            # # If we are on multi-GPU, split add a dimension
            # if len(start_positions.size()) > 1:
            #     start_positions = start_positions.squeeze(-1)
            # if len(end_positions.size()) > 1:
            #     end_positions = end_positions.squeeze(-1)
            # sometimes the start/end positions are outside our model inputs, we ignore these terms
            # ignored_index = start_logits.size(1)
            # start_positions.clamp_(0, ignored_index)
            # end_positions.clamp_(0, ignored_index)

            # start_positions = start_logits.view()

            loss_fct = BCEWithLogitsLoss()

            start_loss = loss_fct(
                start_logits,
                start_positions.float(),
            )
            end_loss = loss_fct(
                end_logits,
                end_positions.float(),
            )
            total_loss = (start_loss + end_loss) / 2

        output = (start_logits, end_logits) + outputs[2:]
        return ((total_loss,) + output) if total_loss is not None else output
src/models/roberta_token_spans.py
ADDED
@@ -0,0 +1,97 @@
import torch.nn as nn
import torch
from torch.nn import CrossEntropyLoss
from transformers import RobertaModel
from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel
from src.utils.mapper import configmapper


@configmapper.map("models", "roberta_token_spans")
class RobertaModelForTokenAndSpans(RobertaPreTrainedModel):
    def __init__(self, config, num_token_labels=2, num_qa_labels=2):
        super(RobertaModelForTokenAndSpans, self).__init__(config)
        self.roberta = RobertaModel(config)
        self.num_token_labels = num_token_labels
        self.num_qa_labels = num_qa_labels

        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, num_token_labels)
        self.qa_outputs = nn.Linear(config.hidden_size, num_qa_labels)
        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        start_positions=None,
        end_positions=None,
        labels=None,  # Token Wise Labels
        output_attentions=None,
        output_hidden_states=None,
    ):

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=None,
        )

        sequence_output = outputs[0]

        qa_logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = qa_logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)

        sequence_output = self.dropout(sequence_output)
        token_logits = self.classifier(sequence_output)

        total_loss = None
        if (
            start_positions is not None
            and end_positions is not None
            and labels is not None
        ):
            # If we are on multi-GPU, split add a dimension
            if len(start_positions.size()) > 1:
                start_positions = start_positions.squeeze(-1)
            if len(end_positions.size()) > 1:
                end_positions = end_positions.squeeze(-1)

            ignored_index = start_logits.size(1)
            start_positions.clamp_(0, ignored_index)
            end_positions.clamp_(0, ignored_index)

            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_positions)
            end_loss = loss_fct(end_logits, end_positions)

            loss_fct = CrossEntropyLoss()
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = token_logits.view(-1, self.num_token_labels)
                active_labels = torch.where(
                    active_loss,
                    labels.view(-1),
                    torch.tensor(loss_fct.ignore_index).type_as(labels),
                )
                token_loss = loss_fct(active_logits, active_labels)
            else:
                token_loss = loss_fct(
                    token_logits.view(-1, self.num_token_labels), labels.view(-1)
                )

            total_loss = (start_loss + end_loss) / 2 + token_loss

        output = (start_logits, end_logits, token_logits) + outputs[2:]
        return ((total_loss,) + output) if total_loss is not None else output
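The two token-and-spans models combine a token-classification head with a SQuAD-style span head and add the two losses. A hypothetical construction/usage sketch, not taken from the upload: the checkpoint, label counts and input tensors are assumptions, and the extra constructor arguments are expected to be forwarded by from_pretrained to __init__.

# Hypothetical sketch; input_ids, attention_mask and the targets are assumed
# to come from the tokenization code earlier in this upload.
model = RobertaModelForTokenAndSpans.from_pretrained(
    "roberta-base", num_token_labels=2, num_qa_labels=2
)
outputs = model(
    input_ids=input_ids,              # (batch, seq_len) tensors from the tokenizer
    attention_mask=attention_mask,
    start_positions=start_positions,  # providing all three targets triggers the combined loss
    end_positions=end_positions,
    labels=token_labels,
)
total_loss, start_logits, end_logits, token_logits = outputs[:4]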
src/models/two_layer_nn.py
ADDED
@@ -0,0 +1,46 @@
"""Implements a two layer Neural Network."""

from torch.nn import Module, Linear, ReLU
from src.utils.mapper import configmapper


@configmapper.map("models", "two_layer_nn")
class TwoLayerNN(Module):
    """Implements two layer neural network.

    Methods:
        forward(x_input): Returns the output of the neural network.
    """

    def __init__(self, embedding, dims):
        """Construct the two layer Neural Network.

        This method is used to initialize the two layer neural network,
        with a given embedding type and corresponding arguments.

        Args:
            embedding (torch.nn.Module): The embedding layer for the model.
            dims (list): List of dimensions for the neural network, input to output.
        """
        super(TwoLayerNN, self).__init__()

        self.embedding = embedding
        self.linear1 = Linear(dims[0], dims[1])
        self.relu = ReLU()
        self.linear2 = Linear(dims[1], dims[2])

    def forward(self, x_input):
        """Return the output of the neural network for an input.

        Args:
            x_input (torch.Tensor): The input tensor to the neural network.

        Returns:
            x_output (torch.Tensor): The output tensor for the neural network.
        """
        output = self.embedding(x_input)
        output = self.linear1(output)
        output = self.relu(output)
        x_output = self.linear2(output)
        return x_output
src/modules/__init__.py
ADDED
File without changes
src/modules/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (166 Bytes). View file
src/modules/__pycache__/embeddings.cpython-38.pyc
ADDED
Binary file (1.67 kB). View file
src/modules/__pycache__/preprocessors.cpython-38.pyc
ADDED
Binary file (3.42 kB). View file
src/modules/__pycache__/tokenizers.cpython-38.pyc
ADDED
Binary file (4.87 kB). View file
src/modules/activations.py
ADDED
@@ -0,0 +1,6 @@
import torch.nn as nn
from src.utils.mapper import configmapper

configmapper.map("activations", "relu")(nn.ReLU)
configmapper.map("activations", "logsoftmax")(nn.LogSoftmax)
configmapper.map("activations", "softmax")(nn.Softmax)
src/modules/embeddings.py
ADDED
@@ -0,0 +1,37 @@
"""Contains various kinds of embeddings like Glove, BERT, etc."""

from torch.nn import Module, Embedding, Flatten
from src.utils.mapper import configmapper


@configmapper.map("embeddings", "glove")
class GloveEmbedding(Module):
    """Implement Glove based Word Embedding."""

    def __init__(self, embedding_matrix, padding_idx, static=True):
        """Construct GloveEmbedding.

        Args:
            embedding_matrix (torch.Tensor): The matrix containing the embedding weights.
            padding_idx (int): The padding index in the tokenizer.
            static (bool): Whether or not to freeze embeddings.
        """
        super(GloveEmbedding, self).__init__()
        self.embedding = Embedding.from_pretrained(embedding_matrix)
        self.embedding.padding_idx = padding_idx
        if static:
            self.embedding.weight.requires_grad = False
        self.flatten = Flatten(start_dim=1)

    def forward(self, x_input):
        """Pass the input through the embedding.

        Args:
            x_input (torch.Tensor): The numericalized tokenized input

        Returns:
            x_output (torch.Tensor): The output from the embedding
        """
        x_output = self.embedding(x_input)
        x_output = self.flatten(x_output)
        return x_output
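GloveEmbedding flattens the looked-up vectors, which matches the dims[0] = fix_length * embedding_dim input that TwoLayerNN (two_layer_nn.py above) expects. A self-contained sketch assuming both classes are imported from the modules above; the tiny random matrix stands in for real GloVe vectors and all sizes are made up.

# Sketch wiring GloveEmbedding into TwoLayerNN with a random embedding matrix.
import torch

vocab_size, emb_dim, fix_length = 10, 8, 4
embedding_matrix = torch.randn(vocab_size, emb_dim)
embedding = GloveEmbedding(embedding_matrix, padding_idx=0)

model = TwoLayerNN(embedding, dims=[fix_length * emb_dim, 16, 1])
token_ids = torch.randint(0, vocab_size, (3, fix_length))  # batch of 3 padded sequences
print(model(token_ids).shape)  # torch.Size([3, 1])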
src/modules/losses.py
ADDED
@@ -0,0 +1,6 @@
"""All criterion functions."""
from torch.nn import MSELoss, CrossEntropyLoss
from src.utils.mapper import configmapper

configmapper.map("losses", "mse")(MSELoss)
configmapper.map("losses", "CrossEntropyLoss")(CrossEntropyLoss)
src/modules/metrics.py
ADDED
@@ -0,0 +1,17 @@
"""Metrics."""
from sklearn.metrics import (
    mean_squared_error,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    accuracy_score,
)
from src.utils.mapper import configmapper

configmapper.map("metrics", "sklearn_f1")(f1_score)
configmapper.map("metrics", "sklearn_p")(precision_score)
configmapper.map("metrics", "sklearn_r")(recall_score)
configmapper.map("metrics", "sklearn_roc")(roc_auc_score)
configmapper.map("metrics", "sklearn_acc")(accuracy_score)
configmapper.map("metrics", "sklearn_mse")(mean_squared_error)
src/modules/optimizers.py
ADDED
@@ -0,0 +1,7 @@
"""All optimizer functions."""
from torch.optim import Adam, AdamW, SGD
from src.utils.mapper import configmapper

configmapper.map("optimizers", "adam")(Adam)
configmapper.map("optimizers", "adam_w")(AdamW)
configmapper.map("optimizers", "sgd")(SGD)
src/modules/preprocessors.py
ADDED
@@ -0,0 +1,112 @@
from src.modules.tokenizers import *
from src.modules.embeddings import *
from src.utils.mapper import configmapper


class Preprocessor:
    def preprocess(self):
        pass


@configmapper.map("preprocessors", "glove")
class GlovePreprocessor(Preprocessor):
    """GlovePreprocessor."""

    def __init__(self, config):
        """
        Args:
            config (src.utils.module.Config): configuration for preprocessor
        """
        super(GlovePreprocessor, self).__init__()
        self.config = config
        self.tokenizer = configmapper.get_object(
            "tokenizers", self.config.main.preprocessor.tokenizer.name
        )(**self.config.main.preprocessor.tokenizer.init_params.as_dict())
        self.tokenizer_params = (
            self.config.main.preprocessor.tokenizer.init_vector_params.as_dict()
        )

        self.tokenizer.initialize_vectors(**self.tokenizer_params)
        self.embeddings = configmapper.get_object(
            "embeddings", self.config.main.preprocessor.embedding.name
        )(
            self.tokenizer.text_field.vocab.vectors,
            self.tokenizer.text_field.vocab.stoi[self.tokenizer.text_field.pad_token],
        )

    def preprocess(self, model_config, data_config):
        train_dataset = configmapper.get_object("datasets", data_config.main.name)(
            data_config.train, self.tokenizer
        )
        val_dataset = configmapper.get_object("datasets", data_config.main.name)(
            data_config.val, self.tokenizer
        )
        model = configmapper.get_object("models", model_config.name)(
            self.embeddings, **model_config.params.as_dict()
        )

        return model, train_dataset, val_dataset


@configmapper.map("preprocessors", "clozePreprocessor")
class ClozePreprocessor(Preprocessor):
    """ClozePreprocessor."""

    def __init__(self, config):
        """
        Args:
            config (src.utils.module.Config): configuration for preprocessor
        """
        super(ClozePreprocessor, self).__init__()
        self.config = config
        self.tokenizer = configmapper.get_object(
            "tokenizers", self.config.main.preprocessor.tokenizer.name
        ).from_pretrained(
            **self.config.main.preprocessor.tokenizer.init_params.as_dict()
        )

    def preprocess(self, model_config, data_config):
        train_dataset = configmapper.get_object("datasets", data_config.main.name)(
            data_config.train, self.tokenizer
        )
        val_dataset = configmapper.get_object("datasets", data_config.main.name)(
            data_config.val, self.tokenizer
        )
        model = configmapper.get_object("models", model_config.name).from_pretrained(
            **model_config.params.as_dict()
        )

        return model, train_dataset, val_dataset


@configmapper.map("preprocessors", "transformersConcretenessPreprocessor")
class TransformersConcretenessPreprocessor(Preprocessor):
    """TransformersConcretenessPreprocessor."""

    def __init__(self, config):
        """
        Args:
            config (src.utils.module.Config): configuration for preprocessor
        """
        super(TransformersConcretenessPreprocessor, self).__init__()
        self.config = config
        self.tokenizer = configmapper.get_object(
            "tokenizers", self.config.main.preprocessor.tokenizer.name
        ).from_pretrained(
            **self.config.main.preprocessor.tokenizer.init_params.as_dict()
        )

    def preprocess(self, model_config, data_config):

        train_dataset = configmapper.get_object("datasets", data_config.main.name)(
            data_config.train, self.tokenizer
        )
        val_dataset = configmapper.get_object("datasets", data_config.main.name)(
            data_config.val, self.tokenizer
        )

        model = configmapper.get_object("models", model_config.name)(
            **model_config.params.as_dict()
        )

        return model, train_dataset, val_dataset
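All three preprocessors share the same contract: build a tokenizer from the config, then return (model, train_dataset, val_dataset). A hypothetical wiring sketch; the Config objects would come from the repo's configuration loader (src.utils.configuration) and are assumptions here.

# Hypothetical sketch; `config`, `model_config` and `data_config` are assumed
# to be Config objects loaded elsewhere in the repo.
from src.utils.mapper import configmapper

preprocessor = configmapper.get_object("preprocessors", "glove")(config)
model, train_dataset, val_dataset = preprocessor.preprocess(model_config, data_config)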
src/modules/schedulers.py
ADDED
@@ -0,0 +1,14 @@
from torch.optim.lr_scheduler import (
    StepLR,
    CosineAnnealingLR,
    ReduceLROnPlateau,
    CyclicLR,
    CosineAnnealingWarmRestarts,
)
from src.utils.mapper import configmapper

configmapper.map("schedulers", "step")(StepLR)
configmapper.map("schedulers", "cosineanneal")(CosineAnnealingLR)
configmapper.map("schedulers", "reduceplateau")(ReduceLROnPlateau)
configmapper.map("schedulers", "cyclic")(CyclicLR)
configmapper.map("schedulers", "cosineannealrestart")(CosineAnnealingWarmRestarts)
src/modules/tokenizers.py
ADDED
@@ -0,0 +1,107 @@
"""Contains tokenizers like GloveTokenizers and BERT Tokenizer."""

import torch
from torchtext.vocab import GloVe  # GloVe, Field and TabularDataset are used by GloveTokenizer below
from torchtext.data import Field, TabularDataset
from src.utils.mapper import configmapper
from transformers import AutoTokenizer


class Tokenizer:
    """Abstract Class for Tokenizers."""

    def tokenize(self):
        """Abstract Method for tokenization."""


@configmapper.map("tokenizers", "glove")
class GloveTokenizer(Tokenizer):
    """Implement GloveTokenizer for tokenizing text for Glove Embeddings.

    Attributes:
        embeddings (torchtext.vocab.Vectors): Loaded pre-trained embeddings.
        text_field (torchtext.data.Field): Text_field for vector creation.

    Methods:
        __init__(self, name='840B', dim='300', cache='../embeddings/'): Constructor method
        initialize_vectors(fix_length=4, tokenize='spacy',
            file_path="../data/imperceptibility/Concreteness Ratings/train/forty.csv",
            file_format='tsv', fields=None): Initialize vocab vectors based on data.
        tokenize(x_input, **init_vector__params): Tokenize given input and return the output.
    """

    def __init__(self, name="840B", dim="300", cache="../embeddings/"):
        """Construct GloveTokenizer.

        Args:
            name (str): Name of the GloVe embedding file
            dim (str): Dimensions of the GloVe embedding file
            cache (str): Path to the embeddings directory
        """
        super(GloveTokenizer, self).__init__()
        self.embeddings = GloVe(name=name, dim=dim, cache=cache)
        self.text_field = None

    def initialize_vectors(
        self,
        fix_length=4,
        tokenize="spacy",
        tokenizer_file_paths=None,
        file_format="tsv",
        fields=None,
    ):
        """Initialize words/sequences based on GloVe embedding.

        Args:
            fields (list): The list containing the fields to be taken
                and processed from the file (see documentation for
                torchtext.data.TabularDataset)
            fix_length (int): The length of the tokenized text;
                padding or cropping is done accordingly
            tokenize (function or string): Method to tokenize the data.
                If 'spacy', uses the spacy tokenizer,
                else the specified method.
            tokenizer_file_paths (list of str): The paths of the files containing the data
            file_format (str): The format of the file: 'csv', 'tsv' or 'json'
        """
        text_field = Field(batch_first=True, fix_length=fix_length, tokenize=tokenize)
        tab_dats = [
            TabularDataset(
                i, format=file_format, fields={k: (k, text_field) for k in fields}
            )
            for i in tokenizer_file_paths
        ]
        text_field.build_vocab(*tab_dats)
        text_field.vocab.load_vectors(self.embeddings)
        self.text_field = text_field

    def tokenize(self, x_input, **init_vector__params):
        """Tokenize given input based on initialized vectors.

        Initialize the vectors with given parameters if not already initialized.

        Args:
            x_input (str): Unprocessed input text to be tokenized
            **init_vector__params (keyword arguments): Parameters to initialize vectors

        Returns:
            x_output (torch.Tensor): Processed and tokenized text
        """
        if self.text_field is None:
            self.initialize_vectors(**init_vector__params)
        try:
            x_output = torch.squeeze(
                self.text_field.process([self.text_field.preprocess(x_input)])
            )
        except Exception as e:
            print(x_input)
            print(self.text_field.preprocess(x_input))
            print(e)
        return x_output


@configmapper.map("tokenizers", "AutoTokenizer")
class AutoTokenizer(AutoTokenizer):
    def __init__(self, *args):
        super(AutoTokenizer, self).__init__()
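A hypothetical GloveTokenizer flow, assuming a TSV file with a "text" column; the path and field name are made up, and the GloVe vectors are downloaded into cache by the constructor.

# Hypothetical sketch; "../data/train.tsv" and the "text" field are assumptions.
tokenizer = GloveTokenizer(name="840B", dim="300", cache="../embeddings/")
tokenizer.initialize_vectors(
    fix_length=4,
    tokenize="spacy",
    tokenizer_file_paths=["../data/train.tsv"],
    file_format="tsv",
    fields=["text"],
)
token_ids = tokenizer.tokenize("an example sentence")  # tensor of length fix_length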
src/trainers/__init__.py
ADDED
File without changes
src/trainers/base_trainer.py
ADDED
@@ -0,0 +1,563 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import math
|
2 |
+
import os
|
3 |
+
import torch
|
4 |
+
from src.modules.optimizers import *
|
5 |
+
from src.modules.embeddings import *
|
6 |
+
from src.modules.schedulers import *
|
7 |
+
from src.modules.tokenizers import *
|
8 |
+
from src.modules.metrics import *
|
9 |
+
from src.modules.losses import *
|
10 |
+
from src.utils.misc import *
|
11 |
+
from src.utils.logger import Logger
|
12 |
+
from src.utils.mapper import configmapper
|
13 |
+
from src.utils.configuration import Config
|
14 |
+
|
15 |
+
from torch.utils.data import DataLoader
|
16 |
+
from tqdm import tqdm
|
17 |
+
|
18 |
+
|
19 |
+
@configmapper.map("trainers", "base")
|
20 |
+
class BaseTrainer:
|
21 |
+
def __init__(self, config):
|
22 |
+
self._config = config
|
23 |
+
self.metrics = {
|
24 |
+
configmapper.get_object("metrics", metric["type"]): metric["params"]
|
25 |
+
for metric in self._config.main_config.metrics
|
26 |
+
}
|
27 |
+
self.train_config = self._config.train
|
28 |
+
self.val_config = self._config.val
|
29 |
+
self.log_label = self.train_config.log.log_label
|
30 |
+
if self.train_config.log_and_val_interval is not None:
|
31 |
+
self.val_log_together = True
|
32 |
+
print("Logging with label: ", self.log_label)
|
33 |
+
|
34 |
+
def train(self, model, train_dataset, val_dataset=None, logger=None):
|
35 |
+
device = torch.device(self._config.main_config.device.name)
|
36 |
+
model.to(device)
|
37 |
+
optim_params = self.train_config.optimizer.params
|
38 |
+
if optim_params:
|
39 |
+
optimizer = configmapper.get_object(
|
40 |
+
"optimizers", self.train_config.optimizer.type
|
41 |
+
)(model.parameters(), **optim_params.as_dict())
|
42 |
+
else:
|
43 |
+
optimizer = configmapper.get_object(
|
44 |
+
"optimizers", self.train_config.optimizer.type
|
45 |
+
)(model.parameters())
|
46 |
+
|
47 |
+
if self.train_config.scheduler is not None:
|
48 |
+
scheduler_params = self.train_config.scheduler.params
|
49 |
+
if scheduler_params:
|
50 |
+
scheduler = configmapper.get_object(
|
51 |
+
"schedulers", self.train_config.scheduler.type
|
52 |
+
)(optimizer, **scheduler_params.as_dict())
|
53 |
+
else:
|
54 |
+
scheduler = configmapper.get_object(
|
55 |
+
"schedulers", self.train_config.scheduler.type
|
56 |
+
)(optimizer)
|
57 |
+
|
58 |
+
criterion_params = self.train_config.criterion.params
|
59 |
+
if criterion_params:
|
60 |
+
criterion = configmapper.get_object(
|
61 |
+
"losses", self.train_config.criterion.type
|
62 |
+
)(**criterion_params.as_dict())
|
63 |
+
else:
|
64 |
+
criterion = configmapper.get_object(
|
65 |
+
"losses", self.train_config.criterion.type
|
66 |
+
)()
|
67 |
+
if "custom_collate_fn" in dir(train_dataset):
|
68 |
+
train_loader = DataLoader(
|
69 |
+
dataset=train_dataset,
|
70 |
+
collate_fn=train_dataset.custom_collate_fn,
|
71 |
+
**self.train_config.loader_params.as_dict(),
|
72 |
+
)
|
73 |
+
else:
|
74 |
+
train_loader = DataLoader(
|
75 |
+
dataset=train_dataset, **self.train_config.loader_params.as_dict()
|
76 |
+
)
|
77 |
+
# train_logger = Logger(**self.train_config.log.logger_params.as_dict())
|
78 |
+
|
79 |
+
max_epochs = self.train_config.max_epochs
|
80 |
+
batch_size = self.train_config.loader_params.batch_size
|
81 |
+
|
82 |
+
if self.val_log_together:
|
83 |
+
val_interval = self.train_config.log_and_val_interval
|
84 |
+
log_interval = val_interval
|
85 |
+
else:
|
86 |
+
val_interval = self.train_config.val_interval
|
87 |
+
log_interval = self.train_config.log.log_interval
|
88 |
+
|
89 |
+
if logger is None:
|
90 |
+
train_logger = Logger(**self.train_config.log.logger_params.as_dict())
|
91 |
+
else:
|
92 |
+
train_logger = logger
|
93 |
+
|
94 |
+
train_log_values = self.train_config.log.values.as_dict()
|
95 |
+
|
96 |
+
best_score = (
|
97 |
+
-math.inf if self.train_config.save_on.desired == "max" else math.inf
|
98 |
+
)
|
99 |
+
save_on_score = self.train_config.save_on.score
|
100 |
+
best_step = -1
|
101 |
+
best_model = None
|
102 |
+
|
103 |
+
best_hparam_list = None
|
104 |
+
best_hparam_name_list = None
|
105 |
+
best_metrics_list = None
|
106 |
+
best_metrics_name_list = None
|
107 |
+
|
108 |
+
# print("\nTraining\n")
|
109 |
+
# print(max_steps)
|
110 |
+
|
111 |
+
global_step = 0
|
112 |
+
for epoch in range(1, max_epochs + 1):
|
113 |
+
print(
|
114 |
+
"Epoch: {}/{}, Global Step: {}".format(epoch, max_epochs, global_step)
|
115 |
+
)
|
116 |
+
train_loss = 0
|
117 |
+
val_loss = 0
|
118 |
+
|
119 |
+
if(self.train_config.label_type=='float'):
|
120 |
+
all_labels = torch.FloatTensor().to(device)
|
121 |
+
else:
|
122 |
+
all_labels = torch.LongTensor().to(device)
|
123 |
+
|
124 |
+
all_outputs = torch.Tensor().to(device)
|
125 |
+
|
126 |
+
train_scores = None
|
127 |
+
val_scores = None
|
128 |
+
|
129 |
+
pbar = tqdm(total=math.ceil(len(train_dataset) / batch_size))
|
130 |
+
pbar.set_description("Epoch " + str(epoch))
|
131 |
+
|
132 |
+
val_counter = 0
|
133 |
+
|
134 |
+
for step, batch in enumerate(train_loader):
|
135 |
+
model.train()
|
136 |
+
optimizer.zero_grad()
|
137 |
+
inputs, labels = batch
|
138 |
+
|
139 |
+
if(self.train_config.label_type=='float'): ##Specific to Float Type
|
140 |
+
labels = labels.float()
|
141 |
+
|
142 |
+
for key in inputs:
|
143 |
+
inputs[key] = inputs[key].to(device)
|
144 |
+
labels = labels.to(device)
|
145 |
+
outputs = model(inputs)
|
146 |
+
loss = criterion(torch.squeeze(outputs), labels)
|
147 |
+
loss.backward()
|
148 |
+
|
149 |
+
all_labels = torch.cat((all_labels, labels), 0)
|
150 |
+
|
151 |
+
if (self.train_config.label_type=='float'):
|
152 |
+
all_outputs = torch.cat((all_outputs, outputs), 0)
|
153 |
+
else:
|
154 |
+
all_outputs = torch.cat((all_outputs, torch.argmax(outputs, axis=1)), 0)
|
155 |
+
|
156 |
+
|
157 |
+
train_loss += loss.item()
|
158 |
+
optimizer.step()
|
159 |
+
|
160 |
+
if self.train_config.scheduler is not None:
|
161 |
+
if isinstance(scheduler, ReduceLROnPlateau):
|
162 |
+
scheduler.step(train_loss / (step + 1))
|
163 |
+
else:
|
164 |
+
scheduler.step()
|
165 |
+
|
166 |
+
# print(train_loss)
|
167 |
+
# print(step+1)
|
168 |
+
|
169 |
+
pbar.set_postfix_str(f"Train Loss: {train_loss /(step+1)}")
|
170 |
+
pbar.update(1)
|
171 |
+
|
172 |
+
global_step += 1
|
173 |
+
|
174 |
+
# Need to check if we want global_step or local_step
|
175 |
+
|
176 |
+
if val_dataset is not None and (global_step - 1) % val_interval == 0:
|
177 |
+
# print("\nEvaluating\n")
|
178 |
+
val_scores = self.val(
|
179 |
+
model,
|
180 |
+
val_dataset,
|
181 |
+
criterion,
|
182 |
+
device,
|
183 |
+
global_step,
|
184 |
+
train_logger,
|
185 |
+
train_log_values,
|
186 |
+
)
|
187 |
+
|
188 |
+
#save_flag = 0
|
189 |
+
if self.train_config.save_on is not None:
|
190 |
+
|
191 |
+
## BEST SCORES UPDATING
|
192 |
+
|
193 |
+
train_scores = self.get_scores(
|
194 |
+
train_loss,
|
195 |
+
global_step,
|
196 |
+
self.train_config.criterion.type,
|
197 |
+
all_outputs,
|
198 |
+
all_labels,
|
199 |
+
)
|
200 |
+
|
201 |
+
best_score, best_step, save_flag = self.check_best(
|
202 |
+
val_scores, save_on_score, best_score, global_step
|
203 |
+
)
|
204 |
+
|
205 |
+
store_dict = {
|
206 |
+
"model_state_dict": model.state_dict(),
|
207 |
+
"best_step": best_step,
|
208 |
+
"best_score": best_score,
|
209 |
+
"save_on_score": save_on_score,
|
210 |
+
}
|
211 |
+
|
212 |
+
path = self.train_config.save_on.best_path.format(
|
213 |
+
self.log_label
|
214 |
+
)
|
215 |
+
|
216 |
+
self.save(store_dict, path, save_flag)
|
217 |
+
|
218 |
+
if save_flag and train_log_values["hparams"] is not None:
|
219 |
+
(
|
220 |
+
best_hparam_list,
|
221 |
+
best_hparam_name_list,
|
222 |
+
best_metrics_list,
|
223 |
+
best_metrics_name_list,
|
224 |
+
) = self.update_hparams(
|
225 |
+
train_scores, val_scores, desc="best_val"
|
226 |
+
)
|
227 |
+
# pbar.close()
|
228 |
+
if (global_step - 1) % log_interval == 0:
|
229 |
+
# print("\nLogging\n")
|
230 |
+
train_loss_name = self.train_config.criterion.type
|
231 |
+
metric_list = [
|
232 |
+
metric(all_labels.cpu(), all_outputs.detach().cpu(), **self.metrics[metric])
|
233 |
+
for metric in self.metrics
|
234 |
+
]
|
235 |
+
metric_name_list = [
|
236 |
+
metric['type'] for metric in self._config.main_config.metrics
|
237 |
+
]
|
238 |
+
|
239 |
+
train_scores = self.log(
|
240 |
+
train_loss / (step + 1),
|
241 |
+
train_loss_name,
|
242 |
+
metric_list,
|
243 |
+
metric_name_list,
|
244 |
+
train_logger,
|
245 |
+
train_log_values,
|
246 |
+
global_step,
|
247 |
+
append_text=self.train_config.append_text,
|
248 |
+
)
|
249 |
+
pbar.close()
|
250 |
+
if not os.path.exists(self.train_config.checkpoint.checkpoint_dir):
|
251 |
+
os.makedirs(self.train_config.checkpoint.checkpoint_dir)
|
252 |
+
|
253 |
+
if self.train_config.save_after_epoch:
|
254 |
+
store_dict = {
|
255 |
+
"model_state_dict": model.state_dict(),
|
256 |
+
}
|
257 |
+
|
258 |
+
path = f"{self.train_config.checkpoint.checkpoint_dir}_{str(self.train_config.log.log_label)}_{str(epoch)}.pth"
|
259 |
+
|
260 |
+
self.save(store_dict, path, save_flag=1)
|
261 |
+
|
262 |
+
if epoch == max_epochs:
|
263 |
+
# print("\nEvaluating\n")
|
264 |
+
val_scores = self.val(
|
265 |
+
model,
|
266 |
+
val_dataset,
|
267 |
+
criterion,
|
268 |
+
device,
|
269 |
+
global_step,
|
270 |
+
train_logger,
|
271 |
+
train_log_values,
|
272 |
+
)
|
273 |
+
|
274 |
+
# print("\nLogging\n")
|
275 |
+
train_loss_name = self.train_config.criterion.type
|
276 |
+
metric_list = [
|
277 |
+
metric(all_labels.cpu(), all_outputs.detach().cpu(),**self.metrics[metric])
|
278 |
+
for metric in self.metrics
|
279 |
+
]
|
280 |
+
metric_name_list = [metric['type'] for metric in self._config.main_config.metrics]
|
281 |
+
|
282 |
+
train_scores = self.log(
|
283 |
+
train_loss / len(train_loader),
|
284 |
+
train_loss_name,
|
285 |
+
metric_list,
|
286 |
+
metric_name_list,
|
287 |
+
train_logger,
|
288 |
+
train_log_values,
|
289 |
+
global_step,
|
290 |
+
append_text=self.train_config.append_text,
|
291 |
+
)
|
292 |
+
|
293 |
+
if self.train_config.save_on is not None:
|
294 |
+
|
295 |
+
## BEST SCORES UPDATING
|
296 |
+
|
297 |
+
train_scores = self.get_scores(
|
298 |
+
train_loss,
|
299 |
+
len(train_loader),
|
300 |
+
self.train_config.criterion.type,
|
301 |
+
all_outputs,
|
302 |
+
all_labels,
|
303 |
+
)
|
304 |
+
|
305 |
+
best_score, best_step, save_flag = self.check_best(
|
306 |
+
val_scores, save_on_score, best_score, global_step
|
307 |
+
)
|
308 |
+
|
309 |
+
store_dict = {
|
310 |
+
"model_state_dict": model.state_dict(),
|
311 |
+
"best_step": best_step,
|
312 |
+
"best_score": best_score,
|
313 |
+
"save_on_score": save_on_score,
|
314 |
+
}
|
315 |
+
|
316 |
+
path = self.train_config.save_on.best_path.format(self.log_label)
|
317 |
+
|
318 |
+
self.save(store_dict, path, save_flag)
|
319 |
+
|
320 |
+
if save_flag and train_log_values["hparams"] is not None:
|
321 |
+
(
|
322 |
+
best_hparam_list,
|
323 |
+
best_hparam_name_list,
|
324 |
+
best_metrics_list,
|
325 |
+
best_metrics_name_list,
|
326 |
+
) = self.update_hparams(train_scores, val_scores, desc="best_val")
|
327 |
+
|
328 |
+
## FINAL SCORES UPDATING + STORING
|
329 |
+
train_scores = self.get_scores(
|
330 |
+
train_loss,
|
331 |
+
len(train_loader),
|
332 |
+
self.train_config.criterion.type,
|
333 |
+
all_outputs,
|
334 |
+
all_labels,
|
335 |
+
)
|
336 |
+
|
337 |
+
store_dict = {
|
338 |
+
"model_state_dict": model.state_dict(),
|
339 |
+
"final_step": global_step,
|
340 |
+
"final_score": train_scores[save_on_score],
|
341 |
+
"save_on_score": save_on_score,
|
342 |
+
}
|
343 |
+
|
344 |
+
path = self.train_config.save_on.final_path.format(self.log_label)
|
345 |
+
|
346 |
+
self.save(store_dict, path, save_flag=1)
|
347 |
+
if train_log_values["hparams"] is not None:
|
348 |
+
(
|
349 |
+
final_hparam_list,
|
350 |
+
final_hparam_name_list,
|
351 |
+
final_metrics_list,
|
352 |
+
final_metrics_name_list,
|
353 |
+
) = self.update_hparams(train_scores, val_scores, desc="final")
|
354 |
+
train_logger.save_hyperparams(
|
355 |
+
best_hparam_list,
|
356 |
+
best_hparam_name_list,
|
357 |
+
[int(self.log_label),] + best_metrics_list + final_metrics_list,
|
358 |
+
["hparams/log_label",]
|
359 |
+
+ best_metrics_name_list
|
360 |
+
+ final_metrics_name_list,
|
361 |
+
)
|
362 |
+
#
|
363 |
+
|
364 |
+
## Need to check if we want same loggers of different loggers for train and eval
|
365 |
+
## Evaluate
|
366 |
+
|
367 |
+
def get_scores(self, loss, divisor, loss_name, all_outputs, all_labels):
|
368 |
+
|
369 |
+
avg_loss = loss / divisor
|
370 |
+
|
371 |
+
metric_list = [
|
372 |
+
metric(all_labels.cpu(), all_outputs.detach().cpu(), **self.metrics[metric])
|
373 |
+
for metric in self.metrics
|
374 |
+
]
|
375 |
+
metric_name_list = [metric['type'] for metric in self._config.main_config.metrics]
|
376 |
+
|
377 |
+
return dict(zip([loss_name,] + metric_name_list, [avg_loss,] + metric_list,))
|
378 |
+
|
379 |
+
def check_best(self, val_scores, save_on_score, best_score, global_step):
|
380 |
+
save_flag = 0
|
381 |
+
best_step = global_step
|
382 |
+
if self.train_config.save_on.desired == "min":
|
383 |
+
if val_scores[save_on_score] < best_score:
|
384 |
+
save_flag = 1
|
385 |
+
best_score = val_scores[save_on_score]
|
386 |
+
best_step = global_step
|
387 |
+
else:
|
388 |
+
if val_scores[save_on_score] > best_score:
|
389 |
+
save_flag = 1
|
390 |
+
best_score = val_scores[save_on_score]
|
391 |
+
best_step = global_step
|
392 |
+
return best_score, best_step, save_flag
|
393 |
+
|
394 |
+
def update_hparams(self, train_scores, val_scores, desc):
|
395 |
+
hparam_list = []
|
396 |
+
hparam_name_list = []
|
397 |
+
for hparam in self.train_config.log.values.hparams:
|
398 |
+
hparam_list.append(get_item_in_config(self._config, hparam["path"]))
|
399 |
+
if isinstance(hparam_list[-1], Config):
|
400 |
+
hparam_list[-1] = hparam_list[-1].as_dict()
|
401 |
+
hparam_name_list.append(hparam["name"])
|
402 |
+
|
403 |
+
val_keys, val_values = zip(*val_scores.items())
|
404 |
+
train_keys, train_values = zip(*train_scores.items())
|
405 |
+
val_keys = list(val_keys)
|
406 |
+
train_keys = list(train_keys)
|
407 |
+
val_values = list(val_values)
|
408 |
+
train_values = list(train_values)
|
409 |
+
for i, key in enumerate(val_keys):
|
410 |
+
val_keys[i] = f"hparams/{desc}_val_" + val_keys[i]
|
411 |
+
for i, key in enumerate(train_keys):
|
412 |
+
train_keys[i] = f"hparams/{desc}_train_" + train_keys[i]
|
413 |
+
# train_logger.save_hyperparams(hparam_list, hparam_name_list,train_values+val_values,train_keys+val_keys, )
|
414 |
+
return (
|
415 |
+
hparam_list,
|
416 |
+
hparam_name_list,
|
417 |
+
train_values + val_values,
|
418 |
+
train_keys + val_keys,
|
419 |
+
)
|
420 |
+
|
421 |
+
def save(self, store_dict, path, save_flag=0):
|
422 |
+
if save_flag:
|
423 |
+
dirs = "/".join(path.split("/")[:-1])
|
424 |
+
if not os.path.exists(dirs):
|
425 |
+
os.makedirs(dirs)
|
426 |
+
torch.save(store_dict, path)
|
427 |
+
|
    def log(
        self,
        loss,
        loss_name,
        metric_list,
        metric_name_list,
        logger,
        log_values,
        global_step,
        append_text,
    ):

        return_dic = dict(zip([loss_name,] + metric_name_list, [loss,] + metric_list,))

        loss_name = f"{append_text}_{self.log_label}_{loss_name}"
        if log_values["loss"]:
            logger.save_params(
                [loss],
                [loss_name],
                combine=True,
                combine_name="losses",
                global_step=global_step,
            )

        for i in range(len(metric_name_list)):
            metric_name_list[
                i
            ] = f"{append_text}_{self.log_label}_{metric_name_list[i]}"
        if log_values["metrics"]:
            logger.save_params(
                metric_list,
                metric_name_list,
                combine=True,
                combine_name="metrics",
                global_step=global_step,
            )
        # print(hparams_list)
        # print(hparam_name_list)

        # for k,v in dict(zip([loss_name],[loss])).items():
        #     print(f"{k}:{v}")
        # for k,v in dict(zip(metric_name_list,metric_list)).items():
        #     print(f"{k}:{v}")
        return return_dic

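    # Runs a full pass over the validation set with gradients disabled,
    # accumulates outputs and labels, and reports the averaged loss plus the
    # configured metrics, optionally logging them through self.log.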
    def val(
        self,
        model,
        dataset,
        criterion,
        device,
        global_step,
        train_logger=None,
        train_log_values=None,
        log=True,
    ):
        append_text = self.val_config.append_text
        if train_logger is not None:
            val_logger = train_logger
        else:
            val_logger = Logger(**self.val_config.log.logger_params.as_dict())

        if train_log_values is not None:
            val_log_values = train_log_values
        else:
            val_log_values = self.val_config.log.values.as_dict()
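        # Use the dataset's own collate function when it provides one
        # (e.g. for padding variable-length token sequences).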
        if "custom_collate_fn" in dir(dataset):
            val_loader = DataLoader(
                dataset=dataset,
                collate_fn=dataset.custom_collate_fn,
                **self.val_config.loader_params.as_dict(),
            )
        else:
            val_loader = DataLoader(
                dataset=dataset, **self.val_config.loader_params.as_dict()
            )

        all_outputs = torch.Tensor().to(device)
        if self.train_config.label_type == 'float':
            all_labels = torch.FloatTensor().to(device)
        else:
            all_labels = torch.LongTensor().to(device)

        batch_size = self.val_config.loader_params.batch_size

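        # Evaluation loop: no gradient tracking, model in eval mode; predictions
        # are concatenated across batches (raw outputs for float labels,
        # argmax over the class dimension otherwise).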
        with torch.no_grad():
            model.eval()
            val_loss = 0
            for j, batch in enumerate(val_loader):

                inputs, labels = batch

                if self.train_config.label_type == 'float':
                    labels = labels.float()

                for key in inputs:
                    inputs[key] = inputs[key].to(device)
                labels = labels.to(device)

                outputs = model(inputs)
                loss = criterion(torch.squeeze(outputs), labels)
                val_loss += loss.item()

                all_labels = torch.cat((all_labels, labels), 0)

                if self.train_config.label_type == 'float':
                    all_outputs = torch.cat((all_outputs, outputs), 0)
                else:
                    all_outputs = torch.cat((all_outputs, torch.argmax(outputs, axis=1)), 0)

            val_loss = val_loss / len(val_loader)

            val_loss_name = self.train_config.criterion.type

            # print(all_outputs, all_labels)
            metric_list = [
                metric(all_labels.cpu(), all_outputs.detach().cpu(), **self.metrics[metric])
                for metric in self.metrics
            ]
            metric_name_list = [metric['type'] for metric in self._config.main_config.metrics]
            return_dic = dict(
                zip([val_loss_name,] + metric_name_list, [val_loss,] + metric_list,)
            )
            if log:
                val_scores = self.log(
                    val_loss,
                    val_loss_name,
                    metric_list,
                    metric_name_list,
                    val_logger,
                    val_log_values,
                    global_step,
                    append_text,
                )
                return val_scores
            return return_dic

src/utils/__init__.py ADDED
File without changes

src/utils/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (164 Bytes)