"""Contains tokenizers like GloveTokenizer and AutoTokenizer (e.g., for BERT)."""

import torch

# Note: Field and TabularDataset moved to torchtext.legacy.data in torchtext 0.9
# and were removed in later releases; this module assumes an older torchtext.
from torchtext.data import Field, TabularDataset
from torchtext.vocab import GloVe
from transformers import AutoTokenizer

from src.utils.mapper import configmapper


class Tokenizer:
    """Abstract Class for Tokenizers."""

    def tokenize(self):
        """Abstract Method for tokenization."""


@configmapper.map("tokenizers", "glove")
class GloveTokenizer(Tokenizer):
    """Implement GloveTokenizer for tokenizing text for GloVe embeddings.

    Attributes:
        embeddings (torchtext.vocab.Vectors): Loaded pre-trained embeddings.
        text_field (torchtext.data.Field): Text field for vector creation.

    Methods:
        __init__(self, name='840B', dim='300', cache='../embeddings/'): Constructor method.
        initialize_vectors(fix_length=4, tokenize='spacy', tokenizer_file_paths=None,
            file_format='tsv', fields=None): Initialize vocab vectors based on data.
        tokenize(x_input, **init_vector_params): Tokenize given input and return the output.
    """

    def __init__(self, name="840B", dim="300", cache="../embeddings/"):
        """Construct GloveTokenizer.

        Args:
            name (str): Name of the GloVe embedding file
            dim (str): Dimensions of the GloVe embedding file
            cache (str): Path to the embeddings directory
        """
        super(GloveTokenizer, self).__init__()
        self.embeddings = GloVe(name=name, dim=dim, cache=cache)
        self.text_field = None

    def initialize_vectors(
        self,
        fix_length=4,
        tokenize="spacy",
        tokenizer_file_paths=None,
        file_format="tsv",
        fields=None,
    ):
        """Initialize words/sequences based on GloVe embeddings.

        Args:
            fix_length (int): The length of the tokenized text;
                padding or cropping is done accordingly
            tokenize (function or str): Method to tokenize the data.
                If 'spacy', uses the spacy tokenizer,
                else the specified method.
            tokenizer_file_paths (list of str): The paths of the files containing the data
            file_format (str): The format of the files: 'csv', 'tsv' or 'json'
            fields (list): The list containing the fields to be taken
                and processed from the file (see documentation for
                torchtext.data.TabularDataset)
        """
        text_field = Field(batch_first=True, fix_length=fix_length, tokenize=tokenize)
        tab_dats = [
            TabularDataset(
                path, format=file_format, fields={k: (k, text_field) for k in fields}
            )
            for path in tokenizer_file_paths
        ]
        # Build the vocabulary from the datasets, then load the GloVe vectors for it.
        text_field.build_vocab(*tab_dats)
        text_field.vocab.load_vectors(self.embeddings)
        self.text_field = text_field

    def tokenize(self, x_input, **init_vector_params):
        """Tokenize given input based on initialized vectors.

        Initialize the vectors with the given parameters if not already initialized.

        Args:
            x_input (str): Unprocessed input text to be tokenized
            **init_vector_params: Keyword arguments passed to initialize_vectors

        Returns:
            x_output (torch.Tensor): Processed and tokenized text as token indices
        """
        if self.text_field is None:
            self.initialize_vectors(**init_vector_params)
        try:
            x_output = torch.squeeze(
                self.text_field.process([self.text_field.preprocess(x_input)])
            )
        except Exception as e:
            # Log the offending input for debugging, then re-raise so the caller
            # does not receive an undefined result.
            print(x_input)
            print(self.text_field.preprocess(x_input))
            print(e)
            raise
        return x_output
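
# Example usage of GloveTokenizer (illustrative sketch; the file path and the
# "text" field name are hypothetical and depend on the project's data layout):
#
#   glove_tokenizer = GloveTokenizer(name="840B", dim="300", cache="../embeddings/")
#   glove_tokenizer.initialize_vectors(
#       fix_length=4,
#       tokenize="spacy",
#       tokenizer_file_paths=["../data/train.tsv"],
#       file_format="tsv",
#       fields=["text"],
#   )
#   token_ids = glove_tokenizer.tokenize("a short example sentence")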


@configmapper.map("tokenizers", "AutoTokenizer")
class AutoTokenizer(AutoTokenizer):
    """Thin wrapper that registers transformers.AutoTokenizer with configmapper.

    Note: transformers.AutoTokenizer is not constructed directly; instances are
    created through the inherited ``from_pretrained`` classmethod.
    """
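

# Example usage of the registered AutoTokenizer (illustrative sketch; the
# checkpoint name is an assumption):
#
#   tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
#   encoded = tokenizer("a short example sentence", return_tensors="pt")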