"""Contains tokenizers like GloveTokenizers and BERT Tokenizer."""
import torch
from torchtext.vocab import GloVe
from torchtext.data import Field, TabularDataset  # legacy torchtext API (torchtext < 0.9, or torchtext.legacy later)
from transformers import AutoTokenizer

from src.utils.mapper import configmapper

class Tokenizer:
    """Abstract base class for tokenizers."""

    def tokenize(self):
        """Tokenize input; must be implemented by subclasses."""
        raise NotImplementedError

@configmapper.map("tokenizers", "glove")
class GloveTokenizer(Tokenizer):
"""Implement GloveTokenizer for tokenizing text for Glove Embeddings.
Attributes:
embeddings (torchtext.vocab.Vectors): Loaded pre-trained embeddings.
text_field (torchtext.data.Field): Text_field for vector creation.
Methods:
__init__(self, name='840B', dim='300', cache='../embeddings/') : Constructor method
initialize_vectors(fix_length=4, tokenize='spacy', file_path="../data/imperceptibility
/Concreteness Ratings/train/forty.csv",
file_format='tsv', fields=None): Initialize vocab vectors based on data.
tokenize(x_input, **initializer_params): Tokenize given input and return the output.
"""
def __init__(self, name="840B", dim="300", cache="../embeddings/"):
"""Construct GloveTokenizer.
Args:
name (str): Name of the GloVe embedding file
dim (str): Dimensions of the Glove embedding file
cache (str): Path to the embeddings directory
"""
super(GloveTokenizer, self).__init__()
self.embeddings = GloVe(name=name, dim=dim, cache=cache)
self.text_field = None
def initialize_vectors(
self,
fix_length=4,
tokenize="spacy",
tokenizer_file_paths=None,
file_format="tsv",
fields=None,
):
"""Initialize words/sequences based on GloVe embedding.
Args:
fields (list): The list containing the fields to be taken
and processed from the file (see documentation for
torchtext.data.TabularDataset)
fix_length (int): The length of the tokenized text,
padding or cropping is done accordingly
tokenize (function or string): Method to tokenize the data.
If 'spacy' uses spacy tokenizer,
else the specified method.
tokenizer_file_paths (list of str): The paths of the files containing the data
format (str): The format of the file : 'csv', 'tsv' or 'json'
"""
        # A single Field handles tokenization, padding/cropping and numericalization.
        text_field = Field(batch_first=True, fix_length=fix_length, tokenize=tokenize)
        tab_dats = [
            TabularDataset(
                path, format=file_format, fields={k: (k, text_field) for k in fields}
            )
            for path in tokenizer_file_paths
        ]
        # Build the vocabulary over all datasets, then attach the GloVe vectors to it.
        text_field.build_vocab(*tab_dats)
        text_field.vocab.load_vectors(self.embeddings)
        self.text_field = text_field

    def tokenize(self, x_input, **init_vector_params):
        """Tokenize the given input based on the initialized vectors.

        Initializes the vectors with the given parameters if not already initialized.

        Args:
            x_input (str): Unprocessed input text to be tokenized.
            **init_vector_params: Keyword arguments passed to initialize_vectors().

        Returns:
            x_output (torch.Tensor): Tensor of token indices of length fix_length.
        """
        if self.text_field is None:
            self.initialize_vectors(**init_vector_params)
        try:
            x_output = torch.squeeze(
                self.text_field.process([self.text_field.preprocess(x_input)])
            )
        except Exception as e:
            # Log the offending input before re-raising so failures are easy to trace.
            print(x_input)
            print(self.text_field.preprocess(x_input))
            print(e)
            raise
        return x_output
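
# Illustrative usage sketch for GloveTokenizer. The file path and field name
# below are hypothetical placeholders, not files shipped with this repository:
#
#     glove_tokenizer = GloveTokenizer(name="840B", dim="300", cache="../embeddings/")
#     glove_tokenizer.initialize_vectors(
#         fix_length=4,
#         tokenize="spacy",
#         tokenizer_file_paths=["../data/train.tsv"],
#         file_format="tsv",
#         fields=["text"],
#     )
#     token_ids = glove_tokenizer.tokenize("a short example sentence")
#     # token_ids is a tensor of shape (fix_length,) holding vocabulary indices.
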
@configmapper.map("tokenizers", "AutoTokenizer")
class AutoTokenizer(AutoTokenizer):
def __init__(self, *args):
super(AutoTokenizer, self).__init__()
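

# Minimal, self-contained demo of the AutoTokenizer wrapper above. The
# "bert-base-uncased" checkpoint is an illustrative assumption (any Hub
# checkpoint works) and running this requires downloading its tokenizer files.
if __name__ == "__main__":
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
    encoded = tokenizer("a short example sentence", truncation=True)
    print(encoded["input_ids"])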