"""Contains tokenizers like GloveTokenizer and AutoTokenizer (e.g., for BERT)."""

import torch

# Note: Field and TabularDataset moved to torchtext.legacy.data in torchtext 0.9
# and were removed in later releases; this module assumes an older torchtext.
from torchtext.data import Field, TabularDataset
from torchtext.vocab import GloVe
from transformers import AutoTokenizer

from src.utils.mapper import configmapper


class Tokenizer:
    """Abstract Class for Tokenizers."""

    def tokenize(self):
        """Abstract Method for tokenization."""


@configmapper.map("tokenizers", "glove")
class GloveTokenizer(Tokenizer):
    """Implement GloveTokenizer for tokenizing text for GloVe embeddings.

    Attributes:
        embeddings (torchtext.vocab.Vectors): Loaded pre-trained embeddings.
        text_field (torchtext.data.Field): Text field for vector creation.

    Methods:
        __init__(self, name='840B', dim='300', cache='../embeddings/'): Constructor method.
        initialize_vectors(fix_length=4, tokenize='spacy', tokenizer_file_paths=None,
            file_format='tsv', fields=None): Initialize vocab vectors based on data.
        tokenize(x_input, **init_vector_params): Tokenize given input and return the output.
    """

    def __init__(self, name="840B", dim="300", cache="../embeddings/"):
        """Construct GloveTokenizer.

        Args:
            name (str): Name of the GloVe embedding file
            dim (str): Dimensions of the GloVe embedding file
            cache (str): Path to the embeddings directory
        """
        super(GloveTokenizer, self).__init__()
        self.embeddings = GloVe(name=name, dim=dim, cache=cache)
        self.text_field = None

    def initialize_vectors(
        self,
        fix_length=4,
        tokenize="spacy",
        tokenizer_file_paths=None,
        file_format="tsv",
        fields=None,
    ):
        """Initialize words/sequences based on GloVe embeddings.

        Args:
            fix_length (int): The length of the tokenized text;
                padding or cropping is done accordingly
            tokenize (function or str): Method to tokenize the data.
                If 'spacy', uses the spacy tokenizer,
                else the specified method.
            tokenizer_file_paths (list of str): The paths of the files containing the data
            file_format (str): The format of the files: 'csv', 'tsv' or 'json'
            fields (list): The list containing the fields to be taken
                and processed from the file (see documentation for
                torchtext.data.TabularDataset)
        """
        text_field = Field(batch_first=True, fix_length=fix_length, tokenize=tokenize)
        tab_dats = [
            TabularDataset(
                path, format=file_format, fields={k: (k, text_field) for k in fields}
            )
            for path in tokenizer_file_paths
        ]
        # Build the vocabulary from the datasets, then load the GloVe vectors for it.
        text_field.build_vocab(*tab_dats)
        text_field.vocab.load_vectors(self.embeddings)
        self.text_field = text_field

    def tokenize(self, x_input, **init_vector_params):
        """Tokenize given input based on initialized vectors.

        Initialize the vectors with the given parameters if not already initialized.

        Args:
            x_input (str): Unprocessed input text to be tokenized
            **init_vector_params: Keyword arguments passed to initialize_vectors

        Returns:
            x_output (torch.Tensor): Processed and tokenized text as token indices
        """
        if self.text_field is None:
            self.initialize_vectors(**init_vector_params)
        try:
            x_output = torch.squeeze(
                self.text_field.process([self.text_field.preprocess(x_input)])
            )
        except Exception as e:
            # Log the offending input for debugging, then re-raise so the caller
            # does not receive an undefined result.
            print(x_input)
            print(self.text_field.preprocess(x_input))
            print(e)
            raise
        return x_output
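
# Example usage of GloveTokenizer (illustrative sketch; the file path and the
# "text" field name are hypothetical and depend on the project's data layout):
#
#   glove_tokenizer = GloveTokenizer(name="840B", dim="300", cache="../embeddings/")
#   glove_tokenizer.initialize_vectors(
#       fix_length=4,
#       tokenize="spacy",
#       tokenizer_file_paths=["../data/train.tsv"],
#       file_format="tsv",
#       fields=["text"],
#   )
#   token_ids = glove_tokenizer.tokenize("a short example sentence")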


@configmapper.map("tokenizers", "AutoTokenizer")
class AutoTokenizer(AutoTokenizer):
    """Thin wrapper that registers transformers.AutoTokenizer with configmapper.

    Note: transformers.AutoTokenizer is not constructed directly; instances are
    created through the inherited ``from_pretrained`` classmethod.
    """
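

# Example usage of the registered AutoTokenizer (illustrative sketch; the
# checkpoint name is an assumption):
#
#   tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
#   encoded = tokenizer("a short example sentence", return_tensors="pt")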