""" Augmenter Recipes: =================== Transformations and constraints can be used for simple NLP data augmentations. Here is a list of recipes for NLP data augmentations """ import random from textattack.constraints.pre_transformation import ( RepeatModification, StopwordModification, ) from textattack.constraints.semantics.sentence_encoders import UniversalSentenceEncoder from . import Augmenter DEFAULT_CONSTRAINTS = [RepeatModification(), StopwordModification()] class EasyDataAugmenter(Augmenter): """An implementation of Easy Data Augmentation, which combines: - WordNet synonym replacement - Randomly replace words with their synonyms. - Word deletion - Randomly remove words from the sentence. - Word order swaps - Randomly swap the position of words in the sentence. - Random synonym insertion - Insert a random synonym of a random word at a random location. in one augmentation method. "EDA: Easy Data Augmentation Techniques for Boosting Performance on Text Classification Tasks" (Wei and Zou, 2019) https://arxiv.org/abs/1901.11196 """ def __init__(self, pct_words_to_swap=0.1, transformations_per_example=4): assert 0.0 <= pct_words_to_swap <= 1.0, "pct_words_to_swap must be in [0., 1.]" assert ( transformations_per_example > 0 ), "transformations_per_example must be a positive integer" self.pct_words_to_swap = pct_words_to_swap self.transformations_per_example = transformations_per_example n_aug_each = max(transformations_per_example // 4, 1) self.synonym_replacement = WordNetAugmenter( pct_words_to_swap=pct_words_to_swap, transformations_per_example=n_aug_each, ) self.random_deletion = DeletionAugmenter( pct_words_to_swap=pct_words_to_swap, transformations_per_example=n_aug_each, ) self.random_swap = SwapAugmenter( pct_words_to_swap=pct_words_to_swap, transformations_per_example=n_aug_each, ) self.random_insertion = SynonymInsertionAugmenter( pct_words_to_swap=pct_words_to_swap, transformations_per_example=n_aug_each ) def augment(self, text): augmented_text = [] augmented_text += self.synonym_replacement.augment(text) augmented_text += self.random_deletion.augment(text) augmented_text += self.random_swap.augment(text) augmented_text += self.random_insertion.augment(text) augmented_text = list(set(augmented_text)) random.shuffle(augmented_text) return augmented_text[: self.transformations_per_example] def __repr__(self): return "EasyDataAugmenter" class SwapAugmenter(Augmenter): def __init__(self, **kwargs): from textattack.transformations import WordInnerSwapRandom transformation = WordInnerSwapRandom() super().__init__(transformation, constraints=DEFAULT_CONSTRAINTS, **kwargs) class SynonymInsertionAugmenter(Augmenter): def __init__(self, **kwargs): from textattack.transformations import WordInsertionRandomSynonym transformation = WordInsertionRandomSynonym() super().__init__(transformation, constraints=DEFAULT_CONSTRAINTS, **kwargs) class WordNetAugmenter(Augmenter): """Augments text by replacing with synonyms from the WordNet thesaurus.""" def __init__(self, **kwargs): from textattack.transformations import WordSwapWordNet transformation = WordSwapWordNet() super().__init__(transformation, constraints=DEFAULT_CONSTRAINTS, **kwargs) class DeletionAugmenter(Augmenter): def __init__(self, **kwargs): from textattack.transformations import WordDeletion transformation = WordDeletion() super().__init__(transformation, constraints=DEFAULT_CONSTRAINTS, **kwargs) class EmbeddingAugmenter(Augmenter): """Augments text by transforming words with their embeddings.""" def __init__(self, **kwargs): from textattack.transformations import WordSwapEmbedding transformation = WordSwapEmbedding(max_candidates=50) from textattack.constraints.semantics import WordEmbeddingDistance constraints = DEFAULT_CONSTRAINTS + [WordEmbeddingDistance(min_cos_sim=0.8)] super().__init__(transformation, constraints=constraints, **kwargs) class CharSwapAugmenter(Augmenter): """Augments words by swapping characters out for other characters.""" def __init__(self, **kwargs): from textattack.transformations import ( CompositeTransformation, WordSwapNeighboringCharacterSwap, WordSwapRandomCharacterDeletion, WordSwapRandomCharacterInsertion, WordSwapRandomCharacterSubstitution, ) transformation = CompositeTransformation( [ # (1) Swap: Swap two adjacent letters in the word. WordSwapNeighboringCharacterSwap(), # (2) Substitution: Substitute a letter in the word with a random letter. WordSwapRandomCharacterSubstitution(), # (3) Deletion: Delete a random letter from the word. WordSwapRandomCharacterDeletion(), # (4) Insertion: Insert a random letter in the word. WordSwapRandomCharacterInsertion(), ] ) super().__init__(transformation, constraints=DEFAULT_CONSTRAINTS, **kwargs) class CheckListAugmenter(Augmenter): """Augments words by using the transformation methods provided by CheckList INV testing, which combines: - Name Replacement - Location Replacement - Number Alteration - Contraction/Extension "Beyond Accuracy: Behavioral Testing of NLP models with CheckList" (Ribeiro et al., 2020) https://arxiv.org/abs/2005.04118 """ def __init__(self, **kwargs): from textattack.transformations import ( CompositeTransformation, WordSwapChangeLocation, WordSwapChangeName, WordSwapChangeNumber, WordSwapContract, WordSwapExtend, ) transformation = CompositeTransformation( [ WordSwapChangeNumber(), WordSwapChangeLocation(), WordSwapChangeName(), WordSwapExtend(), WordSwapContract(), ] ) constraints = [DEFAULT_CONSTRAINTS[0]] super().__init__(transformation, constraints=constraints, **kwargs) class CLAREAugmenter(Augmenter): """Li, Zhang, Peng, Chen, Brockett, Sun, Dolan. "Contextualized Perturbation for Textual Adversarial Attack" (Li et al., 2020) https://arxiv.org/abs/2009.07502 CLARE builds on a pre-trained masked language model and modifies the inputs in a contextaware manner. We propose three contextualized perturbations, Replace, Insert and Merge, allowing for generating outputs of varied lengths. """ def __init__( self, model="distilroberta-base", tokenizer="distilroberta-base", **kwargs ): import transformers from textattack.transformations import ( CompositeTransformation, WordInsertionMaskedLM, WordMergeMaskedLM, WordSwapMaskedLM, ) shared_masked_lm = transformers.AutoModelForCausalLM.from_pretrained(model) shared_tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer) transformation = CompositeTransformation( [ WordSwapMaskedLM( method="bae", masked_language_model=shared_masked_lm, tokenizer=shared_tokenizer, max_candidates=50, min_confidence=5e-4, ), WordInsertionMaskedLM( masked_language_model=shared_masked_lm, tokenizer=shared_tokenizer, max_candidates=50, min_confidence=0.0, ), WordMergeMaskedLM( masked_language_model=shared_masked_lm, tokenizer=shared_tokenizer, max_candidates=50, min_confidence=5e-3, ), ] ) use_constraint = UniversalSentenceEncoder( threshold=0.7, metric="cosine", compare_against_original=True, window_size=15, skip_text_shorter_than_window=True, ) constraints = DEFAULT_CONSTRAINTS + [use_constraint] super().__init__(transformation, constraints=constraints, **kwargs) class BackTranslationAugmenter(Augmenter): """Sentence level augmentation that uses MarianMTModel to back-translate. https://huggingface.co/transformers/model_doc/marian.html """ def __init__(self, **kwargs): from textattack.transformations.sentence_transformations import BackTranslation transformation = BackTranslation(chained_back_translation=5) super().__init__(transformation, **kwargs)