"""

PeekDatasetCommand class
==============================

"""

from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser
import collections
import re

import numpy as np

import textattack
from textattack.commands import TextAttackCommand


def _cb(s):
    """Format ``s`` in blue for ANSI terminal output."""
    return textattack.shared.utils.color_text(str(s), color="blue", method="ansi")


logger = textattack.shared.logger


class PeekDatasetCommand(TextAttackCommand):
    """The peek dataset module:

    Takes a peek into a dataset in textattack.
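
    Example usage (the dataset name below is only illustrative; any
    dataset accepted by ``--dataset-from-huggingface`` works)::

        textattack peek-dataset --dataset-from-huggingface rotten_tomatoes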
    """

    def run(self, args):
        UPPERCASE_LETTERS_REGEX = re.compile("[A-Z]")

        dataset_args = textattack.DatasetArgs(**vars(args))
        dataset = textattack.DatasetArgs._create_dataset_from_args(dataset_args)

        num_words = []
        attacked_texts = []
        data_all_lowercased = True
        outputs = []
        for inputs, output in dataset:
            at = textattack.shared.AttackedText(inputs)
            if data_all_lowercased:
                # If any uppercase letter is found, the dataset is not fully lowercased.
                if re.search(UPPERCASE_LETTERS_REGEX, at.text):
                    data_all_lowercased = False
            attacked_texts.append(at)
            num_words.append(len(at.words))
            outputs.append(output)

        logger.info(f"Number of samples: {_cb(len(attacked_texts))}")
        logger.info("Number of words per input:")
        num_words = np.array(num_words)
        logger.info(f'\t{("total:").ljust(8)} {_cb(num_words.sum())}')
        mean_words = f"{num_words.mean():.2f}"
        logger.info(f'\t{("mean:").ljust(8)} {_cb(mean_words)}')
        std_words = f"{num_words.std():.2f}"
        logger.info(f'\t{("std:").ljust(8)} {_cb(std_words)}')
        logger.info(f'\t{("min:").ljust(8)} {_cb(num_words.min())}')
        logger.info(f'\t{("max:").ljust(8)} {_cb(num_words.max())}')
        logger.info(f"Dataset lowercased: {_cb(data_all_lowercased)}")

        logger.info("First sample:")
        print(attacked_texts[0].printable_text(), "\n")
        logger.info("Last sample:")
        print(attacked_texts[-1].printable_text(), "\n")

        logger.info(f"Found {len(set(outputs))} distinct outputs.")
        if len(outputs) < 20:
            print(sorted(set(outputs)))

        logger.info("Most common outputs:")
        for key, value in collections.Counter(outputs).most_common(20):
            print("\t", str(key)[:5].ljust(5), f" ({value})")

    @staticmethod
    def register_subcommand(main_parser: ArgumentParser):
        parser = main_parser.add_parser(
            "peek-dataset",
            help="show main statistics about a dataset",
            formatter_class=ArgumentDefaultsHelpFormatter,
        )
        parser = textattack.DatasetArgs._add_parser_args(parser)
        parser.set_defaults(func=PeekDatasetCommand())