"""iBleu metric.""" import datasets import sacrebleu as scb from packaging import version import evaluate _DESCRIPTION = """ Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with: Accuracy = (TP + TN) / (TP + TN + FP + FN) Where: TP: True positive TN: True negative FP: False positive FN: False negative """ _KWARGS_DESCRIPTION = """ Args: predictions (`list` of `int`): Predicted labels. references (`list` of `int`): Ground truth labels. normalize (`boolean`): If set to False, returns the number of correctly classified samples. Otherwise, returns the fraction of correctly classified samples. Defaults to True. sample_weight (`list` of `float`): Sample weights Defaults to None. Returns: accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `True`.. A higher score means higher accuracy. Examples: Example 1-A simple example >>> accuracy_metric = evaluate.load("accuracy") >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0]) >>> print(results) {'accuracy': 0.5} Example 2-The same as Example 1, except with `normalize` set to `False`. >>> accuracy_metric = evaluate.load("accuracy") >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], normalize=False) >>> print(results) {'accuracy': 3.0} Example 3-The same as Example 1, except with `sample_weight` set. >>> accuracy_metric = evaluate.load("accuracy") >>> results = accuracy_metric.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0], sample_weight=[0.5, 2, 0.7, 0.5, 9, 0.4]) >>> print(results) {'accuracy': 0.8778625954198473} """ _CITATION = """ @article{scikit-learn, title={Scikit-learn: Machine Learning in {P}ython}, author={Pedregosa, F. and Varoquaux, G. and Gramfort, A. and Michel, V. and Thirion, B. and Grisel, O. and Blondel, M. and Prettenhofer, P. and Weiss, R. and Dubourg, V. and Vanderplas, J. and Passos, A. and Cournapeau, D. and Brucher, M. and Perrot, M. and Duchesnay, E.}, journal={Journal of Machine Learning Research}, volume={12}, pages={2825--2830}, year={2011} } """ @evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) class ibleu(evaluate.Metric): def _info(self): if version.parse(scb.__version__) < version.parse("1.4.12"): raise ImportWarning( "To use `sacrebleu`, the module `sacrebleu>=1.4.12` is required, and the current version of `sacrebleu` doesn't match this condition.\n" 'You can install it with `pip install "sacrebleu>=1.4.12"`.' 
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=[
                # Variant with multiple references per prediction.
                datasets.Features(
                    {
                        "inputs": datasets.Value("string", id="sequence"),
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Sequence(
                            datasets.Value("string", id="sequence"), id="references"
                        ),
                    }
                ),
                # Variant with a single reference string per prediction.
                datasets.Features(
                    {
                        "inputs": datasets.Value("string", id="sequence"),
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Value("string", id="sequence"),
                    }
                ),
            ],
            reference_urls=["https://github.com/mjpost/sacrebleu"],
        )

    def _compute(
        self,
        inputs,
        predictions,
        references,
        alpha=0.7,
        smooth_method="exp",
        smooth_value=None,
        force=False,
        lowercase=False,
        tokenize=None,
        use_effective_order=False,
    ):
        # If only one reference is provided, make sure we still use a list of lists.
        if isinstance(references[0], str):
            references = [[ref] for ref in references]

        # Each prediction has exactly one input sentence; treat the inputs as a single
        # reference stream, which is the layout sacrebleu's corpus_bleu expects.
        if isinstance(inputs[0], str):
            transformed_inputs = [inputs]
        else:
            raise ValueError("There can be only one input string per prediction")

        references_per_prediction = len(references[0])
        if any(len(refs) != references_per_prediction for refs in references):
            raise ValueError("Sacrebleu requires the same number of references for each prediction")
        # Transpose references from per-prediction lists to per-reference streams.
        transformed_references = [[refs[i] for refs in references] for i in range(references_per_prediction)]

        # BLEU of the predictions against the reference paraphrases.
        tgt_bleu = scb.corpus_bleu(
            predictions,
            transformed_references,
            smooth_method=smooth_method,
            smooth_value=smooth_value,
            force=force,
            lowercase=lowercase,
            use_effective_order=use_effective_order,
            **(dict(tokenize=tokenize) if tokenize else {}),
        ).score
        # BLEU of the predictions against the inputs (self-BLEU), which penalizes copying.
        self_bleu = scb.corpus_bleu(
            predictions,
            transformed_inputs,
            smooth_method=smooth_method,
            smooth_value=smooth_value,
            force=force,
            lowercase=lowercase,
            use_effective_order=use_effective_order,
            **(dict(tokenize=tokenize) if tokenize else {}),
        ).score

        # iBLEU = alpha * BLEU(predictions, references) - (1 - alpha) * BLEU(predictions, inputs)
        output_dict = {"score": alpha * tgt_bleu - (1 - alpha) * self_bleu}
        return output_dict
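

# Minimal usage sketch, added for illustration (not part of the metric itself): instantiate
# the metric class directly and score a single paraphrase. With alpha=0.7, a prediction
# identical to both the input and the reference scores 0.7 * 100 - 0.3 * 100, i.e. roughly 40.
if __name__ == "__main__":
    ibleu_metric = ibleu()
    results = ibleu_metric.compute(
        inputs=["the cat sat on the mat"],
        predictions=["the cat sat on the mat"],
        references=[["the cat sat on the mat"]],
    )
    print(results)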