"""iBleu metric."""
import datasets
import sacrebleu as scb
from packaging import version
import evaluate
_CITATION = """\
@inproceedings{sun-zhou-2012-joint,
    title = "Joint Learning of a Dual {SMT} System for Paraphrase Generation",
    author = "Sun, Hong  and
      Zhou, Ming",
    booktitle = "Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)",
    month = jul,
    year = "2012",
    address = "Jeju Island, Korea",
    publisher = "Association for Computational Linguistics",
    url = "https://aclanthology.org/P12-2008",
    pages = "38--42",
}
"""

_DESCRIPTION = """\
iBLEU measures the adequacy of generated paraphrases against one or more references while
penalizing similarity to the input, so that trivially copying the source is not rewarded.
"""

_KWARGS_DESCRIPTION = """
Computes the iBLEU score for predictions against their inputs and one or more references.
Args:
    inputs (`list` of `str`): list of source sentences, one per prediction. Each input should be an untokenized string; tokenization is handled by sacrebleu.
    predictions (`list` of `str`): list of generated paraphrases to score. Each prediction should be an untokenized string.
    references (`list` of `list` of `str`): A list of lists of references. The contents of the first sub-list are the references for the first prediction, the contents of the second sub-list are for the second prediction, etc. Note that there must be the same number of references for each prediction (i.e. all sub-lists must be of the same length).
    alpha (`float`): weight balancing adequacy against dissimilarity; a smaller alpha penalizes self-paraphrasing (copying the input) more heavily. Defaults to `0.7`.
    smooth_method (`str`): The smoothing method to use, defaults to `'exp'`. Possible values are:
        - `'none'`: no smoothing
        - `'floor'`: increment zero counts
        - `'add-k'`: increment num/denom by k for n>1
        - `'exp'`: exponential decay
    smooth_value (`float`): The smoothing value. Only valid when `smooth_method='floor'` (in which case `smooth_value` defaults to `0.1`) or `smooth_method='add-k'` (in which case `smooth_value` defaults to `1`).
    tokenize (`str`): Tokenization method to use. If not provided, defaults to `'zh'` for Chinese, `'ja-mecab'` for Japanese and `'13a'` (mteval) otherwise. Possible values are:
        - `'none'`: No tokenization.
        - `'zh'`: Chinese tokenization.
        - `'13a'`: mimics the `mteval-v13a` script from Moses.
        - `'intl'`: International tokenization, mimics the `mteval-v14` script from Moses.
        - `'char'`: Language-agnostic character-level tokenization.
        - `'ja-mecab'`: Japanese tokenization. Uses the [MeCab tokenizer](https://pypi.org/project/mecab-python3).
    lowercase (`bool`): If `True`, lowercases the input, enabling case-insensitivity. Defaults to `False`.
    force (`bool`): If `True`, insists that your tokenized input is actually detokenized. Defaults to `False`.
    use_effective_order (`bool`): If `True`, stops including n-gram orders for which precision is 0. This should be `True` if sentence-level BLEU is computed. Defaults to `False`.
Returns:
    score (`float`): the iBLEU score, `alpha * BLEU(predictions, references) - (1 - alpha) * BLEU(predictions, inputs)`.
Example:
    >>> inputs = ["greetings general kenobi", "foo foo bar bar"]
    >>> predictions = ["hello there general kenobi", "foo bar foobar"]
    >>> references = [["hello there general kenobi", "hello there !"], ["foo bar foobar", "foo bar foobar"]]
    >>> ibleu = evaluate.load("rahular/ibleu")
    >>> results = ibleu.compute(inputs=inputs, predictions=predictions, references=references)
    >>> print(results)
    {'score': 60.41585343630594}
"""

@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class ibleu(evaluate.Metric):
    def _info(self):
        if version.parse(scb.__version__) < version.parse("1.4.12"):
            raise ImportWarning(
                "To use `sacrebleu`, the module `sacrebleu>=1.4.12` is required, and the current version of `sacrebleu` doesn't match this condition.\n"
                'You can install it with `pip install "sacrebleu>=1.4.12"`.'
            )
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=[
                # Multiple references per prediction.
                datasets.Features(
                    {
                        "inputs": datasets.Value("string", id="sequence"),
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Sequence(
                            datasets.Value("string", id="sequence"), id="references"
                        ),
                    }
                ),
                # A single reference string per prediction.
                datasets.Features(
                    {
                        "inputs": datasets.Value("string", id="sequence"),
                        "predictions": datasets.Value("string", id="sequence"),
                        "references": datasets.Value("string", id="sequence"),
                    }
                ),
            ],
            reference_urls=[
                "https://aclanthology.org/P12-2008",
                "https://github.com/mjpost/sacrebleu",
            ],
        )

    def _compute(
        self,
        inputs,
        predictions,
        references,
        alpha=0.7,
        smooth_method="exp",
        smooth_value=None,
        force=False,
        lowercase=False,
        tokenize=None,
        use_effective_order=False,
    ):
        # If only one reference is provided, make sure we still use a list of lists.
        if isinstance(references[0], str):
            references = [[ref] for ref in references]
        # sacrebleu expects reference *streams*: each stream holds one reference for
        # every prediction. There is exactly one input per prediction, so all inputs
        # together form a single stream.
        if isinstance(inputs[0], str):
            inputs = [inputs]
        else:
            raise ValueError("`inputs` must be a list of strings, one input per prediction")
        references_per_prediction = len(references[0])
        if any(len(refs) != references_per_prediction for refs in references):
            raise ValueError("Sacrebleu requires the same number of references for each prediction")
        transformed_references = [[refs[i] for refs in references] for i in range(references_per_prediction)]
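        # Shape example for the transposition above (values are illustrative only):
        #   references             = [["r1a", "r1b"], ["r2a", "r2b"]]   # per prediction
        #   transformed_references = [["r1a", "r2a"], ["r1b", "r2b"]]   # per reference stream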
        # Adequacy: BLEU of the predictions against the references.
        tgt_bleu = scb.corpus_bleu(
            predictions,
            transformed_references,
            smooth_method=smooth_method,
            smooth_value=smooth_value,
            force=force,
            lowercase=lowercase,
            use_effective_order=use_effective_order,
            **(dict(tokenize=tokenize) if tokenize else {}),
        ).score
        # Dissimilarity: BLEU of the predictions against their own inputs (self-BLEU).
        self_bleu = scb.corpus_bleu(
            predictions,
            inputs,
            smooth_method=smooth_method,
            smooth_value=smooth_value,
            force=force,
            lowercase=lowercase,
            use_effective_order=use_effective_order,
            **(dict(tokenize=tokenize) if tokenize else {}),
        ).score
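        # Final score: the alpha-weighted difference of Sun & Zhou (2012),
        # rewarding adequacy and penalizing copies of the input.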
        output_dict = {"score": alpha * tgt_bleu - (1 - alpha) * self_bleu}
        return output_dict
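

if __name__ == "__main__":
    # Minimal smoke test mirroring the docstring example. This is a sketch: it
    # instantiates the metric class directly instead of going through
    # `evaluate.load("rahular/ibleu")`, and assumes `sacrebleu>=1.4.12` is installed.
    metric = ibleu()
    results = metric.compute(
        inputs=["greetings general kenobi", "foo foo bar bar"],
        predictions=["hello there general kenobi", "foo bar foobar"],
        references=[
            ["hello there general kenobi", "hello there !"],
            ["foo bar foobar", "foo bar foobar"],
        ],
    )
    print(results)  # a dict with a single "score" key; alpha defaults to 0.7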