Spaces:

rahular
/

ibleu

Runtime error

App Files Files Community

ibleu / ibleu.py

rahular

initial commit

3e9388e almost 2 years ago

raw

history blame

5.87 kB

	"""iBleu metric."""

	import datasets
	import sacrebleu as scb
	from packaging import version

	import evaluate


	# Human-readable summary of the metric; also prepended to the metric class docstring.
	_DESCRIPTION = """
	iBLEU is a metric for paraphrase generation. It rewards predictions that are close to the references and penalizes predictions that merely copy the input. It can be computed with:
	iBLEU = alpha * BLEU(predictions, references) - (1 - alpha) * BLEU(predictions, inputs)
	Where:
	alpha: weight given to the similarity with the references; (1 - alpha) weights the penalty for overlapping with the inputs
	BLEU(predictions, references): corpus-level BLEU of the predictions against the references, computed with sacrebleu
	BLEU(predictions, inputs): corpus-level BLEU of the predictions against the inputs (self-BLEU), computed with sacrebleu
	"""


	# Argument/return documentation for `compute`; kept in sync with the `_compute` signature.
	_KWARGS_DESCRIPTION = """
	Args:
	inputs (`list` of `str`): Source sentences, exactly one per prediction.
	predictions (`list` of `str`): Generated sentences to score.
	references (`list` of `str` or `list` of `list` of `str`): Reference sentences for each prediction. Every prediction must have the same number of references.
	alpha (`float`): Weight of the BLEU score against the references; `1 - alpha` weights the BLEU score against the inputs, which is subtracted. Defaults to 0.7.
	smooth_method (`str`): Smoothing method forwarded to sacrebleu. Defaults to 'exp'.
	smooth_value (`float`): Smoothing value forwarded to sacrebleu. Defaults to None.
	force (`boolean`): Forwarded to sacrebleu as `force`. Defaults to False.
	lowercase (`boolean`): Forwarded to sacrebleu as `lowercase`. Defaults to False.
	tokenize (`str`): Tokenizer name forwarded to sacrebleu. If not set, sacrebleu's default tokenizer is used. Defaults to None.
	use_effective_order (`boolean`): Forwarded to sacrebleu as `use_effective_order`. Defaults to False.
	Returns:
	score (`float`): alpha * BLEU(predictions, references) - (1 - alpha) * BLEU(predictions, inputs), where both BLEU scores are the corpus-level scores reported by sacrebleu. A higher score is better.
	Examples:
	Example 1-A single input with one reference
	>>> ibleu_metric = evaluate.load("rahular/ibleu")
	>>> results = ibleu_metric.compute(inputs=["how are you doing today"], predictions=["how is your day going"], references=[["how is it going today"]])
	>>> print(list(results.keys()))
	['score']
	Example 2-The same as Example 1, except with a different `alpha`.
	>>> ibleu_metric = evaluate.load("rahular/ibleu")
	>>> results = ibleu_metric.compute(inputs=["how are you doing today"], predictions=["how is your day going"], references=[["how is it going today"]], alpha=0.9)
	>>> print(list(results.keys()))
	['score']
	"""


	# BibTeX for the iBLEU paper and for sacreBLEU, which computes the underlying BLEU scores.
	_CITATION = """
	@inproceedings{sun-zhou-2012-joint,
	title={Joint Learning of a Dual {SMT} System for Paraphrase Generation},
	author={Sun, Hong and Zhou, Ming},
	booktitle={Proceedings of the 50th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)},
	pages={38--42},
	year={2012}
	}
	@inproceedings{post-2018-call,
	title={A Call for Clarity in Reporting {BLEU} Scores},
	author={Post, Matt},
	booktitle={Proceedings of the Third Conference on Machine Translation: Research Papers},
	pages={186--191},
	year={2018}
	}
	"""


	@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
	class ibleu(evaluate.Metric):
	    """iBLEU metric built on sacrebleu's corpus-level BLEU.

	    The score is ``alpha * BLEU(predictions, references)
	    - (1 - alpha) * BLEU(predictions, inputs)``.
	    """

	    def _info(self):
	        """Return the metric metadata and the accepted input schemas.

	        Two schemas are accepted: several references per prediction
	        (a sequence of strings) or a single reference string per prediction.

	        Raises:
	            ImportWarning: If the installed sacrebleu is older than 1.4.12.
	        """
	        if version.parse(scb.__version__) < version.parse("1.4.12"):
	            raise ImportWarning(
	                "To use `sacrebleu`, the module `sacrebleu>=1.4.12` is required, and the current version of `sacrebleu` doesn't match this condition.\n"
	                'You can install it with `pip install "sacrebleu>=1.4.12"`.'
	            )
	        return evaluate.MetricInfo(
	            description=_DESCRIPTION,
	            citation=_CITATION,
	            inputs_description=_KWARGS_DESCRIPTION,
	            features=[
	                datasets.Features(
	                    {
	                        "inputs": datasets.Value("string", id="sequence"),
	                        "predictions": datasets.Value("string", id="sequence"),
	                        "references": datasets.Sequence(
	                            datasets.Value("string", id="sequence"), id="references"
	                        ),
	                    }
	                ),
	                datasets.Features(
	                    {
	                        "inputs": datasets.Value("string", id="sequence"),
	                        "predictions": datasets.Value("string", id="sequence"),
	                        "references": datasets.Value("string", id="sequence"),
	                    }
	                ),
	            ],
	            # Point at the BLEU implementation and the iBLEU paper rather than
	            # the sklearn accuracy page inherited from the metric template.
	            reference_urls=[
	                "https://github.com/mjpost/sacreBLEU",
	                "https://aclanthology.org/P12-2008/",
	            ],
	        )

	    def _compute(
	        self,
	        inputs,
	        predictions,
	        references,
	        alpha=0.7,
	        smooth_method="exp",
	        smooth_value=None,
	        force=False,
	        lowercase=False,
	        tokenize=None,
	        use_effective_order=False,
	    ):
	        """Compute iBLEU for a corpus of predictions.

	        Args:
	            inputs: One source string per prediction.
	            predictions: The generated strings to score.
	            references: Per prediction, either a single reference string or
	                a list of reference strings (same count for every prediction).
	            alpha: Weight of the BLEU score against the references; the BLEU
	                score against the inputs is subtracted with weight
	                ``1 - alpha``.
	            smooth_method, smooth_value, force, lowercase, use_effective_order:
	                Forwarded unchanged to ``sacrebleu.corpus_bleu``.
	            tokenize: Forwarded to ``sacrebleu.corpus_bleu`` only when truthy,
	                so that sacrebleu's own default applies otherwise.

	        Returns:
	            A dict with the single key ``"score"``.

	        Raises:
	            ValueError: If an input is not a single string, or if the
	                predictions do not all have the same number of references.
	        """
	        # if only one reference is provided make sure we still use list of lists
	        if isinstance(references[0], str):
	            references = [[ref] for ref in references]
	        # Unlike references, each prediction has exactly one input string.
	        if not isinstance(inputs[0], str):
	            raise ValueError("There can be only one input string")

	        references_per_prediction = len(references[0])
	        if any(len(refs) != references_per_prediction for refs in references):
	            raise ValueError("Sacrebleu requires the same number of references for each prediction")
	        # sacrebleu expects "reference streams": one list per reference slot,
	        # each parallel to `predictions` (i.e. the transpose of `references`).
	        transformed_references = [[refs[i] for refs in references] for i in range(references_per_prediction)]
	        # The inputs form a single stream parallel to `predictions`. Wrapping
	        # every input in its own list would instead present each input as a
	        # separate one-sentence stream, which is not parallel to `predictions`
	        # as soon as there is more than one sample.
	        transformed_inputs = [list(inputs)]

	        # Both BLEU calls must use identical settings so the two terms are comparable.
	        bleu_kwargs = dict(
	            smooth_method=smooth_method,
	            smooth_value=smooth_value,
	            force=force,
	            lowercase=lowercase,
	            use_effective_order=use_effective_order,
	        )
	        if tokenize:
	            bleu_kwargs["tokenize"] = tokenize

	        # Similarity to the references (to be maximised).
	        tgt_bleu = scb.corpus_bleu(predictions, transformed_references, **bleu_kwargs).score
	        # Similarity to the inputs (to be penalised).
	        self_bleu = scb.corpus_bleu(predictions, transformed_inputs, **bleu_kwargs).score

	        return {"score": alpha * tgt_bleu - (1 - alpha) * self_bleu}