Martin Dočekal committed
Commit · 6aed907 · Parent(s): 8f4e42e
bootstrapping, return dict keys change
Files changed: rouge_raw.py (+185 -34)
rouge_raw.py
CHANGED
@@ -35,12 +35,127 @@ Module for raw ROUGE score calculation from:
 
 :author: Martin Dočekal
 """
-
+import collections
 import re
 from typing import Sequence, Optional
 
 import datasets
 import evaluate
+import numpy as np
+
+
+class AggregateScore(collections.namedtuple("AggregateScore", ["low", "mid", "high"])):
+    """
+    Tuple containing confidence intervals for scores.
+    Taken from: https://github.com/google-research/google-research/blob/master/rouge/scoring.py
+    """
+
+
+class Score(
+        collections.namedtuple("Score", ["precision", "recall", "fmeasure"])):
+    """Tuple containing precision, recall, and f-measure values."""
+
+
+class BootstrapAggregator(object):
+    """Aggregates scores to provide confidence intervals.
+    Taken from: https://github.com/google-research/google-research/blob/master/rouge/scoring.py
+
+    Sample usage:
+        scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'])
+        aggregator = Aggregator()
+        aggregator.add_scores(scorer.score("one two three", "one two"))
+        aggregator.add_scores(scorer.score("one two five six", "seven eight"))
+        result = aggregator.aggregate()
+        print result
+        {'rougeL': AggregateScore(
+            low=Score(precision=0.0, recall=0.0, fmeasure=0.0),
+            mid=Score(precision=0.5, recall=0.33, fmeasure=0.40),
+            high=Score(precision=1.0, recall=0.66, fmeasure=0.80)),
+         'rouge1': AggregateScore(
+            low=Score(precision=0.0, recall=0.0, fmeasure=0.0),
+            mid=Score(precision=0.5, recall=0.33, fmeasure=0.40),
+            high=Score(precision=1.0, recall=0.66, fmeasure=0.80))}
+    """
+
+    def __init__(self, confidence_interval=0.95, n_samples=1000):
+        """Initializes a BootstrapAggregator object.
+
+        Args:
+            confidence_interval: Confidence interval to compute on the mean as a
+                decimal.
+            n_samples: Number of samples to use for bootstrap resampling.
+
+        Raises:
+            ValueError: If invalid argument is given.
+        """
+
+        if confidence_interval < 0 or confidence_interval > 1:
+            raise ValueError("confidence_interval must be in range [0, 1]")
+        if n_samples <= 0:
+            raise ValueError("n_samples must be positive")
+
+        self._n_samples = n_samples
+        self._confidence_interval = confidence_interval
+        self._scores = collections.defaultdict(list)
+
+    def add_scores(self, scores):
+        """Adds a sample for future aggregation.
+
+        Args:
+            scores: Dict mapping score_type strings to a namedtuple object/class
+                representing a score.
+        """
+
+        for score_type, score in scores.items():
+            self._scores[score_type].append(score)
+
+    def aggregate(self):
+        """Aggregates scores previously added using add_scores.
+
+        Returns:
+            A dict mapping score_type to AggregateScore objects.
+        """
+
+        result = {}
+        for score_type, scores in self._scores.items():
+            # Stack scores into a 2-d matrix of (sample, measure).
+            score_matrix = np.vstack(tuple(scores))
+            # Percentiles are returned as (interval, measure).
+            percentiles = self._bootstrap_resample(score_matrix)
+            # Extract the three intervals (low, mid, high).
+            intervals = tuple(
+                (scores[0].__class__(*percentiles[j, :]) for j in range(3)))
+            result[score_type] = AggregateScore(
+                low=intervals[0], mid=intervals[1], high=intervals[2])
+        return result
+
+    def _bootstrap_resample(self, matrix):
+        """Performs bootstrap resampling on a matrix of scores.
+
+        Args:
+            matrix: A 2-d matrix of (sample, measure).
+
+        Returns:
+            A 2-d matrix of (bounds, measure). There are three bounds: low (row 0),
+            mid (row 1) and high (row 2). Mid is always the mean, while low and high
+            bounds are specified by self._confidence_interval (which defaults to 0.95
+            meaning it will return the 2.5th and 97.5th percentiles for a 95%
+            confidence interval on the mean).
+        """
+
+        # Matrix of (bootstrap sample, measure).
+        sample_mean = np.zeros((self._n_samples, matrix.shape[1]))
+        for i in range(self._n_samples):
+            sample_idx = np.random.choice(
+                np.arange(matrix.shape[0]), size=matrix.shape[0])
+            sample = matrix[sample_idx, :]
+            sample_mean[i, :] = np.mean(sample, axis=0)
+
+        # Take percentiles on the estimate of the mean using bootstrap samples.
+        # Final result is a (bounds, measure) matrix.
+        percentile_delta = (1 - self._confidence_interval) / 2
+        q = 100 * np.array([percentile_delta, 0.5, 1 - percentile_delta])
+        return np.percentile(sample_mean, q, axis=0)
 
 
 class RougeRawOriginal:
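Review note: the BootstrapAggregator added above implements a percentile bootstrap. It resamples the per-document scores with replacement, averages each resample, and reads the low/mid/high bounds off the percentiles of those means. A minimal self-contained sketch of the same procedure (plain NumPy; the function name and toy data here are illustrative only):

    import numpy as np

    def bootstrap_bounds(scores, confidence_interval=0.95, n_samples=1000):
        # scores: (documents, measures) matrix, e.g. one
        # (precision, recall, fmeasure) row per document.
        scores = np.asarray(scores, dtype=float)
        sample_mean = np.zeros((n_samples, scores.shape[1]))
        for i in range(n_samples):
            # Resample documents with replacement, then average their scores.
            idx = np.random.choice(scores.shape[0], size=scores.shape[0])
            sample_mean[i] = scores[idx].mean(axis=0)
        delta = (1 - confidence_interval) / 2
        # Rows: low (2.5th), mid (50th), high (97.5th percentile) for a 95% CI.
        q = 100 * np.array([delta, 0.5, 1 - delta])
        return np.percentile(sample_mean, q, axis=0)

    low, mid, high = bootstrap_bounds([[0.5, 0.3, 0.4],
                                       [1.0, 0.7, 0.8],
                                       [0.2, 0.2, 0.2]])

Note that the mid row is the median of the bootstrap means (the 50th percentile), which is what the code computes even though its docstring describes mid as the mean.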
@@ -51,6 +166,7 @@ class RougeRawOriginal:
 
     class FScore:
         """F1 score representation."""
+
        def __init__(self, correct, gold, system):
            self.p = correct / system if system else 0.
            self.r = correct / gold if gold else 0.
@@ -58,6 +174,7 @@
 
    def _rouge_n(self, n, gold_words, system_words):
        """Compute Rouge-n for given words."""
+
        def n_grams(n, words):
            ngrams = {}
            total = 0
@@ -108,27 +225,56 @@
            "L": self._rouge_l(lc_gold_words, lc_system_words),
        }
 
-    def corpus(self, gold, system):
+    def corpus(self, gold, system, aggregate=True):
        """Compute RougeRAW-1, RougeRAW-2, RougeRAW-L for given corpora.
        Each corpus should be a collection of documents, each document a string.
+
+        If aggregate is True, the lower, mid, and upper bounds of the confidence interval are returned.
        """
 
        assert isinstance(gold, list) and isinstance(system, list), "Expected list arguments"
        assert len(gold) == len(system), "Given corpora should be of the same length"
 
-        rouge = {key: self.FScore(0, 0, 0) for key in ["1", "2", "L"]}
+
+        if aggregate:
+            aggregator = BootstrapAggregator()
+        else:
+            rouge = {key: self.FScore(0, 0, 0) for key in ["1", "2", "L"]}
 
        if len(gold):
            for gold_document, system_document in zip(gold, system):
                for key, value in self.document(gold_document, system_document).items():
-                    rouge[key].p += value.p
-                    rouge[key].r += value.r
-                    rouge[key].f += value.f
-
-            for key in rouge:
-                rouge[key].p /= len(gold)
-                rouge[key].r /= len(gold)
-                rouge[key].f /= len(gold)
+                    if aggregate:
+                        aggregator.add_scores({
+                            key: Score(precision=value.p, recall=value.r, fmeasure=value.f)
+                        })
+                    else:
+                        rouge[key].p += value.p
+                        rouge[key].r += value.r
+                        rouge[key].f += value.f
+
+            if not aggregate:
+                for key in rouge:
+                    rouge[key].p /= len(gold)
+                    rouge[key].r /= len(gold)
+                    rouge[key].f /= len(gold)
+
+        if aggregate:
+            rouge = {}
+            # convert the named tuple to a dict
+
+            for k, ag_score in aggregator.aggregate().items():
+                rouge[k + "_low_precision"] = float(ag_score.low.precision)
+                rouge[k + "_low_recall"] = float(ag_score.low.recall)
+                rouge[k + "_low_fmeasure"] = float(ag_score.low.fmeasure)
+
+                rouge[k + "_mid_precision"] = float(ag_score.mid.precision)
+                rouge[k + "_mid_recall"] = float(ag_score.mid.recall)
+                rouge[k + "_mid_fmeasure"] = float(ag_score.mid.fmeasure)
+
+                rouge[k + "_high_precision"] = float(ag_score.high.precision)
+                rouge[k + "_high_recall"] = float(ag_score.high.recall)
+                rouge[k + "_high_fmeasure"] = float(ag_score.high.fmeasure)
 
        return rouge
 
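With this change the shape of corpus()'s return value depends on the aggregate flag: FScore objects keyed "1"/"2"/"L" without aggregation, or a flat dict of floats carrying bootstrap bounds when aggregating. A quick sketch of both shapes (assuming rouge_raw.py is importable as rouge_raw; texts are toy data):

    from rouge_raw import RougeRawOriginal

    gold = ["the cat is on the mat", "hello there"]
    system = ["a cat sat on the mat", "hello there general kenobi"]

    # aggregate=False: corpus-averaged FScore objects keyed by "1", "2", "L".
    plain = RougeRawOriginal().corpus(gold, system, aggregate=False)
    print(plain["1"].p, plain["1"].r, plain["1"].f)

    # aggregate=True (the default): flat dict of floats with bootstrap bounds,
    # keys such as "1_low_precision", "2_mid_fmeasure", "L_high_recall".
    agg = RougeRawOriginal().corpus(gold, system)
    print(agg["1_mid_fmeasure"])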
@@ -178,15 +324,18 @@ Args:
    select: (Optional) string. The name of the metric to return. One of: 'rougeraw1_precision', 'rougeraw1_recall', 'rougeraw1_fmeasure', 'rougeraw2_precision', 'rougeraw2_recall', 'rougeraw2_fmeasure', 'rougerawl_precision', 'rougerawl_recall', 'rougerawl_fmeasure'.
        If None, all metrics are returned as a dictionary.
Returns:
-    rougeraw1_precision
-    rougeraw1_recall
-    rougeraw1_fmeasure
-    rougeraw2_precision
-    rougeraw2_recall
-    rougeraw2_fmeasure
-    rougerawl_precision
-    rougerawl_recall
-    rougerawl_fmeasure
+    1_precision
+    1_recall
+    1_fmeasure
+    2_precision
+    2_recall
+    2_fmeasure
+    l_precision
+    l_recall
+    l_fmeasure
+
+    if aggregate is True there are also low, mid and high values for each metric. Thus, e.g.:
+        1_low_precision
Examples:
    >>> rougeraw = evaluate.load('CZLC/rouge_raw')
    >>> predictions = ["the cat is on the mat", "hello there"]
@@ -217,21 +366,23 @@ class RougeRaw(evaluate.Metric):
            ],
        )
 
-    def _compute(self, predictions: Sequence[str], references: Sequence[str], select: Optional[str] = None):
-        res = RougeRawOriginal().corpus(references, predictions)
-
-        res = {
-            "rougeraw1_precision": res["1"].p,
-            "rougeraw1_recall": res["1"].r,
-            "rougeraw1_fmeasure": res["1"].f,
-            "rougeraw2_precision": res["2"].p,
-            "rougeraw2_recall": res["2"].r,
-            "rougeraw2_fmeasure": res["2"].f,
-            "rougerawl_precision": res["L"].p,
-            "rougerawl_recall": res["L"].r,
-            "rougerawl_fmeasure": res["L"].f,
-        }
+    def _compute(self, predictions: Sequence[str], references: Sequence[str], select: Optional[str] = None,
+                 aggregate: bool = True):
+        res = RougeRawOriginal().corpus(references, predictions, aggregate=aggregate)
+
+        if not aggregate:
+            res = {
+                "1_precision": res["1"].p,
+                "1_recall": res["1"].r,
+                "1_fmeasure": res["1"].f,
+                "2_precision": res["2"].p,
+                "2_recall": res["2"].r,
+                "2_fmeasure": res["2"].f,
+                "l_precision": res["L"].p,
+                "l_recall": res["L"].r,
+                "l_fmeasure": res["L"].f,
+            }
 
        if select is not None:
            return res[select]
        return res
-
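At the metric level the new flag flows through _compute, so callers choose between aggregated bounds and the plain renamed keys. A usage sketch mirroring the docstring example, assuming the extra keyword arguments are forwarded to _compute the way evaluate.Metric.compute normally forwards them (exact values vary with the bootstrap's randomness):

    import evaluate

    rougeraw = evaluate.load('CZLC/rouge_raw')
    predictions = ["the cat is on the mat", "hello there"]
    references = ["the cat is on the mat", "hello there general kenobi"]

    # Default aggregate=True: keys like "1_low_precision" ... "L_high_fmeasure".
    bounds = rougeraw.compute(predictions=predictions, references=references)

    # aggregate=False: plain corpus averages under the renamed keys
    # "1_precision", ..., "l_fmeasure" (note the lowercase "l" here).
    flat = rougeraw.compute(predictions=predictions, references=references, aggregate=False)

    # select still picks a single value out of whichever dict is returned.
    f1 = rougeraw.compute(predictions=predictions, references=references,
                          aggregate=False, select="1_fmeasure")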