Martin Dočekal committed on
Commit 6aed907 · 1 Parent(s): 8f4e42e

bootstrapping, return dict keys change

Files changed (1): rouge_raw.py (+185 -34)
rouge_raw.py CHANGED
@@ -35,12 +35,127 @@ Module for raw ROUGE score calculation from:
 
 :author: Martin Dočekal
 """
-
+import collections
 import re
 from typing import Sequence, Optional
 
 import datasets
 import evaluate
+import numpy as np
+
+
+class AggregateScore(collections.namedtuple("AggregateScore", ["low", "mid", "high"])):
+    """
+    Tuple containing confidence intervals for scores.
+    Taken from: https://github.com/google-research/google-research/blob/master/rouge/scoring.py
+    """
+
+
+class Score(
+        collections.namedtuple("Score", ["precision", "recall", "fmeasure"])):
+    """Tuple containing precision, recall, and f-measure values."""
+
+
+class BootstrapAggregator(object):
+    """Aggregates scores to provide confidence intervals.
+    Taken from: https://github.com/google-research/google-research/blob/master/rouge/scoring.py
+
+    Sample usage:
+        scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'])
+        aggregator = BootstrapAggregator()
+        aggregator.add_scores(scorer.score("one two three", "one two"))
+        aggregator.add_scores(scorer.score("one two five six", "seven eight"))
+        result = aggregator.aggregate()
+        print(result)
+        {'rougeL': AggregateScore(
+            low=Score(precision=0.0, recall=0.0, fmeasure=0.0),
+            mid=Score(precision=0.5, recall=0.33, fmeasure=0.40),
+            high=Score(precision=1.0, recall=0.66, fmeasure=0.80)),
+         'rouge1': AggregateScore(
+            low=Score(precision=0.0, recall=0.0, fmeasure=0.0),
+            mid=Score(precision=0.5, recall=0.33, fmeasure=0.40),
+            high=Score(precision=1.0, recall=0.66, fmeasure=0.80))}
+    """
+
+    def __init__(self, confidence_interval=0.95, n_samples=1000):
+        """Initializes a BootstrapAggregator object.
+
+        Args:
+            confidence_interval: Confidence interval to compute on the mean as a
+                decimal.
+            n_samples: Number of samples to use for bootstrap resampling.
+
+        Raises:
+            ValueError: If an invalid argument is given.
+        """
+
+        if confidence_interval < 0 or confidence_interval > 1:
+            raise ValueError("confidence_interval must be in range [0, 1]")
+        if n_samples <= 0:
+            raise ValueError("n_samples must be positive")
+
+        self._n_samples = n_samples
+        self._confidence_interval = confidence_interval
+        self._scores = collections.defaultdict(list)
+
+    def add_scores(self, scores):
+        """Adds a sample for future aggregation.
+
+        Args:
+            scores: Dict mapping score_type strings to a namedtuple object/class
+                representing a score.
+        """
+
+        for score_type, score in scores.items():
+            self._scores[score_type].append(score)
+
+    def aggregate(self):
+        """Aggregates scores previously added using add_scores.
+
+        Returns:
+            A dict mapping score_type to AggregateScore objects.
+        """
+
+        result = {}
+        for score_type, scores in self._scores.items():
+            # Stack scores into a 2-d matrix of (sample, measure).
+            score_matrix = np.vstack(tuple(scores))
+            # Percentiles are returned as (interval, measure).
+            percentiles = self._bootstrap_resample(score_matrix)
+            # Extract the three intervals (low, mid, high).
+            intervals = tuple(
+                scores[0].__class__(*percentiles[j, :]) for j in range(3))
+            result[score_type] = AggregateScore(
+                low=intervals[0], mid=intervals[1], high=intervals[2])
+        return result
+
+    def _bootstrap_resample(self, matrix):
+        """Performs bootstrap resampling on a matrix of scores.
+
+        Args:
+            matrix: A 2-d matrix of (sample, measure).
+
+        Returns:
+            A 2-d matrix of (bounds, measure). There are three bounds: low (row 0),
+            mid (row 1) and high (row 2). Mid is always the mean, while the low and high
+            bounds are specified by self._confidence_interval (which defaults to 0.95,
+            meaning it will return the 2.5th and 97.5th percentiles for a 95%
+            confidence interval on the mean).
+        """
+
+        # Matrix of (bootstrap sample, measure).
+        sample_mean = np.zeros((self._n_samples, matrix.shape[1]))
+        for i in range(self._n_samples):
+            sample_idx = np.random.choice(
+                np.arange(matrix.shape[0]), size=matrix.shape[0])
+            sample = matrix[sample_idx, :]
+            sample_mean[i, :] = np.mean(sample, axis=0)
+
+        # Take percentiles on the estimate of the mean using bootstrap samples.
+        # Final result is a (bounds, measure) matrix.
+        percentile_delta = (1 - self._confidence_interval) / 2
+        q = 100 * np.array([percentile_delta, 0.5, 1 - percentile_delta])
+        return np.percentile(sample_mean, q, axis=0)
 
 
 class RougeRawOriginal:
@@ -51,6 +166,7 @@ class RougeRawOriginal:
 
     class FScore:
         """F1 score representation."""
+
         def __init__(self, correct, gold, system):
            self.p = correct / system if system else 0.
            self.r = correct / gold if gold else 0.
@@ -58,6 +174,7 @@ class RougeRawOriginal:
 
     def _rouge_n(self, n, gold_words, system_words):
         """Compute Rouge-n for given words."""
+
         def n_grams(n, words):
             ngrams = {}
             total = 0
@@ -108,27 +225,56 @@ class RougeRawOriginal:
             "L": self._rouge_l(lc_gold_words, lc_system_words),
         }
 
-    def corpus(self, gold, system):
+    def corpus(self, gold, system, aggregate=True):
         """Compute RougeRAW-1, RougeRAW-2, RougeRAW-L for given corpora.
         Each corpus should be a collection of documents, each document a string.
+
+        If aggregate is True, the lower, mid, and upper bounds of the confidence interval are returned.
         """
 
         assert isinstance(gold, list) and isinstance(system, list), "Expected list arguments"
         assert len(gold) == len(system), "Given corpora should be of the same length"
 
-        rouge = {key: self.FScore(0, 0, 0) for key in ["1", "2", "L"]}
+
+        if aggregate:
+            aggregator = BootstrapAggregator()
+        else:
+            rouge = {key: self.FScore(0, 0, 0) for key in ["1", "2", "L"]}
 
         if len(gold):
             for gold_document, system_document in zip(gold, system):
                 for key, value in self.document(gold_document, system_document).items():
-                    rouge[key].p += value.p
-                    rouge[key].r += value.r
-                    rouge[key].f += value.f
-
-            for key in rouge:
-                rouge[key].p /= len(gold)
-                rouge[key].r /= len(gold)
-                rouge[key].f /= len(gold)
+                    if aggregate:
+                        aggregator.add_scores({
+                            key: Score(precision=value.p, recall=value.r, fmeasure=value.f)
+                        })
+                    else:
+                        rouge[key].p += value.p
+                        rouge[key].r += value.r
+                        rouge[key].f += value.f
+
+            if not aggregate:
+                for key in rouge:
+                    rouge[key].p /= len(gold)
+                    rouge[key].r /= len(gold)
+                    rouge[key].f /= len(gold)
+
+        if aggregate:
+            rouge = {}
+            # convert the named tuples to a flat dict
+
+            for k, ag_score in aggregator.aggregate().items():
+                rouge[k + "_low_precision"] = float(ag_score.low.precision)
+                rouge[k + "_low_recall"] = float(ag_score.low.recall)
+                rouge[k + "_low_fmeasure"] = float(ag_score.low.fmeasure)
+
+                rouge[k + "_mid_precision"] = float(ag_score.mid.precision)
+                rouge[k + "_mid_recall"] = float(ag_score.mid.recall)
+                rouge[k + "_mid_fmeasure"] = float(ag_score.mid.fmeasure)
+
+                rouge[k + "_high_precision"] = float(ag_score.high.precision)
+                rouge[k + "_high_recall"] = float(ag_score.high.recall)
+                rouge[k + "_high_fmeasure"] = float(ag_score.high.fmeasure)
 
         return rouge
 
@@ -178,15 +324,18 @@ Args:
     select: (Optional) string. The name of the metric to return. One of: 'rougeraw1_precision', 'rougeraw1_recall', 'rougeraw1_fmeasure', 'rougeraw2_precision', 'rougeraw2_recall', 'rougeraw2_fmeasure', 'rougerawl_precision', 'rougerawl_recall', 'rougerawl_fmeasure'.
         If None, all metrics are returned as a dictionary.
 Returns:
-    rougeraw1_precision
-    rougeraw1_recall
-    rougeraw1_fmeasure
-    rougeraw2_precision
-    rougeraw2_recall
-    rougeraw2_fmeasure
-    rougerawl_precision
-    rougerawl_recall
-    rougerawl_fmeasure
+    1_precision
+    1_recall
+    1_fmeasure
+    2_precision
+    2_recall
+    2_fmeasure
+    l_precision
+    l_recall
+    l_fmeasure
+
+    If aggregate is True, there are also low, mid, and high values for each metric, e.g.:
+    1_low_precision
 Examples:
     >>> rougeraw = evaluate.load('CZLC/rouge_raw')
     >>> predictions = ["the cat is on the mat", "hello there"]
@@ -217,21 +366,23 @@ class RougeRaw(evaluate.Metric):
         ],
     )
 
-    def _compute(self, predictions: Sequence[str], references: Sequence[str], select: Optional[str] = None):
-        res = RougeRawOriginal().corpus(references, predictions)
-        res = {
-            "rougeraw1_precision": res["1"].p,
-            "rougeraw1_recall": res["1"].r,
-            "rougeraw1_fmeasure": res["1"].f,
-            "rougeraw2_precision": res["2"].p,
-            "rougeraw2_recall": res["2"].r,
-            "rougeraw2_fmeasure": res["2"].f,
-            "rougerawl_precision": res["L"].p,
-            "rougerawl_recall": res["L"].r,
-            "rougerawl_fmeasure": res["L"].f,
-        }
+    def _compute(self, predictions: Sequence[str], references: Sequence[str], select: Optional[str] = None,
+                 aggregate: bool = True):
+        res = RougeRawOriginal().corpus(references, predictions, aggregate=aggregate)
+
+        if not aggregate:
+            res = {
+                "1_precision": res["1"].p,
+                "1_recall": res["1"].r,
+                "1_fmeasure": res["1"].f,
+                "2_precision": res["2"].p,
+                "2_recall": res["2"].r,
+                "2_fmeasure": res["2"].f,
+                "l_precision": res["L"].p,
+                "l_recall": res["L"].r,
+                "l_fmeasure": res["L"].f,
+            }
 
         if select is not None:
             return res[select]
         return res
-
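
A minimal usage sketch for the bootstrap classes added above, assuming rouge_raw.py is importable as rouge_raw (the score values are illustrative, not from the commit):

import numpy as np
from rouge_raw import BootstrapAggregator, Score

np.random.seed(0)  # _bootstrap_resample draws random resamples, so fix the seed for repeatability
aggregator = BootstrapAggregator(confidence_interval=0.95, n_samples=1000)
# Each added sample maps a score type (this module uses "1", "2" and "L") to a Score tuple.
aggregator.add_scores({"1": Score(precision=0.5, recall=0.4, fmeasure=0.44)})
aggregator.add_scores({"1": Score(precision=1.0, recall=0.8, fmeasure=0.89)})

bounds = aggregator.aggregate()["1"]  # AggregateScore(low=Score(...), mid=Score(...), high=Score(...))
assert bounds.low.precision <= bounds.mid.precision <= bounds.high.precision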
 
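
A minimal sketch of calling the metric after this change, following the docstring example above (the references values here are illustrative):

import evaluate

rougeraw = evaluate.load('CZLC/rouge_raw')
predictions = ["the cat is on the mat", "hello there"]
references = ["a cat is on the mat", "hello there"]

# Default aggregate=True: a flat dict of bootstrap bounds per metric, keyed as
# "<type>_<bound>_<measure>", e.g. "1_low_precision", "2_mid_fmeasure". Note that the
# RougeRAW-L keys keep an uppercase "L" here (e.g. "L_mid_fmeasure"), since they come
# from the "1"/"2"/"L" keys of RougeRawOriginal.document().
agg = rougeraw.compute(predictions=predictions, references=references)

# aggregate=False: plain corpus averages under the renamed keys, with a lowercase "l"
# for RougeRAW-L ("1_precision", ..., "l_fmeasure").
plain = rougeraw.compute(predictions=predictions, references=references, aggregate=False)
print(plain["1_fmeasure"])

Note that the select docstring still lists the pre-commit 'rougeraw1_precision'-style names; after this commit, select must use the keys actually returned, e.g. select="1_fmeasure" with aggregate=False.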