HalteroXHunter committed on
Commit
1a07572
·
1 Parent(s): e368a57

include new metrics

Browse files
Files changed (1) hide show
  1. classification_evaluator.py +34 -15
classification_evaluator.py CHANGED
@@ -1,6 +1,7 @@
1
  import evaluate
2
  from datasets import Features, Value
3
- from sklearn.metrics import accuracy_score
 
4
 
5
  _CITATION = """
6
  @article{scikit-learn,
@@ -17,13 +18,11 @@ _CITATION = """
17
  """
18
 
19
  _DESCRIPTION = """
20
- Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with:
21
- Accuracy = (TP + TN) / (TP + TN + FP + FN)
22
- Where:
23
- TP: True positive
24
- TN: True negative
25
- FP: False positive
26
- FN: False negative
27
  """
28
 
29
  _KWARGS_DESCRIPTION = """
@@ -32,8 +31,12 @@ Args:
32
  references (`list` of `str`): Ground truth labels.
33
 
34
  Returns:
35
- accuracy (`float` or `int`): Accuracy score. Minimum possible value is 0. Maximum possible value is 1.0, or the number of examples input, if `normalize` is set to `True`.. A higher score means higher accuracy.
36
-
 
 
 
 
37
  """
38
 
39
 
@@ -50,10 +53,26 @@ class ClassificationEvaluator(evaluate.Metric):
50
 
51
  def _compute(self, predictions, references):
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  return {
54
- "accuracy": float(
55
- accuracy_score(
56
- references, predictions, normalize=True, sample_weight=None
57
- )
58
- )
 
 
 
59
  }
 
1
  import evaluate
2
  from datasets import Features, Value
3
+ from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
4
+
5
 
6
  _CITATION = """
7
  @article{scikit-learn,
 
18
  """
19
 
20
  _DESCRIPTION = """
21
+ This evaluator computes multiple classification metrics to assess the performance of a model. Metrics calculated include:
22
+ - Accuracy: The proportion of correct predictions among the total number of cases processed. Computed as (TP + TN) / (TP + TN + FP + FN), where TP, TN, FP, and FN denote true positives, true negatives, false positives, and false negatives respectively.
23
+ - Precision, Recall, and F1-Score: Reported as macro averages (unweighted mean of the per-class scores) and micro averages (computed from the true positives, false positives, and false negatives pooled across all classes).
24
+ - Confusion Matrix: A matrix whose entry (i, j) counts the samples whose reference label is class i and whose predicted label is class j.
25
+
 
 
26
  """
27
 
28
  _KWARGS_DESCRIPTION = """
 
31
  references (`list` of `str`): Ground truth labels.
32
 
33
  Returns:
34
+
35
+ Dict containing:
36
+ accuracy (float): Proportion of correct predictions. Value ranges between 0 (worst) and 1 (best).
37
+ precision_macro (float), recall_macro (float), f1_macro (float): Macro averages of precision, recall, and F1-score respectively.
38
+ precision_micro (float), recall_micro (float), f1_micro (float): Micro averages of precision, recall, and F1-score respectively.
39
+ confusion_matrix (list of lists): 2D list representing the confusion matrix of the classification results.
40
  """
41
 
42
 
 
53
 
54
  def _compute(self, predictions, references):
55
 
56
+ accuracy = accuracy_score(references, predictions, normalize=True, sample_weight=None)
57
+
58
+ # Calculate macro and micro averages for precision, recall, and F1-score
59
+ precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
60
+ references, predictions, average='macro'
61
+ )
62
+ precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(
63
+ references, predictions, average='micro'
64
+ )
65
+
66
+ # Calculate the confusion matrix
67
+ conf_matrix = confusion_matrix(references, predictions)
68
+
69
  return {
70
+ "accuracy": accuracy,
71
+ "precision_macro": float(precision_macro),
72
+ "recall_macro": float(recall_macro),
73
+ "f1_macro": float(f1_macro),
74
+ "precision_micro": float(precision_micro),
75
+ "recall_micro": float(recall_micro),
76
+ "f1_micro": float(f1_micro),
77
+ "confusion_matrix": conf_matrix.tolist()
78
  }