Spaces:
Runtime error
Runtime error
Kaleidophon
commited on
Commit
·
d2a50fa
1
Parent(s):
296c6ac
Add all files and wrapper around test function.
Browse files- README.md +60 -2
- almost_stochastic_order.py +115 -0
- app.py +6 -0
- requirements.txt +1 -0
README.md
CHANGED
@@ -1,12 +1,70 @@
|
|
1 |
---
|
2 |
title: Almost Stochastic Order
|
3 |
-
emoji:
|
4 |
colorFrom: green
|
5 |
colorTo: yellow
|
6 |
sdk: gradio
|
7 |
sdk_version: 3.13.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
|
|
|
|
|
|
|
|
|
|
10 |
---
|
11 |
|
12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
---
|
2 |
title: Almost Stochastic Order
|
3 |
+
emoji: ⚖️
|
4 |
colorFrom: green
|
5 |
colorTo: yellow
|
6 |
sdk: gradio
|
7 |
sdk_version: 3.13.0
|
8 |
app_file: app.py
|
9 |
pinned: false
|
10 |
+
tags:
|
11 |
+
- evaluate
|
12 |
+
- comparison
|
13 |
+
description: >-
|
14 |
+
  The Almost Stochastic Order test is a non-parametric test for comparing the predictions of two models.
|
15 |
---
|
16 |
|
17 |
+
# Comparison Card for Almost Stochastic Order
|
18 |
+
|
19 |
+
## Comparison description
|
20 |
+
|
21 |
+
The Almost Stochastic Order test is a non-parametric test that measures to what extent the distributions of two models' predictions differ from each other, using their Wasserstein distance. It can be used to compare the predictions of two models.
|
22 |
+
|
23 |
+
## How to use
|
24 |
+
|
25 |
+
The Almost Stochastic Order comparison is used to analyze any kind of real-valued data.
|
26 |
+
|
27 |
+
## Inputs
|
28 |
+
|
29 |
+
Its arguments are:
|
30 |
+
|
31 |
+
`predictions1`: a list of predictions from the first model.
|
32 |
+
|
33 |
+
`predictions2`: a list of predictions from the second model.
|
34 |
+
|
35 |
+
## Output values
|
36 |
+
|
37 |
+
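The Almost Stochastic Order comparison outputs one value:
|
38 |
+
|
39 |
+
`violation_ratio`: (Frequentist upper bound to) the degree of violation of the stochastic order. When it is smaller than 0.5, the model producing `predictions1` performs better than the other model at the confidence level given by the `confidence_level` argument (default is 0.95).
|
40 |
+
|
41 |
+
Ulmer et al. (2022) recommend rejecting the null hypothesis when `violation_ratio` is under 0.2.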
|
42 |
+
|
43 |
+
## Examples
|
44 |
+
|
45 |
+
Example comparison:
|
46 |
+
|
47 |
+
```python
|
48 |
+
aso = evaluate.load("almost_stochastic_order")
|
49 |
+
results = aso.compute(predictions1=[-7, 123.45, 43, 4.91, 5], predictions2=[1337.12, -9.74, 1, 2, 3.21])
|
50 |
+
print(results)
|
51 |
+
{'violation_ratio': ...}  # a float; the exact value depends on the bootstrap seed
|
52 |
+
```
|
53 |
+
|
54 |
+
## Limitations and bias
|
55 |
+
|
56 |
+
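The Almost Stochastic Order test is a non-parametric test, so it makes few assumptions about the distributions of the predictions. The returned `violation_ratio` is an upper bound computed via bootstrapping, so it can vary with the random seed and the number of bootstrap iterations. It should be used to analyze real-valued data only.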
|
57 |
+
|
58 |
+
## Citations
|
59 |
+
|
60 |
+
```bibtex
|
61 |
+
@incollection{wilcoxon1992individual,
|
62 |
+
title={Individual comparisons by ranking methods},
|
63 |
+
author={Wilcoxon, Frank},
|
64 |
+
booktitle={Breakthroughs in statistics},
|
65 |
+
pages={196--202},
|
66 |
+
year={1992},
|
67 |
+
publisher={Springer}
|
68 |
+
}
|
69 |
+
```
|
70 |
+
|
almost_stochastic_order.py
ADDED
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Copyright 2022 The HuggingFace Evaluate Authors
|
2 |
+
#
|
3 |
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
# you may not use this file except in compliance with the License.
|
5 |
+
# You may obtain a copy of the License at
|
6 |
+
#
|
7 |
+
# http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
#
|
9 |
+
# Unless required by applicable law or agreed to in writing, software
|
10 |
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
# See the License for the specific language governing permissions and
|
13 |
+
# limitations under the License.
|
14 |
+
"""Almost Stochastic Order test for model comparison."""
|
15 |
+
|
16 |
+
from typing import Optional
|
17 |
+
|
18 |
+
import datasets
|
19 |
+
from deepsig import aso
|
20 |
+
|
21 |
+
import evaluate
|
22 |
+
|
23 |
+
|
24 |
+
# Short human-readable summary of the comparison; used as the module
# description and as the first part of the class docstring.
_DESCRIPTION = """
The Almost Stochastic Order test is a non-parametric test that tests to what extent the distributions of predictions differ from each other through measuring their Wasserstein distance. It can be used to compare the predictions of two models.
"""
|
27 |
+
|
28 |
+
|
29 |
+
# Argument / return documentation for the comparison; used as the inputs
# description of the module and as the second part of the class docstring.
_KWARGS_DESCRIPTION = """
Args:
    predictions1 (`list` of `float`): Predictions for model 1.
    predictions2 (`list` of `float`): Predictions for model 2.
Kwargs:
    confidence_level (`float`): Confidence level under which the result is obtained. Default is 0.95.
    num_bootstrap_iterations (`int`): Number of bootstrap iterations to compute upper bound to test statistics. Default is 1000.
    dt (`float`): Differential for t during numerical integral calculation. Default is 0.005.
    num_jobs (`int` or None): Number of jobs to use for test. If None, this defaults to value specified in the num_process attribute.
    show_progress (`bool`): If True, a progress bar is shown when computing the test statistic. Default is False.
Returns:
    violation_ratio (`float`): (Frequentist upper bound to) Degree of violation of the stochastic order. When it is smaller than 0.5, the model producing predictions1 performs better than the other model at a confidence level specified by confidence_level argument (default is 0.95). Ulmer et al. (2022) recommend to reject the null hypothesis when violation_ratio is under 0.2.
Examples:
    >>> aso = evaluate.load("almost_stochastic_order")
    >>> results = aso.compute(predictions1=[-7, 123.45, 43, 4.91, 5], predictions2=[1337.12, -9.74, 1, 2, 3.21])
    >>> print(results)
    {'violation_ratio': ...}
"""
|
47 |
+
|
48 |
+
|
49 |
+
# BibTeX entries for the test and its implementation.
# Raw string: the entries contain LaTeX accent escapes (\', \i, \`) that must
# reach the reader verbatim instead of being interpreted as Python escapes.
_CITATION = r"""
@article{ulmer2022deep,
  title={deep-significance-Easy and Meaningful Statistical Significance Testing in the Age of Neural Networks},
  author={Ulmer, Dennis and Hardmeier, Christian and Frellsen, Jes},
  journal={arXiv preprint arXiv:2204.06815},
  year={2022}
}
@inproceedings{dror2019deep,
  author    = {Rotem Dror and
               Segev Shlomov and
               Roi Reichart},
  editor    = {Anna Korhonen and
               David R. Traum and
               Llu{\'{\i}}s M{\`{a}}rquez},
  title     = {Deep Dominance - How to Properly Compare Deep Neural Models},
  booktitle = {Proceedings of the 57th Conference of the Association for Computational
               Linguistics, {ACL} 2019, Florence, Italy, July 28-August 2, 2019,
               Volume 1: Long Papers},
  pages     = {2773--2785},
  publisher = {Association for Computational Linguistics},
  year      = {2019}
}
"""
|
72 |
+
|
73 |
+
|
74 |
+
# Comparison module that exposes deepsig's Almost Stochastic Order (ASO) test
# through the `evaluate` comparison interface.
@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class AlmostStochasticOrder(evaluate.Comparison):
    def _info(self):
        """Return the module metadata: description, citation and input features.

        Both inputs are declared as float-valued features named
        ``predictions1`` and ``predictions2``.
        """
        return evaluate.ComparisonInfo(
            module_type="comparison",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions1": datasets.Value("float"),
                    "predictions2": datasets.Value("float"),
                }
            ),
        )

    def _compute(
        self,
        predictions1,
        predictions2,
        confidence_level: float = 0.95,
        num_bootstrap_iterations: int = 1000,
        dt: float = 0.005,
        num_jobs: Optional[int] = None,
        show_progress: bool = False,
        **kwargs
    ):
        """Run the ASO test on two sets of predictions.

        Args:
            predictions1: Predictions of the first model (passed as ``scores_a``).
            predictions2: Predictions of the second model (passed as ``scores_b``).
            confidence_level: Confidence level under which the result is obtained.
            num_bootstrap_iterations: Number of bootstrap iterations.
            dt: Differential for t during numerical integration.
            num_jobs: Number of jobs to use; ``None`` falls back to
                ``self.num_process``.
            show_progress: Whether a progress bar is shown.
            **kwargs: Accepted for interface compatibility; not used here.

        Returns:
            A dict with the single key ``"violation_ratio"`` holding the
            value returned by ``aso``.
        """
        # Fall back to the module-level process count when no explicit job
        # count is requested.
        if num_jobs is None:
            num_jobs = self.num_process

        # Compute statistic. confidence_level is forwarded explicitly so the
        # documented argument actually influences the test.
        violation_ratio = aso(
            scores_a=predictions1,
            scores_b=predictions2,
            confidence_level=confidence_level,
            num_bootstrap_iterations=num_bootstrap_iterations,
            dt=dt,
            num_jobs=num_jobs,
            seed=self.seed,
            show_progress=show_progress,
        )
        return {"violation_ratio": violation_ratio}
|
app.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import evaluate
|
2 |
+
from evaluate.utils import launch_gradio_widget
|
3 |
+
|
4 |
+
|
5 |
+
# Load the "almost_stochastic_order" comparison module with evaluate.load and
# pass it to launch_gradio_widget (imported from evaluate.utils above).
module = evaluate.load("almost_stochastic_order", module_type="comparison")
|
6 |
+
launch_gradio_widget(module)
|
requirements.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
deepsig
|