Spaces:
Restarting
on
CPU Upgrade
Restarting
on
CPU Upgrade
update description
Browse files — src/about.py (+7 −7)
src/about.py
CHANGED
@@ -33,10 +33,10 @@ class Tasks(Enum):
|
|
33 |
task17 = Task("polish_cbd_regex", "f1,score-first", "cbd_g", "generate_until", 0.149)
|
34 |
task18 = Task("polish_klej_ner_multiple_choice", "acc,none", "klej_ner_mc", "multiple_choice", 0.343)
|
35 |
task19 = Task("polish_klej_ner_regex", "exact_match,score-first", "klej_ner_g", "generate_until", 0.343)
|
36 |
-
task20 = Task("polish_poleval2018_task3_test_10k", "word_perplexity,none", "poleval2018_task3_test_10k", "other")
|
37 |
task21 = Task("polish_polqa_reranking_multiple_choice", "acc,none", "polqa_reranking_mc", "multiple_choice", 0.5335588952710677) # multiple_choice
|
38 |
task22 = Task("polish_polqa_open_book", "levenshtein,none", "polqa_open_book_g", "generate_until", 0.0) # generate_until
|
39 |
task23 = Task("polish_polqa_closed_book", "levenshtein,none", "polqa_closed_book_g", "generate_until", 0.0) # generate_until
|
|
|
40 |
|
41 |
NUM_FEWSHOT = 0 # Change with your few shot
|
42 |
# ---------------------------------------------------
|
@@ -58,9 +58,11 @@ TITLE = """
|
|
58 |
INTRODUCTION_TEXT = """
|
59 |
The leaderboard evaluates language models on a set of Polish tasks. The tasks are designed to test the models' ability to understand and generate Polish text. The leaderboard is designed to be a benchmark for the Polish language model community, and to help researchers and practitioners understand the capabilities of different models.
|
60 |
|
61 |
-
Almost every task has two versions: regex and multiple choice.
|
62 |
* _g suffix means that a model needs to generate an answer (only suitable for instruction-based models)
|
63 |
* _mc suffix means that a model is scored against every possible class (suitable also for base models)
|
|
|
|
|
64 |
"""
|
65 |
|
66 |
# Which evaluations are you running? how can people reproduce what you have?
|
@@ -75,11 +77,9 @@ or join our [Discord SpeakLeash](https://discord.gg/3G9DVM39)
|
|
75 |
|
76 |
* fix long model names
|
77 |
* add inference time
|
78 |
-
* add metadata for models (e.g. #Params)
|
79 |
* add more tasks
|
80 |
* use model templates
|
81 |
* fix scrolling on Firefox
|
82 |
-
* polish_poleval2018_task3_test_10k - IN PROGRESS
|
83 |
|
84 |
## Tasks
|
85 |
|
@@ -103,10 +103,10 @@ or join our [Discord SpeakLeash](https://discord.gg/3G9DVM39)
|
|
103 |
| cbd_g | ptaszynski/PolishCyberbullyingDataset | macro F1 | generate_until |
|
104 |
| klej_ner_mc | allegro/klej-nkjp-ner | accuracy | multiple_choice |
|
105 |
| klej_ner_g | allegro/klej-nkjp-ner | accuracy | generate_until |
|
|
|
|
|
|
|
106 |
| poleval2018_task3_test_10k | enelpol/poleval2018_task3_test_10k | word perplexity | other |
|
107 |
-
| polqa_reranking_mc | ipipan/polqa | accuracy | other |
|
108 |
-
| polqa_open_book_g | ipipan/polqa | levenshtein | other |
|
109 |
-
| polqa_closed_book_g | ipipan/polqa | levenshtein | other |
|
110 |
|
111 |
## Reproducibility
|
112 |
To reproduce our results, you need to clone the repository:
|
|
|
33 |
task17 = Task("polish_cbd_regex", "f1,score-first", "cbd_g", "generate_until", 0.149)
|
34 |
task18 = Task("polish_klej_ner_multiple_choice", "acc,none", "klej_ner_mc", "multiple_choice", 0.343)
|
35 |
task19 = Task("polish_klej_ner_regex", "exact_match,score-first", "klej_ner_g", "generate_until", 0.343)
|
|
|
36 |
task21 = Task("polish_polqa_reranking_multiple_choice", "acc,none", "polqa_reranking_mc", "multiple_choice", 0.5335588952710677) # multiple_choice
|
37 |
task22 = Task("polish_polqa_open_book", "levenshtein,none", "polqa_open_book_g", "generate_until", 0.0) # generate_until
|
38 |
task23 = Task("polish_polqa_closed_book", "levenshtein,none", "polqa_closed_book_g", "generate_until", 0.0) # generate_until
|
39 |
+
task20 = Task("polish_poleval2018_task3_test_10k", "word_perplexity,none", "poleval2018_task3_test_10k", "other")
|
40 |
|
41 |
NUM_FEWSHOT = 0 # Change with your few shot
|
42 |
# ---------------------------------------------------
|
|
|
58 |
INTRODUCTION_TEXT = """
|
59 |
The leaderboard evaluates language models on a set of Polish tasks. The tasks are designed to test the models' ability to understand and generate Polish text. The leaderboard is designed to be a benchmark for the Polish language model community, and to help researchers and practitioners understand the capabilities of different models.
|
60 |
|
61 |
+
Almost every task has two versions: regex and multiple choice.
|
62 |
* _g suffix means that a model needs to generate an answer (only suitable for instruction-based models)
|
63 |
* _mc suffix means that a model is scored against every possible class (suitable also for base models)
|
64 |
+
|
65 |
+
Average columns are normalized against the scores of the "Baseline (majority class)".
|
66 |
"""
|
67 |
|
68 |
# Which evaluations are you running? how can people reproduce what you have?
|
|
|
77 |
|
78 |
* fix long model names
|
79 |
* add inference time
|
|
|
80 |
* add more tasks
|
81 |
* use model templates
|
82 |
* fix scrolling on Firefox
|
|
|
83 |
|
84 |
## Tasks
|
85 |
|
|
|
103 |
| cbd_g | ptaszynski/PolishCyberbullyingDataset | macro F1 | generate_until |
|
104 |
| klej_ner_mc | allegro/klej-nkjp-ner | accuracy | multiple_choice |
|
105 |
| klej_ner_g | allegro/klej-nkjp-ner | accuracy | generate_until |
|
106 |
+
| polqa_reranking_mc | ipipan/polqa | accuracy | multiple_choice |
|
107 |
+
| polqa_open_book_g | ipipan/polqa | levenshtein | generate_until |
|
108 |
+
| polqa_closed_book_g | ipipan/polqa | levenshtein | generate_until |
|
109 |
| poleval2018_task3_test_10k | enelpol/poleval2018_task3_test_10k | word perplexity | other |
|
|
|
|
|
|
|
110 |
|
111 |
## Reproducibility
|
112 |
To reproduce our results, you need to clone the repository:
|