djstrong committed on
Commit
39d6a74
·
1 Parent(s): 96fbe7c

update description

Browse files
Files changed (1) hide show
  1. src/about.py +7 -7
src/about.py CHANGED
@@ -33,10 +33,10 @@ class Tasks(Enum):
33
  task17 = Task("polish_cbd_regex", "f1,score-first", "cbd_g", "generate_until", 0.149)
34
  task18 = Task("polish_klej_ner_multiple_choice", "acc,none", "klej_ner_mc", "multiple_choice", 0.343)
35
  task19 = Task("polish_klej_ner_regex", "exact_match,score-first", "klej_ner_g", "generate_until", 0.343)
36
- task20 = Task("polish_poleval2018_task3_test_10k", "word_perplexity,none", "poleval2018_task3_test_10k", "other")
37
  task21 = Task("polish_polqa_reranking_multiple_choice", "acc,none", "polqa_reranking_mc", "multiple_choice", 0.5335588952710677) # multiple_choice
38
  task22 = Task("polish_polqa_open_book", "levenshtein,none", "polqa_open_book_g", "generate_until", 0.0) # generate_until
39
  task23 = Task("polish_polqa_closed_book", "levenshtein,none", "polqa_closed_book_g", "generate_until", 0.0) # generate_until
 
40
 
41
  NUM_FEWSHOT = 0 # Change with your few shot
42
  # ---------------------------------------------------
@@ -58,9 +58,11 @@ TITLE = """
58
  INTRODUCTION_TEXT = """
59
  The leaderboard evaluates language models on a set of Polish tasks. The tasks are designed to test the models' ability to understand and generate Polish text. The leaderboard is designed to be a benchmark for the Polish language model community, and to help researchers and practitioners understand the capabilities of different models.
60
 
61
- Almost every task has two versions: regex and multiple choice. The regex version is scored based on exact match, while the multiple choice version is scored based on accuracy.
62
  * _g suffix means that a model needs to generate an answer (only suitable for instructions-based models)
63
  * _mc suffix means that a model is scored against every possible class (suitable also for base models)
 
 
64
  """
65
 
66
  # Which evaluations are you running? how can people reproduce what you have?
@@ -75,11 +77,9 @@ or join our [Discord SpeakLeash](https://discord.gg/3G9DVM39)
75
 
76
  * fix long model names
77
  * add inference time
78
- * add metadata for models (e.g. #Params)
79
  * add more tasks
80
  * use model templates
81
  * fix scrolling on Firefox
82
- * polish_poleval2018_task3_test_10k - IN PROGRESS
83
 
84
  ## Tasks
85
 
@@ -103,10 +103,10 @@ or join our [Discord SpeakLeash](https://discord.gg/3G9DVM39)
103
  | cbd_g | ptaszynski/PolishCyberbullyingDataset | macro F1 | generate_until |
104
  | klej_ner_mc | allegro/klej-nkjp-ner | accuracy | multiple_choice |
105
  | klej_ner_g | allegro/klej-nkjp-ner | accuracy | generate_until |
 
 
 
106
  | poleval2018_task3_test_10k | enelpol/poleval2018_task3_test_10k | word perplexity | other |
107
- | polqa_reranking_mc | ipipan/polqa | accuracy | other |
108
- | polqa_open_book_g | ipipan/polqa | levenshtein | other |
109
- | polqa_closed_book_g | ipipan/polqa | levenshtein | other |
110
 
111
  ## Reproducibility
112
  To reproduce our results, you need to clone the repository:
 
33
  task17 = Task("polish_cbd_regex", "f1,score-first", "cbd_g", "generate_until", 0.149)
34
  task18 = Task("polish_klej_ner_multiple_choice", "acc,none", "klej_ner_mc", "multiple_choice", 0.343)
35
  task19 = Task("polish_klej_ner_regex", "exact_match,score-first", "klej_ner_g", "generate_until", 0.343)
 
36
  task21 = Task("polish_polqa_reranking_multiple_choice", "acc,none", "polqa_reranking_mc", "multiple_choice", 0.5335588952710677) # multiple_choice
37
  task22 = Task("polish_polqa_open_book", "levenshtein,none", "polqa_open_book_g", "generate_until", 0.0) # generate_until
38
  task23 = Task("polish_polqa_closed_book", "levenshtein,none", "polqa_closed_book_g", "generate_until", 0.0) # generate_until
39
+ task20 = Task("polish_poleval2018_task3_test_10k", "word_perplexity,none", "poleval2018_task3_test_10k", "other")
40
 
41
  NUM_FEWSHOT = 0 # Change with your few shot
42
  # ---------------------------------------------------
 
58
  INTRODUCTION_TEXT = """
59
  The leaderboard evaluates language models on a set of Polish tasks. The tasks are designed to test the models' ability to understand and generate Polish text. The leaderboard is designed to be a benchmark for the Polish language model community, and to help researchers and practitioners understand the capabilities of different models.
60
 
61
+ Almost every task has two versions: regex and multiple choice.
62
  * _g suffix means that a model needs to generate an answer (only suitable for instructions-based models)
63
  * _mc suffix means that a model is scored against every possible class (suitable also for base models)
64
+
65
+ Average columns are normalized against scores by "Baseline (majority class)".
66
  """
67
 
68
  # Which evaluations are you running? how can people reproduce what you have?
 
77
 
78
  * fix long model names
79
  * add inference time
 
80
  * add more tasks
81
  * use model templates
82
  * fix scrolling on Firefox
 
83
 
84
  ## Tasks
85
 
 
103
  | cbd_g | ptaszynski/PolishCyberbullyingDataset | macro F1 | generate_until |
104
  | klej_ner_mc | allegro/klej-nkjp-ner | accuracy | multiple_choice |
105
  | klej_ner_g | allegro/klej-nkjp-ner | accuracy | generate_until |
106
+ | polqa_reranking_mc | ipipan/polqa | accuracy | multiple_choice |
107
+ | polqa_open_book_g | ipipan/polqa | levenshtein | generate_until |
108
+ | polqa_closed_book_g | ipipan/polqa | levenshtein | generate_until |
109
  | poleval2018_task3_test_10k | enelpol/poleval2018_task3_test_10k | word perplexity | other |
 
 
 
110
 
111
  ## Reproducibility
112
  To reproduce our results, you need to clone the repository: