diff --git a/4b284b12bc4/evaluation/4b284b12bc4_0.json b/4b284b12bc4/evaluation/4b284b12bc4_0.json new file mode 100644 index 0000000000000000000000000000000000000000..99f874d3a25c8315a5d493aed9776de54f8fc547 --- /dev/null +++ b/4b284b12bc4/evaluation/4b284b12bc4_0.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.335, + "acc_stderr": 0.014933117490932575 + }, + "anli_r2": { + "acc": 0.334, + "acc_stderr": 0.014922019523732961 + }, + "anli_r3": { + "acc": 0.3491666666666667, + "acc_stderr": 0.013767075395077249 + }, + "cb": { + "acc": 0.39285714285714285, + "acc_stderr": 0.0658538889806635, + "f1": 0.23306878306878312 + }, + "copa": { + "acc": 0.77, + "acc_stderr": 0.04229525846816506 + }, + "hellaswag": { + "acc": 0.4695279824736108, + "acc_stderr": 0.0049805063294075845, + "acc_norm": 0.6132244572794264, + "acc_norm_stderr": 0.004860162076330956 + }, + "rte": { + "acc": 0.5812274368231047, + "acc_stderr": 0.02969666108123484 + }, + "winogrande": { + "acc": 0.5753749013417522, + "acc_stderr": 0.013891893150264218 + }, + "storycloze_2016": { + "acc": 0.711918760021379, + "acc_stderr": 0.010472537019822578 + }, + "boolq": { + "acc": 0.5464831804281346, + "acc_stderr": 0.008707182331111644 + }, + "arc_easy": { + "acc": 0.5538720538720538, + "acc_stderr": 0.01020005782876501, + "acc_norm": 0.4936868686868687, + "acc_norm_stderr": 0.01025896566804443 + }, + "arc_challenge": { + "acc": 0.2636518771331058, + "acc_stderr": 0.012875929151297049, + "acc_norm": 0.2883959044368601, + "acc_norm_stderr": 0.013238394422428175 + }, + "sciq": { + "acc": 0.82, + "acc_stderr": 0.012155153135511965, + "acc_norm": 0.749, + "acc_norm_stderr": 0.013718133516888921 + }, + "piqa": { + "acc": 0.73449401523395, + "acc_stderr": 0.010303308653024429, + "acc_norm": 0.7475516866158868, + "acc_norm_stderr": 0.010135665547362354 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/4b284b12bc4_0_lm-eval_global_step80108_2023-01-30-11-23-34_0shots_backup.json b/4b284b12bc4/evaluation/4b284b12bc4_0_lm-eval_global_step80108_2023-01-30-11-23-34_0shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..99f874d3a25c8315a5d493aed9776de54f8fc547 --- /dev/null +++ b/4b284b12bc4/evaluation/4b284b12bc4_0_lm-eval_global_step80108_2023-01-30-11-23-34_0shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.335, + "acc_stderr": 0.014933117490932575 + }, + "anli_r2": { + "acc": 0.334, + "acc_stderr": 0.014922019523732961 + }, + "anli_r3": { + "acc": 0.3491666666666667, + "acc_stderr": 0.013767075395077249 + }, + "cb": { + "acc": 0.39285714285714285, + "acc_stderr": 0.0658538889806635, + "f1": 0.23306878306878312 + }, + "copa": { + "acc": 0.77, + "acc_stderr": 0.04229525846816506 + }, + "hellaswag": { + "acc": 0.4695279824736108, + "acc_stderr": 0.0049805063294075845, + "acc_norm": 0.6132244572794264, + "acc_norm_stderr": 0.004860162076330956 + }, + "rte": { + "acc": 0.5812274368231047, + "acc_stderr": 0.02969666108123484 + }, + "winogrande": { + "acc": 0.5753749013417522, + "acc_stderr": 0.013891893150264218 + }, + "storycloze_2016": { + "acc": 0.711918760021379, + "acc_stderr": 0.010472537019822578 + }, + "boolq": { + "acc": 0.5464831804281346, + "acc_stderr": 0.008707182331111644 + }, + "arc_easy": { + "acc": 0.5538720538720538, + "acc_stderr": 0.01020005782876501, + "acc_norm": 0.4936868686868687, + "acc_norm_stderr": 0.01025896566804443 + }, + "arc_challenge": { + "acc": 0.2636518771331058, + "acc_stderr": 0.012875929151297049, + "acc_norm": 0.2883959044368601, + "acc_norm_stderr": 0.013238394422428175 + }, + "sciq": { + "acc": 0.82, + "acc_stderr": 0.012155153135511965, + "acc_norm": 0.749, + "acc_norm_stderr": 0.013718133516888921 + }, + "piqa": { + "acc": 0.73449401523395, + "acc_stderr": 0.010303308653024429, + "acc_norm": 0.7475516866158868, + "acc_norm_stderr": 0.010135665547362354 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/4b284b12bc4_1.json b/4b284b12bc4/evaluation/4b284b12bc4_1.json new file mode 100644 index 0000000000000000000000000000000000000000..766eff5aaab727fcee1b34569d0050e6bdf55b46 --- /dev/null +++ b/4b284b12bc4/evaluation/4b284b12bc4_1.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.333, + "acc_stderr": 0.014910846164229868 + }, + "anli_r2": { + "acc": 0.326, + "acc_stderr": 0.01483050720454104 + }, + "anli_r3": { + "acc": 0.3475, + "acc_stderr": 0.013751753243291852 + }, + "cb": { + "acc": 0.5357142857142857, + "acc_stderr": 0.06724777654937658, + "f1": 0.37227304714989445 + }, + "copa": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256 + }, + "hellaswag": { + "acc": 0.47191794463254333, + "acc_stderr": 0.004981905293878145, + "acc_norm": 0.6139215295757817, + "acc_norm_stderr": 0.004858539527872466 + }, + "rte": { + "acc": 0.5703971119133574, + "acc_stderr": 0.029796668829124674 + }, + "winogrande": { + "acc": 0.5706393054459353, + "acc_stderr": 0.013911537499969163 + }, + "storycloze_2016": { + "acc": 0.7151256012827365, + "acc_stderr": 0.01043751398661172 + }, + "boolq": { + "acc": 0.5669724770642202, + "acc_stderr": 0.00866625130551806 + }, + "arc_easy": { + "acc": 0.5913299663299664, + "acc_stderr": 0.010087174498762883, + "acc_norm": 0.5496632996632996, + "acc_norm_stderr": 0.010209047724374145 + }, + "arc_challenge": { + "acc": 0.2627986348122867, + "acc_stderr": 0.012862523175351333, + "acc_norm": 0.30716723549488056, + "acc_norm_stderr": 0.013481034054980943 + }, + "sciq": { + "acc": 0.836, + "acc_stderr": 0.011715000693181331, + "acc_norm": 0.781, + "acc_norm_stderr": 0.013084731950262012 + }, + "piqa": { + "acc": 0.7448313384113167, + "acc_stderr": 0.010171571592521822, + "acc_norm": 0.7535364526659413, + "acc_norm_stderr": 0.01005481078967181 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/4b284b12bc4_1_lm-eval_global_step80108_2023-01-30-11-26-32_1shots_backup.json b/4b284b12bc4/evaluation/4b284b12bc4_1_lm-eval_global_step80108_2023-01-30-11-26-32_1shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..766eff5aaab727fcee1b34569d0050e6bdf55b46 --- /dev/null +++ b/4b284b12bc4/evaluation/4b284b12bc4_1_lm-eval_global_step80108_2023-01-30-11-26-32_1shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.333, + "acc_stderr": 0.014910846164229868 + }, + "anli_r2": { + "acc": 0.326, + "acc_stderr": 0.01483050720454104 + }, + "anli_r3": { + "acc": 0.3475, + "acc_stderr": 0.013751753243291852 + }, + "cb": { + "acc": 0.5357142857142857, + "acc_stderr": 0.06724777654937658, + "f1": 0.37227304714989445 + }, + "copa": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256 + }, + "hellaswag": { + "acc": 0.47191794463254333, + "acc_stderr": 0.004981905293878145, + "acc_norm": 0.6139215295757817, + "acc_norm_stderr": 0.004858539527872466 + }, + "rte": { + "acc": 0.5703971119133574, + "acc_stderr": 0.029796668829124674 + }, + "winogrande": { + "acc": 0.5706393054459353, + "acc_stderr": 0.013911537499969163 + }, + "storycloze_2016": { + "acc": 0.7151256012827365, + "acc_stderr": 0.01043751398661172 + }, + "boolq": { + "acc": 0.5669724770642202, + "acc_stderr": 0.00866625130551806 + }, + "arc_easy": { + "acc": 0.5913299663299664, + "acc_stderr": 0.010087174498762883, + "acc_norm": 0.5496632996632996, + "acc_norm_stderr": 0.010209047724374145 + }, + "arc_challenge": { + "acc": 0.2627986348122867, + "acc_stderr": 0.012862523175351333, + "acc_norm": 0.30716723549488056, + "acc_norm_stderr": 0.013481034054980943 + }, + "sciq": { + "acc": 0.836, + "acc_stderr": 0.011715000693181331, + "acc_norm": 0.781, + "acc_norm_stderr": 0.013084731950262012 + }, + "piqa": { + "acc": 0.7448313384113167, + "acc_stderr": 0.010171571592521822, + "acc_norm": 0.7535364526659413, + "acc_norm_stderr": 0.01005481078967181 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/4b284b12bc4_2.json b/4b284b12bc4/evaluation/4b284b12bc4_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4adaea156e9fa3d315781bc5dc84f4c8de4c462b --- /dev/null +++ b/4b284b12bc4/evaluation/4b284b12bc4_2.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.329, + "acc_stderr": 0.014865395385928354 + }, + "anli_r2": { + "acc": 0.336, + "acc_stderr": 0.014944140233795027 + }, + "anli_r3": { + "acc": 0.3383333333333333, + "acc_stderr": 0.013664144006618266 + }, + "cb": { + "acc": 0.48214285714285715, + "acc_stderr": 0.06737697508644648, + "f1": 0.3338011695906433 + }, + "copa": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256 + }, + "hellaswag": { + "acc": 0.4697271459868552, + "acc_stderr": 0.004980627287147585, + "acc_norm": 0.6141206930890261, + "acc_norm_stderr": 0.004858074013443988 + }, + "rte": { + "acc": 0.5523465703971119, + "acc_stderr": 0.02993107036293953 + }, + "winogrande": { + "acc": 0.574585635359116, + "acc_stderr": 0.013895257666646378 + }, + "storycloze_2016": { + "acc": 0.7156600748262961, + "acc_stderr": 0.010431614128665253 + }, + "boolq": { + "acc": 0.5660550458715596, + "acc_stderr": 0.008668405003744129 + }, + "arc_easy": { + "acc": 0.5993265993265994, + "acc_stderr": 0.01005530447425557, + "acc_norm": 0.5576599326599326, + "acc_norm_stderr": 0.01019133444422085 + }, + "arc_challenge": { + "acc": 0.2781569965870307, + "acc_stderr": 0.013094469919538805, + "acc_norm": 0.30887372013651876, + "acc_norm_stderr": 0.013501770929344003 + }, + "sciq": { + "acc": 0.835, + "acc_stderr": 0.011743632866916145, + "acc_norm": 0.79, + "acc_norm_stderr": 0.01288666233227453 + }, + "piqa": { + "acc": 0.7470076169749728, + "acc_stderr": 0.01014288869886246, + "acc_norm": 0.7519042437431991, + "acc_norm_stderr": 0.010077118315574706 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/4b284b12bc4_2_lm-eval_global_step80108_2023-01-30-11-26-32_2shots_backup.json b/4b284b12bc4/evaluation/4b284b12bc4_2_lm-eval_global_step80108_2023-01-30-11-26-32_2shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..4adaea156e9fa3d315781bc5dc84f4c8de4c462b --- /dev/null +++ b/4b284b12bc4/evaluation/4b284b12bc4_2_lm-eval_global_step80108_2023-01-30-11-26-32_2shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.329, + "acc_stderr": 0.014865395385928354 + }, + "anli_r2": { + "acc": 0.336, + "acc_stderr": 0.014944140233795027 + }, + "anli_r3": { + "acc": 0.3383333333333333, + "acc_stderr": 0.013664144006618266 + }, + "cb": { + "acc": 0.48214285714285715, + "acc_stderr": 0.06737697508644648, + "f1": 0.3338011695906433 + }, + "copa": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256 + }, + "hellaswag": { + "acc": 0.4697271459868552, + "acc_stderr": 0.004980627287147585, + "acc_norm": 0.6141206930890261, + "acc_norm_stderr": 0.004858074013443988 + }, + "rte": { + "acc": 0.5523465703971119, + "acc_stderr": 0.02993107036293953 + }, + "winogrande": { + "acc": 0.574585635359116, + "acc_stderr": 0.013895257666646378 + }, + "storycloze_2016": { + "acc": 0.7156600748262961, + "acc_stderr": 0.010431614128665253 + }, + "boolq": { + "acc": 0.5660550458715596, + "acc_stderr": 0.008668405003744129 + }, + "arc_easy": { + "acc": 0.5993265993265994, + "acc_stderr": 0.01005530447425557, + "acc_norm": 0.5576599326599326, + "acc_norm_stderr": 0.01019133444422085 + }, + "arc_challenge": { + "acc": 0.2781569965870307, + "acc_stderr": 0.013094469919538805, + "acc_norm": 0.30887372013651876, + "acc_norm_stderr": 0.013501770929344003 + }, + "sciq": { + "acc": 0.835, + "acc_stderr": 0.011743632866916145, + "acc_norm": 0.79, + "acc_norm_stderr": 0.01288666233227453 + }, + "piqa": { + "acc": 0.7470076169749728, + "acc_stderr": 0.01014288869886246, + "acc_norm": 0.7519042437431991, + "acc_norm_stderr": 0.010077118315574706 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/4b284b12bc4_3.json b/4b284b12bc4/evaluation/4b284b12bc4_3.json new file mode 100644 index 0000000000000000000000000000000000000000..cfec1a1379ef3785a474c68e2d94a790aae2ea7e --- /dev/null +++ b/4b284b12bc4/evaluation/4b284b12bc4_3.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.332, + "acc_stderr": 0.014899597242811485 + }, + "anli_r2": { + "acc": 0.334, + "acc_stderr": 0.014922019523732963 + }, + "anli_r3": { + "acc": 0.35, + "acc_stderr": 0.013774667009018554 + }, + "cb": { + "acc": 0.6071428571428571, + "acc_stderr": 0.0658538889806635, + "f1": 0.42400932400932395 + }, + "copa": { + "acc": 0.81, + "acc_stderr": 0.03942772444036622 + }, + "hellaswag": { + "acc": 0.47241585341565423, + "acc_stderr": 0.004982182323923561, + "acc_norm": 0.6199960167297351, + "acc_norm_stderr": 0.004843954338451449 + }, + "rte": { + "acc": 0.5379061371841155, + "acc_stderr": 0.030009848912529113 + }, + "winogrande": { + "acc": 0.5737963693764798, + "acc_stderr": 0.013898585965412338 + }, + "storycloze_2016": { + "acc": 0.7124532335649385, + "acc_stderr": 0.010466744473098363 + }, + "boolq": { + "acc": 0.5587155963302752, + "acc_stderr": 0.008684548127832637 + }, + "arc_easy": { + "acc": 0.5955387205387206, + "acc_stderr": 0.010070746648278783, + "acc_norm": 0.5740740740740741, + "acc_norm_stderr": 0.010146568651002255 + }, + "arc_challenge": { + "acc": 0.2815699658703072, + "acc_stderr": 0.013143376735009022, + "acc_norm": 0.3122866894197952, + "acc_norm_stderr": 0.013542598541688067 + }, + "sciq": { + "acc": 0.841, + "acc_stderr": 0.01156947936827129, + "acc_norm": 0.796, + "acc_norm_stderr": 0.012749374359024384 + }, + "piqa": { + "acc": 0.7513601741022851, + "acc_stderr": 0.01008451123429685, + "acc_norm": 0.7578890097932536, + "acc_norm_stderr": 0.009994371269104397 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/4b284b12bc4_3_lm-eval_global_step80108_2023-01-30-11-26-31_3shots_backup.json b/4b284b12bc4/evaluation/4b284b12bc4_3_lm-eval_global_step80108_2023-01-30-11-26-31_3shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..cfec1a1379ef3785a474c68e2d94a790aae2ea7e --- /dev/null +++ b/4b284b12bc4/evaluation/4b284b12bc4_3_lm-eval_global_step80108_2023-01-30-11-26-31_3shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.332, + "acc_stderr": 0.014899597242811485 + }, + "anli_r2": { + "acc": 0.334, + "acc_stderr": 0.014922019523732963 + }, + "anli_r3": { + "acc": 0.35, + "acc_stderr": 0.013774667009018554 + }, + "cb": { + "acc": 0.6071428571428571, + "acc_stderr": 0.0658538889806635, + "f1": 0.42400932400932395 + }, + "copa": { + "acc": 0.81, + "acc_stderr": 0.03942772444036622 + }, + "hellaswag": { + "acc": 0.47241585341565423, + "acc_stderr": 0.004982182323923561, + "acc_norm": 0.6199960167297351, + "acc_norm_stderr": 0.004843954338451449 + }, + "rte": { + "acc": 0.5379061371841155, + "acc_stderr": 0.030009848912529113 + }, + "winogrande": { + "acc": 0.5737963693764798, + "acc_stderr": 0.013898585965412338 + }, + "storycloze_2016": { + "acc": 0.7124532335649385, + "acc_stderr": 0.010466744473098363 + }, + "boolq": { + "acc": 0.5587155963302752, + "acc_stderr": 0.008684548127832637 + }, + "arc_easy": { + "acc": 0.5955387205387206, + "acc_stderr": 0.010070746648278783, + "acc_norm": 0.5740740740740741, + "acc_norm_stderr": 0.010146568651002255 + }, + "arc_challenge": { + "acc": 0.2815699658703072, + "acc_stderr": 0.013143376735009022, + "acc_norm": 0.3122866894197952, + "acc_norm_stderr": 0.013542598541688067 + }, + "sciq": { + "acc": 0.841, + "acc_stderr": 0.01156947936827129, + "acc_norm": 0.796, + "acc_norm_stderr": 0.012749374359024384 + }, + "piqa": { + "acc": 0.7513601741022851, + "acc_stderr": 0.01008451123429685, + "acc_norm": 0.7578890097932536, + "acc_norm_stderr": 0.009994371269104397 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/4b284b12bc4_4.json b/4b284b12bc4/evaluation/4b284b12bc4_4.json new file mode 100644 index 0000000000000000000000000000000000000000..362c01eb141bc03ddaa0a42615c78c3520a1c857 --- /dev/null +++ b/4b284b12bc4/evaluation/4b284b12bc4_4.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.337, + "acc_stderr": 0.014955087918653603 + }, + "anli_r2": { + "acc": 0.349, + "acc_stderr": 0.015080663991563102 + }, + "anli_r3": { + "acc": 0.36666666666666664, + "acc_stderr": 0.013916893275819938 + }, + "cb": { + "acc": 0.44642857142857145, + "acc_stderr": 0.067031892279424, + "f1": 0.3176100628930817 + }, + "copa": { + "acc": 0.8, + "acc_stderr": 0.040201512610368445 + }, + "hellaswag": { + "acc": 0.4722166899024099, + "acc_stderr": 0.004982072108448081, + "acc_norm": 0.6184027086237801, + "acc_norm_stderr": 0.004847857546957481 + }, + "rte": { + "acc": 0.5379061371841155, + "acc_stderr": 0.03000984891252911 + }, + "winogrande": { + "acc": 0.56353591160221, + "acc_stderr": 0.013938569465677023 + }, + "storycloze_2016": { + "acc": 0.7194013896312133, + "acc_stderr": 0.010389809647288821 + }, + "boolq": { + "acc": 0.5636085626911315, + "acc_stderr": 0.008674000467432068 + }, + "arc_easy": { + "acc": 0.6039562289562289, + "acc_stderr": 0.010035580962097942, + "acc_norm": 0.5702861952861953, + "acc_norm_stderr": 0.010157908005763674 + }, + "arc_challenge": { + "acc": 0.2790102389078498, + "acc_stderr": 0.013106784883601346, + "acc_norm": 0.3165529010238908, + "acc_norm_stderr": 0.013592431519068077 + }, + "sciq": { + "acc": 0.842, + "acc_stderr": 0.011539894677559568, + "acc_norm": 0.789, + "acc_norm_stderr": 0.012909130321042092 + }, + "piqa": { + "acc": 0.7431991294885746, + "acc_stderr": 0.010192864802278045, + "acc_norm": 0.7568008705114254, + "acc_norm_stderr": 0.010009611953858915 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/4b284b12bc4_4_lm-eval_global_step80108_2023-01-30-11-26-32_4shots_backup.json b/4b284b12bc4/evaluation/4b284b12bc4_4_lm-eval_global_step80108_2023-01-30-11-26-32_4shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..362c01eb141bc03ddaa0a42615c78c3520a1c857 --- /dev/null +++ b/4b284b12bc4/evaluation/4b284b12bc4_4_lm-eval_global_step80108_2023-01-30-11-26-32_4shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.337, + "acc_stderr": 0.014955087918653603 + }, + "anli_r2": { + "acc": 0.349, + "acc_stderr": 0.015080663991563102 + }, + "anli_r3": { + "acc": 0.36666666666666664, + "acc_stderr": 0.013916893275819938 + }, + "cb": { + "acc": 0.44642857142857145, + "acc_stderr": 0.067031892279424, + "f1": 0.3176100628930817 + }, + "copa": { + "acc": 0.8, + "acc_stderr": 0.040201512610368445 + }, + "hellaswag": { + "acc": 0.4722166899024099, + "acc_stderr": 0.004982072108448081, + "acc_norm": 0.6184027086237801, + "acc_norm_stderr": 0.004847857546957481 + }, + "rte": { + "acc": 0.5379061371841155, + "acc_stderr": 0.03000984891252911 + }, + "winogrande": { + "acc": 0.56353591160221, + "acc_stderr": 0.013938569465677023 + }, + "storycloze_2016": { + "acc": 0.7194013896312133, + "acc_stderr": 0.010389809647288821 + }, + "boolq": { + "acc": 0.5636085626911315, + "acc_stderr": 0.008674000467432068 + }, + "arc_easy": { + "acc": 0.6039562289562289, + "acc_stderr": 0.010035580962097942, + "acc_norm": 0.5702861952861953, + "acc_norm_stderr": 0.010157908005763674 + }, + "arc_challenge": { + "acc": 0.2790102389078498, + "acc_stderr": 0.013106784883601346, + "acc_norm": 0.3165529010238908, + "acc_norm_stderr": 0.013592431519068077 + }, + "sciq": { + "acc": 0.842, + "acc_stderr": 0.011539894677559568, + "acc_norm": 0.789, + "acc_norm_stderr": 0.012909130321042092 + }, + "piqa": { + "acc": 0.7431991294885746, + "acc_stderr": 0.010192864802278045, + "acc_norm": 0.7568008705114254, + "acc_norm_stderr": 0.010009611953858915 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/4b284b12bc4_5.json b/4b284b12bc4/evaluation/4b284b12bc4_5.json new file mode 100644 index 0000000000000000000000000000000000000000..013a059067dc8e5e4a9354909d895aeb9562a851 --- /dev/null +++ b/4b284b12bc4/evaluation/4b284b12bc4_5.json @@ -0,0 +1,66 @@ +{ + "results": { + "anli_r1": { + "acc": 0.332, + "acc_stderr": 0.014899597242811487 + }, + "anli_r2": { + "acc": 0.329, + "acc_stderr": 0.014865395385928357 + }, + "anli_r3": { + "acc": 0.3541666666666667, + "acc_stderr": 0.013811933499570954 + }, + "cb": { + "acc": 0.5535714285714286, + "acc_stderr": 0.06703189227942395, + "f1": 0.38376730002345766 + }, + "copa": { + "acc": 0.81, + "acc_stderr": 0.03942772444036623 + }, + "hellaswag": { + "acc": 0.47400916152160927, + "acc_stderr": 0.004983035420235716, + "acc_norm": 0.619896434973113, + "acc_norm_stderr": 0.004844199910173026 + }, + "rte": { + "acc": 0.516245487364621, + "acc_stderr": 0.030080573208738064 + }, + "winogrande": { + "acc": 0.5722178374112076, + "acc_stderr": 0.013905134013839944 + }, + "storycloze_2016": { + "acc": 0.7177979690005345, + "acc_stderr": 0.010407834479647675 + }, + "boolq": { + "acc": 0.5648318042813456, + "acc_stderr": 0.008671229580582118 + }, + "arc_easy": { + "acc": 0.5997474747474747, + "acc_stderr": 0.010053550119896127, + "acc_norm": 0.569023569023569, + "acc_norm_stderr": 0.010161552863493746 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/4b284b12bc4_5_lm-eval_global_step80108_2023-01-30-11-26-32_5shots_backup.json b/4b284b12bc4/evaluation/4b284b12bc4_5_lm-eval_global_step80108_2023-01-30-11-26-32_5shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..013a059067dc8e5e4a9354909d895aeb9562a851 --- /dev/null +++ b/4b284b12bc4/evaluation/4b284b12bc4_5_lm-eval_global_step80108_2023-01-30-11-26-32_5shots_backup.json @@ -0,0 +1,66 @@ +{ + "results": { + "anli_r1": { + "acc": 0.332, + "acc_stderr": 0.014899597242811487 + }, + "anli_r2": { + "acc": 0.329, + "acc_stderr": 0.014865395385928357 + }, + "anli_r3": { + "acc": 0.3541666666666667, + "acc_stderr": 0.013811933499570954 + }, + "cb": { + "acc": 0.5535714285714286, + "acc_stderr": 0.06703189227942395, + "f1": 0.38376730002345766 + }, + "copa": { + "acc": 0.81, + "acc_stderr": 0.03942772444036623 + }, + "hellaswag": { + "acc": 0.47400916152160927, + "acc_stderr": 0.004983035420235716, + "acc_norm": 0.619896434973113, + "acc_norm_stderr": 0.004844199910173026 + }, + "rte": { + "acc": 0.516245487364621, + "acc_stderr": 0.030080573208738064 + }, + "winogrande": { + "acc": 0.5722178374112076, + "acc_stderr": 0.013905134013839944 + }, + "storycloze_2016": { + "acc": 0.7177979690005345, + "acc_stderr": 0.010407834479647675 + }, + "boolq": { + "acc": 0.5648318042813456, + "acc_stderr": 0.008671229580582118 + }, + "arc_easy": { + "acc": 0.5997474747474747, + "acc_stderr": 0.010053550119896127, + "acc_norm": 0.569023569023569, + "acc_norm_stderr": 0.010161552863493746 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6267ad1e49b3823b2f7291762855b52190024d9c --- /dev/null +++ b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.4070835356827751, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03514958095848397 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.0758536616906455, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015747064380670645 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3264375465319237, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004888854445231445 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11509298027342854, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002040147114373331 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03493638633069714, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009342574915112234 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.15766160622381195, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0033114573324024405 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.0532813862747049, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012579627803205211 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07257604824526195, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014483785678009685 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.31637706878833355, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004769735504597033 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.1105412242108245, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019072286738988954 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.0714774644843108, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014699104009543759 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.307939556685913, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004520814685280998 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10843545057843905, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019083088150967664 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3538b79158a1c8eb4e80039636223d384f2e1c27 --- /dev/null +++ b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.41914858834195134, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.030279335876129 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07536633674836868, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001620641410096321 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3290768382699901, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00481767508183653 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11424698089656772, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001973221738343803 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03540467062379218, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001074817084017668 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.16089821041540717, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0033011630774406127 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05368591058094131, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012551880063213156 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07231503158237214, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015163361416883465 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3189205930522712, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004694857387684187 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10991123942051419, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018557651460448018 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07148579673935408, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0015357817111525064 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3110112645350247, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00441643475943137 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.1082043807305256, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018480349337665876 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6cdf55cc91041391482ee0ed01f3925f2acc9262 --- /dev/null +++ b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.4241874936612034, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03699728854949305 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07469786641233617, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015771153206732972 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.32891693541469197, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004751520151482175 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11375522621692136, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019642936162507533 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03462695918297652, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009079391487918842 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.16210166248343671, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003411098262587952 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05344291957030947, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001233885317072834 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07161852924412807, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014761860684115284 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.31797917629392425, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004598849198704314 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10934081987838024, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018415550723111455 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.0711214967812572, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00149377360051449 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3130870814045286, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004421211065212564 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10823991385374933, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018380668821100924 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f803c2904d39a540f9b847de6409dfc746709451 --- /dev/null +++ b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.3916994292697065, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.02655023153261868 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07713695315738618, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0018521617901133295 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.32641437991318506, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004583689746653368 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11443103117633296, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019845366723218495 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.036319480745632425, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0012079439649413412 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.15985213856119682, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003223582265695079 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05368996382308088, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012403567348119643 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07333561421491072, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0017170202297610163 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3129899723170166, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004403504671395443 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10920281437234497, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018472429946517301 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07332584684616669, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0017553031622823821 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.31070372160179327, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004314602758278112 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10882089385505, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018674885337082484 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b643df2e31f5455e6f16882cd1453fe9b04b2241 --- /dev/null +++ b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.37875018794247045, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.024296780304434905 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07231813177556075, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015118246585830762 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.31870699434574523, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0046463072458484975 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.1102139471634063, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019620155943445507 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.033695317164630666, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009163247572691914 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.15554105747469235, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003236397171744527 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.0515680827205002, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001213141047008391 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06901574750871842, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013947803448898716 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.30638147232578594, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004472401624140904 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.1054595308766645, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018290764447497754 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.06885871876103705, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001420315845279487 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.30338100654345734, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0043561963135752306 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10492767242713451, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001836832016012516 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c007e9ee44de7d4b4c85788fd2e7bd1f4ce5b0a6 --- /dev/null +++ b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.3689406693649318, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.01833284872989782 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.0725733890131515, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0016541722828599028 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.31647681542290346, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004649887369574888 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.10942321553706275, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001960578009336271 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.0340511038621137, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001109810266658632 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1539259113242296, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003295678214536681 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05107734688924233, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00122669666906548 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06920155517233274, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015510532870385023 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.30372126675172995, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004472388579309833 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10453686352175766, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018279045748061345 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.0688242602910231, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0015744817424633028 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3000075619025587, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004328528641012373 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.1036182486745027, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018321897232056268 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..2096046730b45b46234ea0ade8397c39c4fe1533 --- /dev/null +++ b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.0505257980847339, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0009316535384306269 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.07944574030730557, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0013436416711421135 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.05739499438745971, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.000959404112224303 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0025522430280765624, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00019188137819214202 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.003889530091650386, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0003502492712853139 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.002874313185982406, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00022791640812011725 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.04641940622808299, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0007892141876172595 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.07403750267734954, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0011941764542527037 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.053008425140295905, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0008201296289562219 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.04816018092226108, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0008696938078640268 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.07599978162074517, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0012632037879506343 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.05476968826480647, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0008953065390907387 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.14859459498800928, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.019538924284114197 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..25fc3227de82f89d44e1e60d40ab4d4b2b3ed8ad --- /dev/null +++ b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.13584035826185337, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001883003830967405 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.2300655738827389, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002771815067524841 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.15836293018887368, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001891640433790272 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.02400583408187123, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007285343955308211 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.04289680599794199, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001353605133009988 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.028190707681194575, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008118715649523407 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.10298984208878467, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012746961770117027 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.18009351587124983, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002104050870387258 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.12130392597137107, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012906736654645788 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.12668865175777289, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0017429813042225584 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.21558088869876474, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0026000978058887433 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.14788907195330203, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017520956258023405 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.346869321685321, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06607443264134674 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..f7c8e08291dba4cc08d19b8b3ca567deb99b0fe7 --- /dev/null +++ b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.1708173947979992, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002081500885534108 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.2857546859749413, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002766073063501205 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.1979943409707515, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019701979235594003 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.03867526801746755, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009249837176721441 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.06488659439579164, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015151297174771778 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.04456119604899187, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009779372383836055 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.12436000788073821, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013936495773448447 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.21512580022359384, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0021392819398317084 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.14561448339286645, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013308339769298708 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.15896932268065497, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0019322450604166179 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.2668532631896298, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00261075911429361 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.18443888049277404, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018323545176278458 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.247794388992107, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09928029909737168 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..74d7f6ff747d175dddce0162bba7676f72b3a76b --- /dev/null +++ b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.15395480397824707, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002402542953049042 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.24557521515724243, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0032199609208037362 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.17165885454358776, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002266523062316873 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0345849314932535, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000925086814362582 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.05685215254790676, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0014741611375388177 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.03887583188926559, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009318869486006191 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.11262382269704937, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001735467609168603 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.1853949845751863, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002495030067359919 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.12611107179368627, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015753873989903184 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.1426484686620457, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0022377962576445335 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.22876101619014125, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0030299996388814796 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.15920866146470747, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021025779963518188 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.212268753332442, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09749124513916169 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5df963c364f8d10e2859eb9f36a408618ec687a5 --- /dev/null +++ b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.05323367871163294, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002019764759552398 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.08477832633294664, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0029256767843087337 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.057194516095400834, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019532780600384314 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.011892613439611567, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0006525291351364055 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.021014952711012305, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011420293722952714 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.013407675922368708, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0006687476239191229 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.03994947969161068, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015323136750776513 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.0654114763710059, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022890377474177525 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.0429641799328658, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014362645716637073 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.049203521589357486, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0018745212624263795 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.07828616324493042, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00270998239625019 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.052823275351703086, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018085153746161442 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.5159665881377578, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0355848354666848 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..4508a5a68cbd336c36d7d336e9a676232b49ea58 --- /dev/null +++ b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.008397509125114434, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0008676037544483993 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.013182384245950918, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0012488579545023588 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.00885537771521418, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0008474145806800235 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0018519121687661717, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0002652232008037664 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.003201546871291623, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00042951400653203656 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0020845828252393957, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0002776449859686965 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.006312806712827651, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0006339300731280119 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.010466170637582555, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0010119772330203227 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.006795329450745382, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0006430172034977336 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.0076894646083944945, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.000786099271409283 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.012227793486074013, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0011695312324965957 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.008112137370163754, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007719985459243486 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 5.133528491740168e-07, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 9.288876136024227e-07 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6aca0ee21835e3099fc3b6ca43b3c5ff42ea282a --- /dev/null +++ b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 0.02601889547824242, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.008845990174481217 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.013188289488289925, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0003020110162685144 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.02331912229418711, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0005487407492538043 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.016342850382717815, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.00034912236129064063 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.00018882696164487857, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 4.0143613745290394e-05 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.0003925072247489244, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 9.925690772584578e-05 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.00024104025657346095, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 5.3010616942847675e-05 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.013188289488289925, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0003020110162685144 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.02331912229418711, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0005487407492538043 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.016342850382717815, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.00034912236129064063 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.012772945572946016, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.000289959796992291 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.022577540598345845, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0005237847970514886 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.015820908294670494, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00033227991604236137 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..67ba8777dfe9425951449777e51900ec53b6f2af --- /dev/null +++ b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 10.11194167971178, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.11325409958385195 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.44662580344767955, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0029145470980427935 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.403325138761631, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029812540605776657 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.396708212066539, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023271634805327288 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.19849995481108837, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0023301034372897886 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.17476709268624893, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0019834376005625296 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.1714205638298909, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0017391694364972787 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.3259862976774872, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0025491424613272398 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.28928330619087117, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002329063084463768 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.28469304359904984, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0018043411811584805 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.37806486217321694, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002787004218176944 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3367501905387613, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002641205675752919 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.3323030796627126, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002111384569072981 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c9f15194d1954169a1891835c4d140817882f416 --- /dev/null +++ b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 10.765851233592166, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1147637687545087 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5029109698404333, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032326251122724503 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.399072749631299, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002770754484292409 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4194853660757534, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002222220598248002 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.23608052397188775, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002567411684370418 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.18324767332759007, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.001989894281113984 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.19259169221915515, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0018423181170468268 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.3660346353864222, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0028533265797324394 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.28763635547225785, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002249079446710024 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3027130154416743, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001900798692043736 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4152150772992765, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0030631524445173153 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.32804484742896467, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002520109021821381 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.34512069878778673, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00213278119755698 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..1e1c42fa37c491627756dbbb5af93bc35b0e76a8 --- /dev/null +++ b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 10.504414399066166, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.14144404460789148 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.509958754752616, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003154730928607417 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.3954899202585752, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002608601302312822 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.42179412730593185, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0021527795098671134 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.24242098353513022, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002484157206893866 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.18360253637850166, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.001904297402323581 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.19636018570824587, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0018014064871279597 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.36777401786978037, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002648283289980144 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.28409076327843025, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002115488452238855 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.30275824983777616, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0018031245552577217 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4185549498251625, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0029283489596877298 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.32342169868683124, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0023588979812351725 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.3452111361624014, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020448977561221436 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..df2e72f6883f27c759821c2c478651e60cd0509f --- /dev/null +++ b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 10.453461006006084, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.20323399299325623 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5152975398825912, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032876121566522126 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.3875757012647283, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002563136912882847 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4184745538314975, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002152424221911221 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.24767323400172267, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002578983235298861 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.1802765312255684, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0018268424519338505 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.19590832872090894, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0017695553874619732 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.37129359217610464, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0027272181980460375 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.2780682094337028, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0020775510109816452 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.30013711995116676, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001799774395616833 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4223690533102043, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003005111083307187 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3169024351381635, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0023241001320442878 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.34206938848652774, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020120604230570572 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..738a292d845d4076189f24181902fff96dce1fba --- /dev/null +++ b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 10.336987597938899, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.20513507856533955 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5143545157562858, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003337934175013788 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.3857868235592206, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0024923995035469678 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4170315564856164, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002124259039598598 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.24631251817723201, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002657555991369566 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.17787346956272435, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.001792217462473695 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.19402158147865167, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0017876994973534497 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.37299125515052484, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002815154394732632 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.27850203471081203, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002038628599689064 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3009842944910936, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0018055725861154817 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.42379837225389067, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031034115053880863 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.31679500554979756, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002259000761426462 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.3425854571657328, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020124639191903327 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_gem_xsum_article_DOC_summary_0.json b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..63f99f27f9707e2a6cb1d631aa86b02f97a78c9c --- /dev/null +++ b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.08057623396604993, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018427190672612415 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.1903611222785915, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004119319495363413 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.11144536072130062, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002449613618907836 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.012439638488265747, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0007674450472725088 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.029923546174765742, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0018325311907880977 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.01730052045113504, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0010550283614850532 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.06836163772266587, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001418824283099982 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.16290273051352785, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0032456160924942976 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.09482469984719238, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001891115330305303 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.06720724706425169, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001458820899362241 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.16056752379802697, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003380030301216589 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.09329416183941738, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0019565438808381327 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.711911214189282, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.062271095680873176 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_gem_xsum_article_DOC_summary_1.json b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..2743fb5f4b5f4501f6601349b688bd9b78ffadd5 --- /dev/null +++ b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.08658830544513134, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018612611645494558 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.21204874097915127, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004437751832600217 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.12151658666281231, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025542911124592704 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.013510304634606875, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0007312649740318255 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.03415295300257043, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0019425669982816587 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.019132118327200527, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0010395559026960023 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0702987343925042, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0013468612933160927 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.17295883718094748, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033149082764143117 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.09875044840716671, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018572603815356456 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.07169769773353042, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001454220219767976 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.17642520944692217, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0035711244979865823 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.10073542075586238, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002008424522536892 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.7485653629026496, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.10161315249240252 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_gem_xsum_article_DOC_summary_2.json b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..48364336d190f4cbeccb986baeb23c391639d9a5 --- /dev/null +++ b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.10919972863971208, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018326070695569962 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.27085077984418787, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004340987074007443 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.15372555310846955, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0024917036123975646 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.018660104842681158, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0008521301906071727 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.048121178106264206, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002236763054096929 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.026545543337132424, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001202505861016018 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.08598679476507894, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001347479595633704 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.21491972476150967, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033319697703370205 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.12124448179130719, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018406701231741705 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.08936454412712036, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.00147707143314301 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.22313932236999878, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003637762226705496 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.12600811759241443, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002023272430841333 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.0047358326681721, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07696741647689843 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_gem_xsum_article_DOC_summary_3.json b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..ef1771ffa2a9a05c597916e0834a8d9d334179c0 --- /dev/null +++ b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.12156176985300436, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0019916743006548566 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.2919486745230574, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004594202930152972 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.16788870363340208, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0026015869149755492 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.023390561110934703, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0009482124699139898 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.05931283277905219, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00253961685376344 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.03291830334125208, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0013348595001679636 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.09410425031338661, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001505615984367887 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.22729829891507616, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035793671647219765 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.12998891345659275, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.00194502365003499 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.09794367331761664, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0016077634502078913 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.23679450093245694, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003856459984346274 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.13536430596241408, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0020963457328233175 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.3876528749760366, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.09352517366139018 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_gem_xsum_article_DOC_summary_4.json b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2b262435c67fac50562dcc425ee208b5f1e999d5 --- /dev/null +++ b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.04061391536633297, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0025885117246031656 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.0774019664424454, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004501264779451914 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.04817132624367883, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002710318861619294 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.007571424737600509, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.000774569805719568 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.017306355810365114, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0017078169253319931 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.010104068388385765, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0009810253225945517 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.03226926320041072, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0021853447839680425 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0606544963878778, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035695265285015203 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.03758941214391744, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002107269506833207 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.03361123521194327, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.002255713167385482 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.063038545340123, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003730201727070476 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.03925168824333697, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0022220799806552142 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.8091606018729823, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.13365493953263705 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_gem_xsum_article_DOC_summary_5.json b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a2be15bcd28d3ecc262bc7fc106ded5180d3f4b1 --- /dev/null +++ b/4b284b12bc4/evaluation/generation/slim.4b284b12bc4_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.0025292500918997793, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0006843948420078455 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.0020532297175197265, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0005556296741534733 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.002218415772902317, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0005977307451849004 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0004376650603065697, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00024169751059179606 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.00040004436924525715, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0002447488200268485 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.00041371259854665804, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0002415623180229552 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0021231909904583543, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0005742362353931501 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0017609197535564964, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0004851127145778472 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.0018828340816919485, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0005121954193145257 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.002170837264519722, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0005838940116621144 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0017952250708806817, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0004910386484249093 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0019227239855572795, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0005197852399034371 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.9417748605436574e-39, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 1.644365126953672e-33 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b12bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/4b284b17bc4_0.json b/4b284b17bc4/evaluation/4b284b17bc4_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6a235436eb04fb33ff7a7cb572bbde1279ced56f --- /dev/null +++ b/4b284b17bc4/evaluation/4b284b17bc4_0.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.332, + "acc_stderr": 0.014899597242811478 + }, + "anli_r2": { + "acc": 0.329, + "acc_stderr": 0.014865395385928362 + }, + "anli_r3": { + "acc": 0.34833333333333333, + "acc_stderr": 0.013759437498874075 + }, + "cb": { + "acc": 0.5714285714285714, + "acc_stderr": 0.06672848092813058, + "f1": 0.3888888888888889 + }, + "copa": { + "acc": 0.76, + "acc_stderr": 0.04292346959909283 + }, + "hellaswag": { + "acc": 0.469627564230233, + "acc_stderr": 0.004980566907790459, + "acc_norm": 0.6134236207926708, + "acc_norm_stderr": 0.004859699562451462 + }, + "rte": { + "acc": 0.5415162454873647, + "acc_stderr": 0.029992535385373314 + }, + "winogrande": { + "acc": 0.5737963693764798, + "acc_stderr": 0.013898585965412338 + }, + "storycloze_2016": { + "acc": 0.7108498129342598, + "acc_stderr": 0.010484068799942072 + }, + "boolq": { + "acc": 0.5623853211009174, + "acc_stderr": 0.008676717715731632 + }, + "arc_easy": { + "acc": 0.6052188552188552, + "acc_stderr": 0.010030038935883584, + "acc_norm": 0.5429292929292929, + "acc_norm_stderr": 0.01022189756425604 + }, + "arc_challenge": { + "acc": 0.26791808873720135, + "acc_stderr": 0.012942030195136437, + "acc_norm": 0.2883959044368601, + "acc_norm_stderr": 0.013238394422428171 + }, + "sciq": { + "acc": 0.852, + "acc_stderr": 0.011234866364235235, + "acc_norm": 0.764, + "acc_norm_stderr": 0.013434451402438678 + }, + "piqa": { + "acc": 0.7578890097932536, + "acc_stderr": 0.00999437126910438, + "acc_norm": 0.7622415669205659, + "acc_norm_stderr": 0.009932525779525492 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/4b284b17bc4_0_lm-eval_global_step80108_2023-01-30-11-26-40_0shots_backup.json b/4b284b17bc4/evaluation/4b284b17bc4_0_lm-eval_global_step80108_2023-01-30-11-26-40_0shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..6a235436eb04fb33ff7a7cb572bbde1279ced56f --- /dev/null +++ b/4b284b17bc4/evaluation/4b284b17bc4_0_lm-eval_global_step80108_2023-01-30-11-26-40_0shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.332, + "acc_stderr": 0.014899597242811478 + }, + "anli_r2": { + "acc": 0.329, + "acc_stderr": 0.014865395385928362 + }, + "anli_r3": { + "acc": 0.34833333333333333, + "acc_stderr": 0.013759437498874075 + }, + "cb": { + "acc": 0.5714285714285714, + "acc_stderr": 0.06672848092813058, + "f1": 0.3888888888888889 + }, + "copa": { + "acc": 0.76, + "acc_stderr": 0.04292346959909283 + }, + "hellaswag": { + "acc": 0.469627564230233, + "acc_stderr": 0.004980566907790459, + "acc_norm": 0.6134236207926708, + "acc_norm_stderr": 0.004859699562451462 + }, + "rte": { + "acc": 0.5415162454873647, + "acc_stderr": 0.029992535385373314 + }, + "winogrande": { + "acc": 0.5737963693764798, + "acc_stderr": 0.013898585965412338 + }, + "storycloze_2016": { + "acc": 0.7108498129342598, + "acc_stderr": 0.010484068799942072 + }, + "boolq": { + "acc": 0.5623853211009174, + "acc_stderr": 0.008676717715731632 + }, + "arc_easy": { + "acc": 0.6052188552188552, + "acc_stderr": 0.010030038935883584, + "acc_norm": 0.5429292929292929, + "acc_norm_stderr": 0.01022189756425604 + }, + "arc_challenge": { + "acc": 0.26791808873720135, + "acc_stderr": 0.012942030195136437, + "acc_norm": 0.2883959044368601, + "acc_norm_stderr": 0.013238394422428171 + }, + "sciq": { + "acc": 0.852, + "acc_stderr": 0.011234866364235235, + "acc_norm": 0.764, + "acc_norm_stderr": 0.013434451402438678 + }, + "piqa": { + "acc": 0.7578890097932536, + "acc_stderr": 0.00999437126910438, + "acc_norm": 0.7622415669205659, + "acc_norm_stderr": 0.009932525779525492 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/4b284b17bc4_1.json b/4b284b17bc4/evaluation/4b284b17bc4_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4572b5ea5b09badd72d95a263315a8e40e583db3 --- /dev/null +++ b/4b284b17bc4/evaluation/4b284b17bc4_1.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.32, + "acc_stderr": 0.014758652303574886 + }, + "anli_r2": { + "acc": 0.324, + "acc_stderr": 0.014806864733738854 + }, + "anli_r3": { + "acc": 0.3491666666666667, + "acc_stderr": 0.01376707539507725 + }, + "cb": { + "acc": 0.5535714285714286, + "acc_stderr": 0.06703189227942397, + "f1": 0.3890671420083185 + }, + "copa": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446 + }, + "hellaswag": { + "acc": 0.4640509858593906, + "acc_stderr": 0.0049768677965835555, + "acc_norm": 0.6082453694483171, + "acc_norm_stderr": 0.004871447106554927 + }, + "rte": { + "acc": 0.5451263537906137, + "acc_stderr": 0.029973636495415252 + }, + "winogrande": { + "acc": 0.574585635359116, + "acc_stderr": 0.013895257666646378 + }, + "storycloze_2016": { + "acc": 0.711918760021379, + "acc_stderr": 0.010472537019822582 + }, + "boolq": { + "acc": 0.5409785932721712, + "acc_stderr": 0.008715635308774412 + }, + "arc_easy": { + "acc": 0.6342592592592593, + "acc_stderr": 0.009882988069418829, + "acc_norm": 0.5837542087542088, + "acc_norm_stderr": 0.01011481940450087 + }, + "arc_challenge": { + "acc": 0.2901023890784983, + "acc_stderr": 0.013261573677520764, + "acc_norm": 0.30119453924914674, + "acc_norm_stderr": 0.013406741767847638 + }, + "sciq": { + "acc": 0.896, + "acc_stderr": 0.009658016218524301, + "acc_norm": 0.88, + "acc_norm_stderr": 0.010281328012747386 + }, + "piqa": { + "acc": 0.7551686615886833, + "acc_stderr": 0.010032309105568793, + "acc_norm": 0.766050054406964, + "acc_norm_stderr": 0.009877236895137436 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/4b284b17bc4_1_lm-eval_global_step80108_2023-01-30-11-26-39_1shots_backup.json b/4b284b17bc4/evaluation/4b284b17bc4_1_lm-eval_global_step80108_2023-01-30-11-26-39_1shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..4572b5ea5b09badd72d95a263315a8e40e583db3 --- /dev/null +++ b/4b284b17bc4/evaluation/4b284b17bc4_1_lm-eval_global_step80108_2023-01-30-11-26-39_1shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.32, + "acc_stderr": 0.014758652303574886 + }, + "anli_r2": { + "acc": 0.324, + "acc_stderr": 0.014806864733738854 + }, + "anli_r3": { + "acc": 0.3491666666666667, + "acc_stderr": 0.01376707539507725 + }, + "cb": { + "acc": 0.5535714285714286, + "acc_stderr": 0.06703189227942397, + "f1": 0.3890671420083185 + }, + "copa": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446 + }, + "hellaswag": { + "acc": 0.4640509858593906, + "acc_stderr": 0.0049768677965835555, + "acc_norm": 0.6082453694483171, + "acc_norm_stderr": 0.004871447106554927 + }, + "rte": { + "acc": 0.5451263537906137, + "acc_stderr": 0.029973636495415252 + }, + "winogrande": { + "acc": 0.574585635359116, + "acc_stderr": 0.013895257666646378 + }, + "storycloze_2016": { + "acc": 0.711918760021379, + "acc_stderr": 0.010472537019822582 + }, + "boolq": { + "acc": 0.5409785932721712, + "acc_stderr": 0.008715635308774412 + }, + "arc_easy": { + "acc": 0.6342592592592593, + "acc_stderr": 0.009882988069418829, + "acc_norm": 0.5837542087542088, + "acc_norm_stderr": 0.01011481940450087 + }, + "arc_challenge": { + "acc": 0.2901023890784983, + "acc_stderr": 0.013261573677520764, + "acc_norm": 0.30119453924914674, + "acc_norm_stderr": 0.013406741767847638 + }, + "sciq": { + "acc": 0.896, + "acc_stderr": 0.009658016218524301, + "acc_norm": 0.88, + "acc_norm_stderr": 0.010281328012747386 + }, + "piqa": { + "acc": 0.7551686615886833, + "acc_stderr": 0.010032309105568793, + "acc_norm": 0.766050054406964, + "acc_norm_stderr": 0.009877236895137436 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/4b284b17bc4_2.json b/4b284b17bc4/evaluation/4b284b17bc4_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3e49aa8d33cfba49045ebe2954fa4cb4c5d0b629 --- /dev/null +++ b/4b284b17bc4/evaluation/4b284b17bc4_2.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.343, + "acc_stderr": 0.015019206922356953 + }, + "anli_r2": { + "acc": 0.318, + "acc_stderr": 0.014734079309311901 + }, + "anli_r3": { + "acc": 0.325, + "acc_stderr": 0.013526454480351028 + }, + "cb": { + "acc": 0.42857142857142855, + "acc_stderr": 0.06672848092813058, + "f1": 0.3058470764617691 + }, + "copa": { + "acc": 0.78, + "acc_stderr": 0.04163331998932263 + }, + "hellaswag": { + "acc": 0.45727942640908187, + "acc_stderr": 0.004971534874389935, + "acc_norm": 0.602867954590719, + "acc_norm_stderr": 0.004883037758919964 + }, + "rte": { + "acc": 0.48736462093862815, + "acc_stderr": 0.030086851767188564 + }, + "winogrande": { + "acc": 0.5808997632202052, + "acc_stderr": 0.013867325192210116 + }, + "storycloze_2016": { + "acc": 0.7215392838054516, + "acc_stderr": 0.010365521460604415 + }, + "boolq": { + "acc": 0.5489296636085627, + "acc_stderr": 0.008703080962379622 + }, + "arc_easy": { + "acc": 0.6325757575757576, + "acc_stderr": 0.009892552616211558, + "acc_norm": 0.617003367003367, + "acc_norm_stderr": 0.009974920384536479 + }, + "arc_challenge": { + "acc": 0.2901023890784983, + "acc_stderr": 0.013261573677520759, + "acc_norm": 0.31313993174061433, + "acc_norm_stderr": 0.013552671543623496 + }, + "sciq": { + "acc": 0.906, + "acc_stderr": 0.009233052000787738, + "acc_norm": 0.891, + "acc_norm_stderr": 0.009859828407037186 + }, + "piqa": { + "acc": 0.7540805223068553, + "acc_stderr": 0.010047331865625194, + "acc_norm": 0.7698585418933623, + "acc_norm_stderr": 0.009820832826839796 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/4b284b17bc4_2_lm-eval_global_step80108_2023-01-30-11-26-39_2shots_backup.json b/4b284b17bc4/evaluation/4b284b17bc4_2_lm-eval_global_step80108_2023-01-30-11-26-39_2shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..3e49aa8d33cfba49045ebe2954fa4cb4c5d0b629 --- /dev/null +++ b/4b284b17bc4/evaluation/4b284b17bc4_2_lm-eval_global_step80108_2023-01-30-11-26-39_2shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.343, + "acc_stderr": 0.015019206922356953 + }, + "anli_r2": { + "acc": 0.318, + "acc_stderr": 0.014734079309311901 + }, + "anli_r3": { + "acc": 0.325, + "acc_stderr": 0.013526454480351028 + }, + "cb": { + "acc": 0.42857142857142855, + "acc_stderr": 0.06672848092813058, + "f1": 0.3058470764617691 + }, + "copa": { + "acc": 0.78, + "acc_stderr": 0.04163331998932263 + }, + "hellaswag": { + "acc": 0.45727942640908187, + "acc_stderr": 0.004971534874389935, + "acc_norm": 0.602867954590719, + "acc_norm_stderr": 0.004883037758919964 + }, + "rte": { + "acc": 0.48736462093862815, + "acc_stderr": 0.030086851767188564 + }, + "winogrande": { + "acc": 0.5808997632202052, + "acc_stderr": 0.013867325192210116 + }, + "storycloze_2016": { + "acc": 0.7215392838054516, + "acc_stderr": 0.010365521460604415 + }, + "boolq": { + "acc": 0.5489296636085627, + "acc_stderr": 0.008703080962379622 + }, + "arc_easy": { + "acc": 0.6325757575757576, + "acc_stderr": 0.009892552616211558, + "acc_norm": 0.617003367003367, + "acc_norm_stderr": 0.009974920384536479 + }, + "arc_challenge": { + "acc": 0.2901023890784983, + "acc_stderr": 0.013261573677520759, + "acc_norm": 0.31313993174061433, + "acc_norm_stderr": 0.013552671543623496 + }, + "sciq": { + "acc": 0.906, + "acc_stderr": 0.009233052000787738, + "acc_norm": 0.891, + "acc_norm_stderr": 0.009859828407037186 + }, + "piqa": { + "acc": 0.7540805223068553, + "acc_stderr": 0.010047331865625194, + "acc_norm": 0.7698585418933623, + "acc_norm_stderr": 0.009820832826839796 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/4b284b17bc4_3.json b/4b284b17bc4/evaluation/4b284b17bc4_3.json new file mode 100644 index 0000000000000000000000000000000000000000..408779e0bfcec61f04346c964e88daf60d169fc6 --- /dev/null +++ b/4b284b17bc4/evaluation/4b284b17bc4_3.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.348, + "acc_stderr": 0.015070604603768408 + }, + "anli_r2": { + "acc": 0.36, + "acc_stderr": 0.01518652793204012 + }, + "anli_r3": { + "acc": 0.35083333333333333, + "acc_stderr": 0.013782212417178195 + }, + "cb": { + "acc": 0.48214285714285715, + "acc_stderr": 0.0673769750864465, + "f1": 0.40387403446226977 + }, + "copa": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256 + }, + "hellaswag": { + "acc": 0.4567815176259709, + "acc_stderr": 0.004971106265046551, + "acc_norm": 0.5992830113523202, + "acc_norm_stderr": 0.004890422457747258 + }, + "rte": { + "acc": 0.48375451263537905, + "acc_stderr": 0.030080573208738064 + }, + "winogrande": { + "acc": 0.569060773480663, + "acc_stderr": 0.013917796623335966 + }, + "storycloze_2016": { + "acc": 0.7247461250668092, + "acc_stderr": 0.010328538400500567 + }, + "boolq": { + "acc": 0.5498470948012233, + "acc_stderr": 0.008701488203356937 + }, + "arc_easy": { + "acc": 0.6266835016835017, + "acc_stderr": 0.009925009142802903, + "acc_norm": 0.6203703703703703, + "acc_norm_stderr": 0.009958037725468558 + }, + "arc_challenge": { + "acc": 0.2901023890784983, + "acc_stderr": 0.013261573677520769, + "acc_norm": 0.31143344709897613, + "acc_norm_stderr": 0.013532472099850949 + }, + "sciq": { + "acc": 0.923, + "acc_stderr": 0.008434580140240632, + "acc_norm": 0.903, + "acc_norm_stderr": 0.00936368937324812 + }, + "piqa": { + "acc": 0.7578890097932536, + "acc_stderr": 0.009994371269104387, + "acc_norm": 0.7682263329706203, + "acc_norm_stderr": 0.00984514377279405 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/4b284b17bc4_3_lm-eval_global_step80108_2023-01-30-11-26-39_3shots_backup.json b/4b284b17bc4/evaluation/4b284b17bc4_3_lm-eval_global_step80108_2023-01-30-11-26-39_3shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..408779e0bfcec61f04346c964e88daf60d169fc6 --- /dev/null +++ b/4b284b17bc4/evaluation/4b284b17bc4_3_lm-eval_global_step80108_2023-01-30-11-26-39_3shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.348, + "acc_stderr": 0.015070604603768408 + }, + "anli_r2": { + "acc": 0.36, + "acc_stderr": 0.01518652793204012 + }, + "anli_r3": { + "acc": 0.35083333333333333, + "acc_stderr": 0.013782212417178195 + }, + "cb": { + "acc": 0.48214285714285715, + "acc_stderr": 0.0673769750864465, + "f1": 0.40387403446226977 + }, + "copa": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256 + }, + "hellaswag": { + "acc": 0.4567815176259709, + "acc_stderr": 0.004971106265046551, + "acc_norm": 0.5992830113523202, + "acc_norm_stderr": 0.004890422457747258 + }, + "rte": { + "acc": 0.48375451263537905, + "acc_stderr": 0.030080573208738064 + }, + "winogrande": { + "acc": 0.569060773480663, + "acc_stderr": 0.013917796623335966 + }, + "storycloze_2016": { + "acc": 0.7247461250668092, + "acc_stderr": 0.010328538400500567 + }, + "boolq": { + "acc": 0.5498470948012233, + "acc_stderr": 0.008701488203356937 + }, + "arc_easy": { + "acc": 0.6266835016835017, + "acc_stderr": 0.009925009142802903, + "acc_norm": 0.6203703703703703, + "acc_norm_stderr": 0.009958037725468558 + }, + "arc_challenge": { + "acc": 0.2901023890784983, + "acc_stderr": 0.013261573677520769, + "acc_norm": 0.31143344709897613, + "acc_norm_stderr": 0.013532472099850949 + }, + "sciq": { + "acc": 0.923, + "acc_stderr": 0.008434580140240632, + "acc_norm": 0.903, + "acc_norm_stderr": 0.00936368937324812 + }, + "piqa": { + "acc": 0.7578890097932536, + "acc_stderr": 0.009994371269104387, + "acc_norm": 0.7682263329706203, + "acc_norm_stderr": 0.00984514377279405 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/4b284b17bc4_4.json b/4b284b17bc4/evaluation/4b284b17bc4_4.json new file mode 100644 index 0000000000000000000000000000000000000000..efe239b5e466409417c045d2c72414349219ebf4 --- /dev/null +++ b/4b284b17bc4/evaluation/4b284b17bc4_4.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.36, + "acc_stderr": 0.015186527932040117 + }, + "anli_r2": { + "acc": 0.347, + "acc_stderr": 0.015060472031706625 + }, + "anli_r3": { + "acc": 0.3625, + "acc_stderr": 0.01388303787422552 + }, + "cb": { + "acc": 0.5535714285714286, + "acc_stderr": 0.06703189227942395, + "f1": 0.4538378958668814 + }, + "copa": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256 + }, + "hellaswag": { + "acc": 0.45180242979486157, + "acc_stderr": 0.004966544724452227, + "acc_norm": 0.5955984863572994, + "acc_norm_stderr": 0.004897728370737246 + }, + "rte": { + "acc": 0.48375451263537905, + "acc_stderr": 0.030080573208738064 + }, + "winogrande": { + "acc": 0.5706393054459353, + "acc_stderr": 0.013911537499969163 + }, + "storycloze_2016": { + "acc": 0.7177979690005345, + "acc_stderr": 0.010407834479647672 + }, + "boolq": { + "acc": 0.545565749235474, + "acc_stderr": 0.008708665643758015 + }, + "arc_easy": { + "acc": 0.640993265993266, + "acc_stderr": 0.009843424713072174, + "acc_norm": 0.6186868686868687, + "acc_norm_stderr": 0.009966542497171025 + }, + "arc_challenge": { + "acc": 0.302901023890785, + "acc_stderr": 0.013428241573185349, + "acc_norm": 0.32337883959044367, + "acc_norm_stderr": 0.013669421630012129 + }, + "sciq": { + "acc": 0.915, + "acc_stderr": 0.008823426366942331, + "acc_norm": 0.912, + "acc_norm_stderr": 0.008963053962592085 + }, + "piqa": { + "acc": 0.7578890097932536, + "acc_stderr": 0.009994371269104385, + "acc_norm": 0.7752992383025027, + "acc_norm_stderr": 0.009738282586548389 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/4b284b17bc4_4_lm-eval_global_step80108_2023-01-30-11-26-39_4shots_backup.json b/4b284b17bc4/evaluation/4b284b17bc4_4_lm-eval_global_step80108_2023-01-30-11-26-39_4shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..efe239b5e466409417c045d2c72414349219ebf4 --- /dev/null +++ b/4b284b17bc4/evaluation/4b284b17bc4_4_lm-eval_global_step80108_2023-01-30-11-26-39_4shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.36, + "acc_stderr": 0.015186527932040117 + }, + "anli_r2": { + "acc": 0.347, + "acc_stderr": 0.015060472031706625 + }, + "anli_r3": { + "acc": 0.3625, + "acc_stderr": 0.01388303787422552 + }, + "cb": { + "acc": 0.5535714285714286, + "acc_stderr": 0.06703189227942395, + "f1": 0.4538378958668814 + }, + "copa": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256 + }, + "hellaswag": { + "acc": 0.45180242979486157, + "acc_stderr": 0.004966544724452227, + "acc_norm": 0.5955984863572994, + "acc_norm_stderr": 0.004897728370737246 + }, + "rte": { + "acc": 0.48375451263537905, + "acc_stderr": 0.030080573208738064 + }, + "winogrande": { + "acc": 0.5706393054459353, + "acc_stderr": 0.013911537499969163 + }, + "storycloze_2016": { + "acc": 0.7177979690005345, + "acc_stderr": 0.010407834479647672 + }, + "boolq": { + "acc": 0.545565749235474, + "acc_stderr": 0.008708665643758015 + }, + "arc_easy": { + "acc": 0.640993265993266, + "acc_stderr": 0.009843424713072174, + "acc_norm": 0.6186868686868687, + "acc_norm_stderr": 0.009966542497171025 + }, + "arc_challenge": { + "acc": 0.302901023890785, + "acc_stderr": 0.013428241573185349, + "acc_norm": 0.32337883959044367, + "acc_norm_stderr": 0.013669421630012129 + }, + "sciq": { + "acc": 0.915, + "acc_stderr": 0.008823426366942331, + "acc_norm": 0.912, + "acc_norm_stderr": 0.008963053962592085 + }, + "piqa": { + "acc": 0.7578890097932536, + "acc_stderr": 0.009994371269104385, + "acc_norm": 0.7752992383025027, + "acc_norm_stderr": 0.009738282586548389 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/4b284b17bc4_5.json b/4b284b17bc4/evaluation/4b284b17bc4_5.json new file mode 100644 index 0000000000000000000000000000000000000000..dd8c4c2b41c1fc72c753d73d7b2e4bde3577f7e0 --- /dev/null +++ b/4b284b17bc4/evaluation/4b284b17bc4_5.json @@ -0,0 +1,73 @@ +{ + "results": { + "anli_r1": { + "acc": 0.363, + "acc_stderr": 0.015213890444671281 + }, + "anli_r2": { + "acc": 0.347, + "acc_stderr": 0.015060472031706624 + }, + "anli_r3": { + "acc": 0.34, + "acc_stderr": 0.013680495725767794 + }, + "cb": { + "acc": 0.5535714285714286, + "acc_stderr": 0.06703189227942397, + "f1": 0.3974410235905637 + }, + "copa": { + "acc": 0.81, + "acc_stderr": 0.03942772444036623 + }, + "hellaswag": { + "acc": 0.44981079466241786, + "acc_stderr": 0.004964579685712439, + "acc_norm": 0.6002788289185421, + "acc_norm_stderr": 0.004888398535520516 + }, + "rte": { + "acc": 0.49097472924187724, + "acc_stderr": 0.030091559826331334 + }, + "winogrande": { + "acc": 0.5785319652722968, + "acc_stderr": 0.013878072377497603 + }, + "storycloze_2016": { + "acc": 0.7113842864778194, + "acc_stderr": 0.01047831178564294 + }, + "boolq": { + "acc": 0.5376146788990825, + "acc_stderr": 0.008720273736433679 + }, + "arc_easy": { + "acc": 0.6447811447811448, + "acc_stderr": 0.009820245899287117, + "acc_norm": 0.625, + "acc_norm_stderr": 0.009933992677987828 + }, + "arc_challenge": { + "acc": 0.2986348122866894, + "acc_stderr": 0.013374078615068756, + "acc_norm": 0.310580204778157, + "acc_norm_stderr": 0.013522292098053052 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/4b284b17bc4_5_lm-eval_global_step80108_2023-01-30-11-26-39_5shots_backup.json b/4b284b17bc4/evaluation/4b284b17bc4_5_lm-eval_global_step80108_2023-01-30-11-26-39_5shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..dd8c4c2b41c1fc72c753d73d7b2e4bde3577f7e0 --- /dev/null +++ b/4b284b17bc4/evaluation/4b284b17bc4_5_lm-eval_global_step80108_2023-01-30-11-26-39_5shots_backup.json @@ -0,0 +1,73 @@ +{ + "results": { + "anli_r1": { + "acc": 0.363, + "acc_stderr": 0.015213890444671281 + }, + "anli_r2": { + "acc": 0.347, + "acc_stderr": 0.015060472031706624 + }, + "anli_r3": { + "acc": 0.34, + "acc_stderr": 0.013680495725767794 + }, + "cb": { + "acc": 0.5535714285714286, + "acc_stderr": 0.06703189227942397, + "f1": 0.3974410235905637 + }, + "copa": { + "acc": 0.81, + "acc_stderr": 0.03942772444036623 + }, + "hellaswag": { + "acc": 0.44981079466241786, + "acc_stderr": 0.004964579685712439, + "acc_norm": 0.6002788289185421, + "acc_norm_stderr": 0.004888398535520516 + }, + "rte": { + "acc": 0.49097472924187724, + "acc_stderr": 0.030091559826331334 + }, + "winogrande": { + "acc": 0.5785319652722968, + "acc_stderr": 0.013878072377497603 + }, + "storycloze_2016": { + "acc": 0.7113842864778194, + "acc_stderr": 0.01047831178564294 + }, + "boolq": { + "acc": 0.5376146788990825, + "acc_stderr": 0.008720273736433679 + }, + "arc_easy": { + "acc": 0.6447811447811448, + "acc_stderr": 0.009820245899287117, + "acc_norm": 0.625, + "acc_norm_stderr": 0.009933992677987828 + }, + "arc_challenge": { + "acc": 0.2986348122866894, + "acc_stderr": 0.013374078615068756, + "acc_norm": 0.310580204778157, + "acc_norm_stderr": 0.013522292098053052 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6d356d8b8910e1f821c49e61c322bb3a86bcc28d --- /dev/null +++ b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.4040857346605273, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04358756352339084 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.0759904796250538, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0019478615830651011 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3009878218567671, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0046586299284223885 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11153940555452811, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002178375350508395 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03649587266689502, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001284173883599284 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.14683508450255534, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0032437032345681857 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05308201459552208, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013729761880117914 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07180766426825755, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001761388989113738 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.28987291523705844, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0045326872175802165 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10594741371659425, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019835178597520093 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07245769602542276, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.00184585307065297 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.2877191217238231, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004439207690226351 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10637513260264994, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00204505600505615 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..5d8a9e4740735e2983eedc4dcdcac5cddd876fda --- /dev/null +++ b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.5179012826475189, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03546328546887922 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07590692259956473, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015260502670476222 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3587176031003754, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005304902318303979 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11715894355967386, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019757967913343107 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03541306461486918, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009438407351314926 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.17490815925289047, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003575953294927914 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.054620931903283015, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001228796960295478 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07063353254188104, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013315866040551792 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.33926359580036936, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004980489876121937 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10948455596662156, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017556314356608658 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07145722539365447, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014201666549177136 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.338101035904291, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004861723234525834 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.11027827926137282, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018206662711825689 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..10a25febd638271752d42841e8eaf43a4ec01f92 --- /dev/null +++ b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.5353533242406296, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03431413900192352 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07083033588676813, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0012393618247087826 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3713979630102006, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0052635629154557445 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11225032910377221, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001729127211467464 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.031078186918670876, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007042214847925669 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.17869319720275487, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0037152494180892654 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.04972618028817665, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00102253346010308 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06478794864331923, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0010868099908157105 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.342386434900962, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004791381155468985 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10286097604129013, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001521136220014524 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.0666711337015857, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0011558366976764654 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3495506191555596, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004867024005105769 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10565101438548488, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016072411783332626 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8b5a7dd5035ef59b7e0b596e971f23b558f1fe32 --- /dev/null +++ b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.4473435878442557, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.021920475877328035 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.060184346315459554, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001087884777571127 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3414484174381052, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005172668914728109 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.09638137782837077, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0014829064557002423 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.02365049876833611, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0005803565276567791 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.14942490983015533, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003513250894148731 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.038282775688304856, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008484797038289617 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.05398887767175152, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0009593908114878796 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.306677112968288, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004574894720404393 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.08646072645548598, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013059843771721818 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.05643326067698096, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0010263660613837862 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.31950364177604385, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0047753784913865055 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.09025064736984334, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0013804266464441872 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..61cead9dd3ec07a5744397205ead9a468876a29c --- /dev/null +++ b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.42391682641977435, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.025185202302157747 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.056211864304467056, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0010231526331645241 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3243551913604374, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005025965693402745 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.09021138584698546, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0014520692052325743 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.021734837271928865, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000537789346158442 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1381516300501002, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003431494708848335 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.035274756528572794, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008164665662699456 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.050592287065444816, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0008807998394512943 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.29101119493777294, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004368680842712586 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.08124379060939273, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.00126253468227921 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.052801000471684074, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0009509504143137485 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.30284581980574177, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004598745476101752 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.08470874404935472, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001361092950551602 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..217dcbffe07508c6753b10103a0866e60e43807f --- /dev/null +++ b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.4676760272424504, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.02303026154350977 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.06321141539943463, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001223200335826357 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.34518893665726086, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005046837181813745 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.10005721103740961, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0016082944292364137 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.027408388290100973, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007495147001664794 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.15813941094305575, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0034339025470937284 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.04337652772461485, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009940986270453964 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.057943890660243344, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011331942696761865 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.31252933987652376, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004418336624345426 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.09146925689558233, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014605690131820356 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.059802685455499036, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0011813042691715043 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3231337567814217, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004620411779037274 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.09443075155430865, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001535767562485951 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..745c5800090029b2f47a3ac33e5ec6c315e1e1a7 --- /dev/null +++ b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.1498248405216555, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0018423908624139891 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.25526647462429297, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002588155452951415 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.1751592790344999, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0017874794849447823 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.02866254588676867, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007271444111163778 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.05162759513352221, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0014037908842865225 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.033925070200158246, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008240877531204439 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.11555678082540916, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012586001477032337 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.20559491764149926, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002138172468666332 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.13711284339871613, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001264815697203119 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.13771072102711007, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0016874302363856556 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.23587046561796698, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0024016834853047104 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.16124043802649227, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016384576957041044 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.484894171357125, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04279862989387049 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..b7a84b8a2a873fe8c04e12b1aec3258df721efab --- /dev/null +++ b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.19820460200268925, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002420333361877177 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.3126809375051542, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002841672003398339 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.2199694812178742, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002010819735875878 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.05016728772414271, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001264223371544086 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.07815311881303431, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0017034183791022172 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.053937307211284244, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001075193073237261 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.13997435500321276, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0017712606376717578 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.22624387436643148, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022018166295221108 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.15551270440786846, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013566776773220371 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.18565508806059028, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002281622879542436 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.29360401456830726, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0026791632190064567 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.20618836045677164, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018846829640657573 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.6577106650236018, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07159113573701131 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c38ff4ca4d258067dfd65107bafc1147d05dbee9 --- /dev/null +++ b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.2156784752655553, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002732677109431224 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.31237685950514665, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0027963244796416183 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.22435918885551207, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018967794998520616 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.05738474692529928, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0015267876059967937 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.07970531997370003, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016700905255101184 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.05625290668830642, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010633219356063186 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.15451467159603913, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002139183021303883 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.225376744790094, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0021771714204925977 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.15917844337997755, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013165721234956872 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.20314308377906012, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002612094861988029 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.29423268640214134, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002642162937712342 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.2111183297537402, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017924385498513864 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.8678009037418817, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.037508767497823454 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a371d5c749d70a7ea892cf7925f0e89fffd4e139 --- /dev/null +++ b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.18405392984121038, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0029940021228191355 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.25514944671406053, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0033459375347498267 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.18429548803283216, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002297153333126352 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.04819840246533481, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0015092856540081493 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.06631560044043883, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016628335172097389 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.04688317854067561, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001094346737838606 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.13375051095662321, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002380892659084607 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.1853146721328658, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0025854840001956502 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.13173075629033756, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016475788902520988 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.1733338698061136, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0028479993519003807 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.2401891556702282, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003169734302090452 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.1733778342320142, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021681294491408274 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.9935371246792863, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08389052572374653 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..6c8569f6bcc8276eafbdabdfbe150d5bfc6bfb16 --- /dev/null +++ b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.056788240211910736, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0022506997974891542 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.08339625174295712, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0029290040087097207 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.05659885761358278, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001930223829324733 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.014380667558554284, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000977357379524512 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.02101316087357165, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011619283931715888 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.013810868807903593, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0006930003786904605 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.04257610439946902, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0017464834049321358 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.06302882516092573, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022681193452949904 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.042007409754473535, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014258890197756744 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.05318358596123625, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0021166691321526723 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.07838773551352705, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002768443855799422 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.05299986098950862, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018068900698462816 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.5461443377554994, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.044799916123802616 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..86de4c906a76b5963e84d7085c0f330c0b202b30 --- /dev/null +++ b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.008911405879685974, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0009422475593785767 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.014063917251247313, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.001396048114694196 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.009231042835921754, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0008787834704319035 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0023383395560896076, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0003211681514690954 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.004206929070053043, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0006053899081506379 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0024939038248004536, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00031409966961336354 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.006838517996278174, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0007309899843193323 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.010882093390231358, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0011052580830821516 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.007056657402327181, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0006761968053255695 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.008357520892672053, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0008835066902880334 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.013172143042073951, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0013147147386398047 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.008623638497104687, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0008181729546128778 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.0112557087204399e-06, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 2.4150938941127893e-06 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..47d2f9752e9e467a317becd201a91cdd5183a550 --- /dev/null +++ b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 0.3138256134465956, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.042478650177872466 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.09054875368611116, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.001766981005220654 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.09899686345340121, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0016339321555193618 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.08626819478230045, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0014217358109946006 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.004635097118571576, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0004584060579508488 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.006688039413188141, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0004649040639428354 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.004707141554710639, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00037478881160494376 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.08868499326830791, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0016895151305050993 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.09761537714635501, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0015968584648120006 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.08482739955206746, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001374786595816378 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.0697631127415905, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0014027633376290944 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.07822809105458635, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.001337355132253774 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.06674662648443055, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0011005680176848465 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9255dd8453f0c24fe4bd28210cdf2203f85fa34c --- /dev/null +++ b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 6.380638446426456, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.09557609009378211 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.3328975043204133, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003509121285453912 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.45270988550034613, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.003062231284181297 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.3400267632274026, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0020403696620665606 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.1499329665878703, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002263691832530101 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.1983614972096706, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0020825028564397568 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.1486663277769484, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015136508138340347 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.2592088269282877, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002698465205645672 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.36228982914822633, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0026355915533943 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.2676344730573564, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001562435233832024 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.27037933017473875, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003062674460499446 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3663561677071641, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027976045204321166 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.2753891028069307, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019025285983907253 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9ac97505b9da1dc4488e5b581ed55918787306fa --- /dev/null +++ b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 6.3634920937473884, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.08616925114576995 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.2954297857860177, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0029411840632251563 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.48120558575759376, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002844478586577598 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.33740317942041553, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019381935089587043 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.13374093712808155, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0018836984171556573 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2183048706264126, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002121311703914965 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.1507673483604289, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014399337503822303 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.23342217539614568, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002214559299952568 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3907575640409568, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002505134070265779 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.26968155815162015, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0014592699636009493 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.23886641857979662, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0025627121020893886 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3892155963722792, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002698506480779706 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.272599814561145, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00182733683050086 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..b1c8ba5f86476342e579708973e9e62bcb50656e --- /dev/null +++ b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 6.870066226903575, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.09429297050700895 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.2995136417218204, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002877943933866991 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4870699752407154, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028159974008536274 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.3444162792721549, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019646659016549933 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.13989458331193982, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0019259857294848428 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2283186786335389, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002178927464111703 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.158839720125521, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014985646156960436 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.24052403808712888, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0022515901969111986 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.4008413679522952, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025096595704963047 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.2794048970004934, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015380955393133962 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.24501488471812358, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002592614810126833 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.39799000178268285, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002715969584528833 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.2812645652371936, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00189815704910076 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..df03d22a9206dcbbcf096e60a2ba337bea2e6a34 --- /dev/null +++ b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 7.017381678234455, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.09166907850331675 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.30143650107181685, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0029314795240309452 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4898031820273571, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0027687333603014837 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.3465967840398284, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019574393578079014 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.14062404051839908, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0019112056232477572 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2300710505484132, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021943432548559823 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.16013598883167798, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015096765844560063 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.2422982418880056, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0022053611865584053 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.4046980533581673, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024846126430562907 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.2820802687702902, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015095048991938657 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.2479193537232055, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0025849971540480303 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.40376209522673123, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002721581361466135 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.28522597669249417, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018960405914839836 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..39f5e9e57ef31a184660257cd89d7427bf4e4cb3 --- /dev/null +++ b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 7.260211722864896, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.12365349150703955 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.3111425574072635, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002912308644972262 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.486978690036314, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002680144522191287 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.3539789382700311, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002014807799247425 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.14647797107879432, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001938088224804308 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2296020100709576, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021177366913279795 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.1648812739511937, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015410229627803459 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.24849239666244133, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002231342628042193 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.39930639747976754, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002405155865675547 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.2855363745230388, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0015231421878303527 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.25960917213172013, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0026365144213955465 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.4057201628355578, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002662489705069069 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.29494158509832297, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019724923487253014 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_gem_xsum_article_DOC_summary_0.json b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..cdd9c43f042165eb8a6338fc60ff9e3845d15905 --- /dev/null +++ b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.17021358510341336, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0024373039382226077 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.3520563857681371, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0043918973093069365 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.21991779087563262, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025921031610263 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.03871189783801819, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0013940343459980615 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0841631095984103, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002861696063072104 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.050742967235947956, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0016968813344534614 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.12632426569550884, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001895537050144316 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.262981285619352, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0034911247032713426 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.1634042310463717, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002018854408540066 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.13144702322249674, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001966787320258499 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.2761271441560725, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0039020054143034 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.17070284614092165, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0021883919787213176 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.0719375915266327, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.09381739892136316 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_gem_xsum_article_DOC_summary_1.json b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..9d9cb00702ae94bae4021984ed75ab3486298e52 --- /dev/null +++ b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.13144344568204896, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018269600623641938 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.32351107047900046, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0042227591238073415 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.18467573869385082, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0024560883653668973 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.027812878457341133, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0009822890456735115 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.07174244190002714, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002623646025136939 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.03956915695649403, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0013969113381600835 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.10177712280555466, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0013642070339291124 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.2530115289147634, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003361296390437375 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.1433445973017209, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018589035220238875 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.10392255150260828, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0014996096643151138 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.25806957261490177, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003642123743587609 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.1463591812261117, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0020474760089749054 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.5179351179741758, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.05443901746520919 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_gem_xsum_article_DOC_summary_2.json b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..ce1b0073c59c10320afd6e1532e284314b96d557 --- /dev/null +++ b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.1310122719121047, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.001774463078853173 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.32197642898756434, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004128514222989525 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.18395936513372504, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0023841684164883858 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.028592074721851848, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010146723977319822 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.07420088979991933, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0027242327105131063 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.040730679478674064, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0014445249170945054 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.10506688444375611, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0013615120559406814 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.2609222630071341, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0034251450207401918 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.14791071749656537, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018605423267814124 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.1039952495955331, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.00147131239785375 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.2579710996210495, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0036391736637925577 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.1463682832676705, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0020126152822738364 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.5610996318449655, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.061361742735683046 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_gem_xsum_article_DOC_summary_3.json b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7efd5582078c7748b57c94b6d9fa278523c6d7eb --- /dev/null +++ b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.1284212760059797, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002041598803417733 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.3028097339054088, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004615535937102858 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.17637720631581139, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00264797834039018 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.026902495307978534, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010371639486970465 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.06717388422150615, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0027080652362817086 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.03767895922224648, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001448043764579111 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.1032652428870357, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001609764469770287 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.24490235352225267, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003766948264946798 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.14193633867939442, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002090431283179229 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.10103180270559059, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0016751299955245328 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.2403006525007405, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003943762632099877 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.13894288832668733, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002188039489816412 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.5762937299614814, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07362827411239845 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_gem_xsum_article_DOC_summary_4.json b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..c114cf2f4c0a1f6f725884bccbeda3474c2c84dd --- /dev/null +++ b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.041468733586705914, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0025971833623889785 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.07627594081632273, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004409224115860876 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.048911634240246346, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002765697903406571 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.009048514108124975, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0012181947613622947 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.01699435587690579, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.001625353914816647 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.01035393012550112, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0009785561170831303 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.03361424158025485, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.002192257614857122 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.06101985571016838, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035062484157730514 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.03907075670579812, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002182547417010533 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.03401363654888806, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0022524026754152373 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0612418372651088, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0035630181436990096 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.039302040446406394, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0022230245548452298 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.7513821615574038, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07905589981346285 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_gem_xsum_article_DOC_summary_5.json b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f46178bd94ec16c07fe21cb67f25ec770af1053c --- /dev/null +++ b/4b284b17bc4/evaluation/generation/slim.4b284b17bc4_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.003214034101498322, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0009221332972255172 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.002755372289086832, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0007486991780011711 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.002904097352119159, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0008025657506213006 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0004553870087049595, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0002660277152615564 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0002936371804296332, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00017278177414040624 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0003555930988203656, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00020825306662602857 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0023980769070289483, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0006896763285113808 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0020343184807463843, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0005525851742486296 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.0021617867610418057, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.000600765218788033 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.0027490625210994384, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0008051771911473355 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0023634542664537216, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0006464968490936459 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0024822527344372726, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0006926475370230472 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 5.9664964945316196e-36, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 6.37663460549604e-30 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b17bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/4b284b21bc4_0.json b/4b284b21bc4/evaluation/4b284b21bc4_0.json new file mode 100644 index 0000000000000000000000000000000000000000..5aac52607ef41dfad6ed0068fdd89edf603e0a02 --- /dev/null +++ b/4b284b21bc4/evaluation/4b284b21bc4_0.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.332, + "acc_stderr": 0.014899597242811485 + }, + "anli_r2": { + "acc": 0.337, + "acc_stderr": 0.0149550879186536 + }, + "anli_r3": { + "acc": 0.355, + "acc_stderr": 0.013819249004047296 + }, + "cb": { + "acc": 0.48214285714285715, + "acc_stderr": 0.0673769750864465, + "f1": 0.4347442680776014 + }, + "copa": { + "acc": 0.76, + "acc_stderr": 0.04292346959909283 + }, + "hellaswag": { + "acc": 0.4841665006970723, + "acc_stderr": 0.004987278910505115, + "acc_norm": 0.6352320254929297, + "acc_norm_stderr": 0.004803812631994966 + }, + "rte": { + "acc": 0.5306859205776173, + "acc_stderr": 0.03003973059219781 + }, + "winogrande": { + "acc": 0.5990528808208366, + "acc_stderr": 0.013773974554948033 + }, + "storycloze_2016": { + "acc": 0.7151256012827365, + "acc_stderr": 0.010437513986611718 + }, + "boolq": { + "acc": 0.5669724770642202, + "acc_stderr": 0.008666251305518059 + }, + "arc_easy": { + "acc": 0.617003367003367, + "acc_stderr": 0.009974920384536469, + "acc_norm": 0.5462962962962963, + "acc_norm_stderr": 0.010215708295494117 + }, + "arc_challenge": { + "acc": 0.28668941979522183, + "acc_stderr": 0.013214986329274757, + "acc_norm": 0.30631399317406144, + "acc_norm_stderr": 0.013470584417276513 + }, + "sciq": { + "acc": 0.845, + "acc_stderr": 0.011450157470799475, + "acc_norm": 0.757, + "acc_norm_stderr": 0.013569640199177458 + }, + "piqa": { + "acc": 0.7578890097932536, + "acc_stderr": 0.00999437126910438, + "acc_norm": 0.7676822633297062, + "acc_norm_stderr": 0.009853201384168243 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/4b284b21bc4_0_lm-eval_global_step80108_2023-01-30-11-26-38_0shots_backup.json b/4b284b21bc4/evaluation/4b284b21bc4_0_lm-eval_global_step80108_2023-01-30-11-26-38_0shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..5aac52607ef41dfad6ed0068fdd89edf603e0a02 --- /dev/null +++ b/4b284b21bc4/evaluation/4b284b21bc4_0_lm-eval_global_step80108_2023-01-30-11-26-38_0shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.332, + "acc_stderr": 0.014899597242811485 + }, + "anli_r2": { + "acc": 0.337, + "acc_stderr": 0.0149550879186536 + }, + "anli_r3": { + "acc": 0.355, + "acc_stderr": 0.013819249004047296 + }, + "cb": { + "acc": 0.48214285714285715, + "acc_stderr": 0.0673769750864465, + "f1": 0.4347442680776014 + }, + "copa": { + "acc": 0.76, + "acc_stderr": 0.04292346959909283 + }, + "hellaswag": { + "acc": 0.4841665006970723, + "acc_stderr": 0.004987278910505115, + "acc_norm": 0.6352320254929297, + "acc_norm_stderr": 0.004803812631994966 + }, + "rte": { + "acc": 0.5306859205776173, + "acc_stderr": 0.03003973059219781 + }, + "winogrande": { + "acc": 0.5990528808208366, + "acc_stderr": 0.013773974554948033 + }, + "storycloze_2016": { + "acc": 0.7151256012827365, + "acc_stderr": 0.010437513986611718 + }, + "boolq": { + "acc": 0.5669724770642202, + "acc_stderr": 0.008666251305518059 + }, + "arc_easy": { + "acc": 0.617003367003367, + "acc_stderr": 0.009974920384536469, + "acc_norm": 0.5462962962962963, + "acc_norm_stderr": 0.010215708295494117 + }, + "arc_challenge": { + "acc": 0.28668941979522183, + "acc_stderr": 0.013214986329274757, + "acc_norm": 0.30631399317406144, + "acc_norm_stderr": 0.013470584417276513 + }, + "sciq": { + "acc": 0.845, + "acc_stderr": 0.011450157470799475, + "acc_norm": 0.757, + "acc_norm_stderr": 0.013569640199177458 + }, + "piqa": { + "acc": 0.7578890097932536, + "acc_stderr": 0.00999437126910438, + "acc_norm": 0.7676822633297062, + "acc_norm_stderr": 0.009853201384168243 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/4b284b21bc4_1.json b/4b284b21bc4/evaluation/4b284b21bc4_1.json new file mode 100644 index 0000000000000000000000000000000000000000..8e72f1ec94f0cbd37c4dd210b0ac569424148625 --- /dev/null +++ b/4b284b21bc4/evaluation/4b284b21bc4_1.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.336, + "acc_stderr": 0.01494414023379502 + }, + "anli_r2": { + "acc": 0.315, + "acc_stderr": 0.014696631960792506 + }, + "anli_r3": { + "acc": 0.34, + "acc_stderr": 0.0136804957257678 + }, + "cb": { + "acc": 0.5357142857142857, + "acc_stderr": 0.06724777654937658, + "f1": 0.38181818181818183 + }, + "copa": { + "acc": 0.74, + "acc_stderr": 0.04408440022768077 + }, + "hellaswag": { + "acc": 0.48137821151165106, + "acc_stderr": 0.004986319587524962, + "acc_norm": 0.6344353714399522, + "acc_norm_stderr": 0.004806039039008954 + }, + "rte": { + "acc": 0.5451263537906137, + "acc_stderr": 0.029973636495415252 + }, + "winogrande": { + "acc": 0.5974743488555643, + "acc_stderr": 0.013782866831703048 + }, + "storycloze_2016": { + "acc": 0.7044361304115446, + "acc_stderr": 0.01055177883937378 + }, + "boolq": { + "acc": 0.5669724770642202, + "acc_stderr": 0.008666251305518059 + }, + "arc_easy": { + "acc": 0.6220538720538721, + "acc_stderr": 0.009949405744045452, + "acc_norm": 0.5787037037037037, + "acc_norm_stderr": 0.010131882498193127 + }, + "arc_challenge": { + "acc": 0.29266211604095566, + "acc_stderr": 0.01329591610361942, + "acc_norm": 0.32849829351535836, + "acc_norm_stderr": 0.013724978465537357 + }, + "sciq": { + "acc": 0.891, + "acc_stderr": 0.00985982840703719, + "acc_norm": 0.871, + "acc_norm_stderr": 0.010605256784796579 + }, + "piqa": { + "acc": 0.7551686615886833, + "acc_stderr": 0.010032309105568788, + "acc_norm": 0.764961915125136, + "acc_norm_stderr": 0.009893146688805308 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/4b284b21bc4_1_lm-eval_global_step80108_2023-01-30-11-26-38_1shots_backup.json b/4b284b21bc4/evaluation/4b284b21bc4_1_lm-eval_global_step80108_2023-01-30-11-26-38_1shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..8e72f1ec94f0cbd37c4dd210b0ac569424148625 --- /dev/null +++ b/4b284b21bc4/evaluation/4b284b21bc4_1_lm-eval_global_step80108_2023-01-30-11-26-38_1shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.336, + "acc_stderr": 0.01494414023379502 + }, + "anli_r2": { + "acc": 0.315, + "acc_stderr": 0.014696631960792506 + }, + "anli_r3": { + "acc": 0.34, + "acc_stderr": 0.0136804957257678 + }, + "cb": { + "acc": 0.5357142857142857, + "acc_stderr": 0.06724777654937658, + "f1": 0.38181818181818183 + }, + "copa": { + "acc": 0.74, + "acc_stderr": 0.04408440022768077 + }, + "hellaswag": { + "acc": 0.48137821151165106, + "acc_stderr": 0.004986319587524962, + "acc_norm": 0.6344353714399522, + "acc_norm_stderr": 0.004806039039008954 + }, + "rte": { + "acc": 0.5451263537906137, + "acc_stderr": 0.029973636495415252 + }, + "winogrande": { + "acc": 0.5974743488555643, + "acc_stderr": 0.013782866831703048 + }, + "storycloze_2016": { + "acc": 0.7044361304115446, + "acc_stderr": 0.01055177883937378 + }, + "boolq": { + "acc": 0.5669724770642202, + "acc_stderr": 0.008666251305518059 + }, + "arc_easy": { + "acc": 0.6220538720538721, + "acc_stderr": 0.009949405744045452, + "acc_norm": 0.5787037037037037, + "acc_norm_stderr": 0.010131882498193127 + }, + "arc_challenge": { + "acc": 0.29266211604095566, + "acc_stderr": 0.01329591610361942, + "acc_norm": 0.32849829351535836, + "acc_norm_stderr": 0.013724978465537357 + }, + "sciq": { + "acc": 0.891, + "acc_stderr": 0.00985982840703719, + "acc_norm": 0.871, + "acc_norm_stderr": 0.010605256784796579 + }, + "piqa": { + "acc": 0.7551686615886833, + "acc_stderr": 0.010032309105568788, + "acc_norm": 0.764961915125136, + "acc_norm_stderr": 0.009893146688805308 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/4b284b21bc4_2.json b/4b284b21bc4/evaluation/4b284b21bc4_2.json new file mode 100644 index 0000000000000000000000000000000000000000..eb7e2dbe363df0445d27b1f90445c4562a1b6234 --- /dev/null +++ b/4b284b21bc4/evaluation/4b284b21bc4_2.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.327, + "acc_stderr": 0.014842213153411247 + }, + "anli_r2": { + "acc": 0.333, + "acc_stderr": 0.01491084616422986 + }, + "anli_r3": { + "acc": 0.3408333333333333, + "acc_stderr": 0.01368860079329693 + }, + "cb": { + "acc": 0.5357142857142857, + "acc_stderr": 0.06724777654937658, + "f1": 0.3829365079365079 + }, + "copa": { + "acc": 0.78, + "acc_stderr": 0.04163331998932262 + }, + "hellaswag": { + "acc": 0.48048197570205137, + "acc_stderr": 0.00498597821493792, + "acc_norm": 0.6397132045409281, + "acc_norm_stderr": 0.004791024004587989 + }, + "rte": { + "acc": 0.5090252707581228, + "acc_stderr": 0.030091559826331334 + }, + "winogrande": { + "acc": 0.6053670086819258, + "acc_stderr": 0.013736915172371883 + }, + "storycloze_2016": { + "acc": 0.7161945483698557, + "acc_stderr": 0.01042569627973092 + }, + "boolq": { + "acc": 0.5920489296636086, + "acc_stderr": 0.008595583792654892 + }, + "arc_easy": { + "acc": 0.622895622895623, + "acc_stderr": 0.009945041946366499, + "acc_norm": 0.6018518518518519, + "acc_norm_stderr": 0.010044662374653398 + }, + "arc_challenge": { + "acc": 0.295221843003413, + "acc_stderr": 0.013329750293382318, + "acc_norm": 0.32337883959044367, + "acc_norm_stderr": 0.013669421630012129 + }, + "sciq": { + "acc": 0.903, + "acc_stderr": 0.009363689373248092, + "acc_norm": 0.882, + "acc_norm_stderr": 0.010206869264381791 + }, + "piqa": { + "acc": 0.7578890097932536, + "acc_stderr": 0.009994371269104376, + "acc_norm": 0.7682263329706203, + "acc_norm_stderr": 0.009845143772794043 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/4b284b21bc4_2_lm-eval_global_step80108_2023-01-30-11-26-38_2shots_backup.json b/4b284b21bc4/evaluation/4b284b21bc4_2_lm-eval_global_step80108_2023-01-30-11-26-38_2shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..eb7e2dbe363df0445d27b1f90445c4562a1b6234 --- /dev/null +++ b/4b284b21bc4/evaluation/4b284b21bc4_2_lm-eval_global_step80108_2023-01-30-11-26-38_2shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.327, + "acc_stderr": 0.014842213153411247 + }, + "anli_r2": { + "acc": 0.333, + "acc_stderr": 0.01491084616422986 + }, + "anli_r3": { + "acc": 0.3408333333333333, + "acc_stderr": 0.01368860079329693 + }, + "cb": { + "acc": 0.5357142857142857, + "acc_stderr": 0.06724777654937658, + "f1": 0.3829365079365079 + }, + "copa": { + "acc": 0.78, + "acc_stderr": 0.04163331998932262 + }, + "hellaswag": { + "acc": 0.48048197570205137, + "acc_stderr": 0.00498597821493792, + "acc_norm": 0.6397132045409281, + "acc_norm_stderr": 0.004791024004587989 + }, + "rte": { + "acc": 0.5090252707581228, + "acc_stderr": 0.030091559826331334 + }, + "winogrande": { + "acc": 0.6053670086819258, + "acc_stderr": 0.013736915172371883 + }, + "storycloze_2016": { + "acc": 0.7161945483698557, + "acc_stderr": 0.01042569627973092 + }, + "boolq": { + "acc": 0.5920489296636086, + "acc_stderr": 0.008595583792654892 + }, + "arc_easy": { + "acc": 0.622895622895623, + "acc_stderr": 0.009945041946366499, + "acc_norm": 0.6018518518518519, + "acc_norm_stderr": 0.010044662374653398 + }, + "arc_challenge": { + "acc": 0.295221843003413, + "acc_stderr": 0.013329750293382318, + "acc_norm": 0.32337883959044367, + "acc_norm_stderr": 0.013669421630012129 + }, + "sciq": { + "acc": 0.903, + "acc_stderr": 0.009363689373248092, + "acc_norm": 0.882, + "acc_norm_stderr": 0.010206869264381791 + }, + "piqa": { + "acc": 0.7578890097932536, + "acc_stderr": 0.009994371269104376, + "acc_norm": 0.7682263329706203, + "acc_norm_stderr": 0.009845143772794043 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/4b284b21bc4_3.json b/4b284b21bc4/evaluation/4b284b21bc4_3.json new file mode 100644 index 0000000000000000000000000000000000000000..c475d11569c652fd452aac5f851bd783c9fe644a --- /dev/null +++ b/4b284b21bc4/evaluation/4b284b21bc4_3.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.338, + "acc_stderr": 0.014965960710224496 + }, + "anli_r2": { + "acc": 0.345, + "acc_stderr": 0.015039986742055238 + }, + "anli_r3": { + "acc": 0.3566666666666667, + "acc_stderr": 0.013833742805050717 + }, + "cb": { + "acc": 0.6071428571428571, + "acc_stderr": 0.0658538889806635, + "f1": 0.5367003367003368 + }, + "copa": { + "acc": 0.8, + "acc_stderr": 0.040201512610368445 + }, + "hellaswag": { + "acc": 0.4826727743477395, + "acc_stderr": 0.004986784319771787, + "acc_norm": 0.6368253335988847, + "acc_norm_stderr": 0.004799317209902001 + }, + "rte": { + "acc": 0.5631768953068592, + "acc_stderr": 0.029855247390314945 + }, + "winogrande": { + "acc": 0.6037884767166535, + "acc_stderr": 0.013746404157154949 + }, + "storycloze_2016": { + "acc": 0.7204703367183325, + "acc_stderr": 0.01037770209970486 + }, + "boolq": { + "acc": 0.5923547400611621, + "acc_stderr": 0.008594580270731619 + }, + "arc_easy": { + "acc": 0.627104377104377, + "acc_stderr": 0.009922743197129257, + "acc_norm": 0.609006734006734, + "acc_norm_stderr": 0.010012992232540631 + }, + "arc_challenge": { + "acc": 0.29436860068259385, + "acc_stderr": 0.013318528460539429, + "acc_norm": 0.3319112627986348, + "acc_norm_stderr": 0.01376098820088054 + }, + "sciq": { + "acc": 0.913, + "acc_stderr": 0.0089168666307459, + "acc_norm": 0.897, + "acc_norm_stderr": 0.009616833339695798 + }, + "piqa": { + "acc": 0.7589771490750816, + "acc_stderr": 0.009979042717267314, + "acc_norm": 0.7742110990206746, + "acc_norm_stderr": 0.009754980670917311 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/4b284b21bc4_3_lm-eval_global_step80108_2023-01-30-11-26-38_3shots_backup.json b/4b284b21bc4/evaluation/4b284b21bc4_3_lm-eval_global_step80108_2023-01-30-11-26-38_3shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..c475d11569c652fd452aac5f851bd783c9fe644a --- /dev/null +++ b/4b284b21bc4/evaluation/4b284b21bc4_3_lm-eval_global_step80108_2023-01-30-11-26-38_3shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.338, + "acc_stderr": 0.014965960710224496 + }, + "anli_r2": { + "acc": 0.345, + "acc_stderr": 0.015039986742055238 + }, + "anli_r3": { + "acc": 0.3566666666666667, + "acc_stderr": 0.013833742805050717 + }, + "cb": { + "acc": 0.6071428571428571, + "acc_stderr": 0.0658538889806635, + "f1": 0.5367003367003368 + }, + "copa": { + "acc": 0.8, + "acc_stderr": 0.040201512610368445 + }, + "hellaswag": { + "acc": 0.4826727743477395, + "acc_stderr": 0.004986784319771787, + "acc_norm": 0.6368253335988847, + "acc_norm_stderr": 0.004799317209902001 + }, + "rte": { + "acc": 0.5631768953068592, + "acc_stderr": 0.029855247390314945 + }, + "winogrande": { + "acc": 0.6037884767166535, + "acc_stderr": 0.013746404157154949 + }, + "storycloze_2016": { + "acc": 0.7204703367183325, + "acc_stderr": 0.01037770209970486 + }, + "boolq": { + "acc": 0.5923547400611621, + "acc_stderr": 0.008594580270731619 + }, + "arc_easy": { + "acc": 0.627104377104377, + "acc_stderr": 0.009922743197129257, + "acc_norm": 0.609006734006734, + "acc_norm_stderr": 0.010012992232540631 + }, + "arc_challenge": { + "acc": 0.29436860068259385, + "acc_stderr": 0.013318528460539429, + "acc_norm": 0.3319112627986348, + "acc_norm_stderr": 0.01376098820088054 + }, + "sciq": { + "acc": 0.913, + "acc_stderr": 0.0089168666307459, + "acc_norm": 0.897, + "acc_norm_stderr": 0.009616833339695798 + }, + "piqa": { + "acc": 0.7589771490750816, + "acc_stderr": 0.009979042717267314, + "acc_norm": 0.7742110990206746, + "acc_norm_stderr": 0.009754980670917311 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/4b284b21bc4_4.json b/4b284b21bc4/evaluation/4b284b21bc4_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a3b9ddb1c138667efb85173f7c157562aeef6d68 --- /dev/null +++ b/4b284b21bc4/evaluation/4b284b21bc4_4.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.343, + "acc_stderr": 0.015019206922356951 + }, + "anli_r2": { + "acc": 0.346, + "acc_stderr": 0.01505026612756445 + }, + "anli_r3": { + "acc": 0.36083333333333334, + "acc_stderr": 0.01386918025244486 + }, + "cb": { + "acc": 0.5535714285714286, + "acc_stderr": 0.06703189227942395, + "f1": 0.4583333333333333 + }, + "copa": { + "acc": 0.8, + "acc_stderr": 0.040201512610368445 + }, + "hellaswag": { + "acc": 0.48157737502489545, + "acc_stderr": 0.0049863932662691625, + "acc_norm": 0.6417048396733719, + "acc_norm_stderr": 0.00478519504988916 + }, + "rte": { + "acc": 0.5379061371841155, + "acc_stderr": 0.030009848912529113 + }, + "winogrande": { + "acc": 0.6085240726124704, + "acc_stderr": 0.01371748707129085 + }, + "storycloze_2016": { + "acc": 0.7338321753073223, + "acc_stderr": 0.010220104800551206 + }, + "boolq": { + "acc": 0.6119266055045871, + "acc_stderr": 0.00852313058476084 + }, + "arc_easy": { + "acc": 0.6283670033670034, + "acc_stderr": 0.00991589712365879, + "acc_norm": 0.6153198653198653, + "acc_norm_stderr": 0.009983171707008997 + }, + "arc_challenge": { + "acc": 0.2960750853242321, + "acc_stderr": 0.013340916085246271, + "acc_norm": 0.3242320819112628, + "acc_norm_stderr": 0.013678810399518819 + }, + "sciq": { + "acc": 0.923, + "acc_stderr": 0.008434580140240648, + "acc_norm": 0.912, + "acc_norm_stderr": 0.008963053962592074 + }, + "piqa": { + "acc": 0.7595212187159956, + "acc_stderr": 0.009971345364651078, + "acc_norm": 0.7676822633297062, + "acc_norm_stderr": 0.009853201384168243 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/4b284b21bc4_4_lm-eval_global_step80108_2023-01-30-11-26-38_4shots_backup.json b/4b284b21bc4/evaluation/4b284b21bc4_4_lm-eval_global_step80108_2023-01-30-11-26-38_4shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..a3b9ddb1c138667efb85173f7c157562aeef6d68 --- /dev/null +++ b/4b284b21bc4/evaluation/4b284b21bc4_4_lm-eval_global_step80108_2023-01-30-11-26-38_4shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.343, + "acc_stderr": 0.015019206922356951 + }, + "anli_r2": { + "acc": 0.346, + "acc_stderr": 0.01505026612756445 + }, + "anli_r3": { + "acc": 0.36083333333333334, + "acc_stderr": 0.01386918025244486 + }, + "cb": { + "acc": 0.5535714285714286, + "acc_stderr": 0.06703189227942395, + "f1": 0.4583333333333333 + }, + "copa": { + "acc": 0.8, + "acc_stderr": 0.040201512610368445 + }, + "hellaswag": { + "acc": 0.48157737502489545, + "acc_stderr": 0.0049863932662691625, + "acc_norm": 0.6417048396733719, + "acc_norm_stderr": 0.00478519504988916 + }, + "rte": { + "acc": 0.5379061371841155, + "acc_stderr": 0.030009848912529113 + }, + "winogrande": { + "acc": 0.6085240726124704, + "acc_stderr": 0.01371748707129085 + }, + "storycloze_2016": { + "acc": 0.7338321753073223, + "acc_stderr": 0.010220104800551206 + }, + "boolq": { + "acc": 0.6119266055045871, + "acc_stderr": 0.00852313058476084 + }, + "arc_easy": { + "acc": 0.6283670033670034, + "acc_stderr": 0.00991589712365879, + "acc_norm": 0.6153198653198653, + "acc_norm_stderr": 0.009983171707008997 + }, + "arc_challenge": { + "acc": 0.2960750853242321, + "acc_stderr": 0.013340916085246271, + "acc_norm": 0.3242320819112628, + "acc_norm_stderr": 0.013678810399518819 + }, + "sciq": { + "acc": 0.923, + "acc_stderr": 0.008434580140240648, + "acc_norm": 0.912, + "acc_norm_stderr": 0.008963053962592074 + }, + "piqa": { + "acc": 0.7595212187159956, + "acc_stderr": 0.009971345364651078, + "acc_norm": 0.7676822633297062, + "acc_norm_stderr": 0.009853201384168243 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/4b284b21bc4_5.json b/4b284b21bc4/evaluation/4b284b21bc4_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ed2114b78288086a92003220343f1d693f0907ab --- /dev/null +++ b/4b284b21bc4/evaluation/4b284b21bc4_5.json @@ -0,0 +1,66 @@ +{ + "results": { + "anli_r1": { + "acc": 0.351, + "acc_stderr": 0.015100563798316405 + }, + "anli_r2": { + "acc": 0.345, + "acc_stderr": 0.015039986742055237 + }, + "anli_r3": { + "acc": 0.345, + "acc_stderr": 0.013728421539454878 + }, + "cb": { + "acc": 0.5714285714285714, + "acc_stderr": 0.06672848092813058, + "f1": 0.37671957671957673 + }, + "copa": { + "acc": 0.78, + "acc_stderr": 0.04163331998932261 + }, + "hellaswag": { + "acc": 0.4827723561043617, + "acc_stderr": 0.004986818680313444, + "acc_norm": 0.6446922923720374, + "acc_norm_stderr": 0.004776283203468094 + }, + "rte": { + "acc": 0.5776173285198556, + "acc_stderr": 0.02973162264649588 + }, + "winogrande": { + "acc": 0.595895816890292, + "acc_stderr": 0.013791610664670845 + }, + "storycloze_2016": { + "acc": 0.7252805986103688, + "acc_stderr": 0.010322309878339507 + }, + "boolq": { + "acc": 0.6146788990825688, + "acc_stderr": 0.008511930879680652 + }, + "arc_easy": { + "acc": 0.6300505050505051, + "acc_stderr": 0.009906656266021155, + "acc_norm": 0.6111111111111112, + "acc_norm_stderr": 0.01000324833531377 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/4b284b21bc4_5_lm-eval_global_step80108_2023-01-30-11-26-38_5shots_backup.json b/4b284b21bc4/evaluation/4b284b21bc4_5_lm-eval_global_step80108_2023-01-30-11-26-38_5shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..ed2114b78288086a92003220343f1d693f0907ab --- /dev/null +++ b/4b284b21bc4/evaluation/4b284b21bc4_5_lm-eval_global_step80108_2023-01-30-11-26-38_5shots_backup.json @@ -0,0 +1,66 @@ +{ + "results": { + "anli_r1": { + "acc": 0.351, + "acc_stderr": 0.015100563798316405 + }, + "anli_r2": { + "acc": 0.345, + "acc_stderr": 0.015039986742055237 + }, + "anli_r3": { + "acc": 0.345, + "acc_stderr": 0.013728421539454878 + }, + "cb": { + "acc": 0.5714285714285714, + "acc_stderr": 0.06672848092813058, + "f1": 0.37671957671957673 + }, + "copa": { + "acc": 0.78, + "acc_stderr": 0.04163331998932261 + }, + "hellaswag": { + "acc": 0.4827723561043617, + "acc_stderr": 0.004986818680313444, + "acc_norm": 0.6446922923720374, + "acc_norm_stderr": 0.004776283203468094 + }, + "rte": { + "acc": 0.5776173285198556, + "acc_stderr": 0.02973162264649588 + }, + "winogrande": { + "acc": 0.595895816890292, + "acc_stderr": 0.013791610664670845 + }, + "storycloze_2016": { + "acc": 0.7252805986103688, + "acc_stderr": 0.010322309878339507 + }, + "boolq": { + "acc": 0.6146788990825688, + "acc_stderr": 0.008511930879680652 + }, + "arc_easy": { + "acc": 0.6300505050505051, + "acc_stderr": 0.009906656266021155, + "acc_norm": 0.6111111111111112, + "acc_norm_stderr": 0.01000324833531377 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6d8f8715fc2dbec93287f9eb67c5b7802ce7b051 --- /dev/null +++ b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.3423886062648571, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03277534172219839 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07116912691303, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015925050224480028 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.2998523601292701, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004615423133559915 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.1078155721409226, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020097112571708245 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03290155614985229, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009635800950074162 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1430151870039024, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003133828558045969 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.04998894903569846, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001254978454654862 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06817638471397719, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001440134234671522 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.2915042735714293, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004516752469805975 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10384529013890176, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018689379304153127 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.06758374630355668, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014755310584898722 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.2855613282697172, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004322222242728911 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10254842199703165, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018748895711891628 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..125091dbc98d6d2f6110709429c772384c7f2184 --- /dev/null +++ b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.43549317288896894, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.029438186163177924 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07179524471867899, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0013752072779383184 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.35855038856048876, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005049635846993475 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11204264924342298, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018816879797699437 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.032801419147362856, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008197872630256377 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1723852958712864, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0035877516735487143 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.051345397484036256, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011584347174914676 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06711558592890904, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.00122163800184802 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.33387823961526397, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004537882130758048 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10481665765175784, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016825296642133658 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.06830552553162243, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001292240550330291 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.33957294457499176, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004661961230072316 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10653343348961458, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017665097203702323 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..51847f3dc7eb3f978a329fc87cadd88b4815a3f5 --- /dev/null +++ b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.4731158648079456, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.019398006063203924 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07373712610103833, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0013232763617533117 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3838400605745808, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004971353366559517 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11565039220118124, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0017909510986326618 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03418002818557043, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007999127016124942 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.18966804432678097, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0037070132910163134 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.053828506115298144, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011299593484305154 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.0688866789263329, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001197716696827759 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.35416301817745277, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004408869173242153 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10799852330681028, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016342464145243179 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07036220911566307, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0012542262936579098 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3642675981143951, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004634298128673839 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.11029915615802689, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017015090669507955 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..93d700915a2dced4dfdeec65a8cf62d5649cb51b --- /dev/null +++ b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.511033492138013, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.016836817368392938 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.0711394556954671, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0012159991412829965 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3852018338546001, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005067790324136812 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11234455309812195, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001658068318944836 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03260081169248259, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007319564987104789 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.18947442868920766, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0037625536404537635 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.051724489676439236, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010463045011873814 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06636573472304302, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011254814997386617 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3527863813991901, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004403124502538447 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10462160106353212, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015374663592284314 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.0678291021786637, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0011705423632374366 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.36350690371493805, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004671323997080324 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10692341362108594, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001592257988435529 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fe9221ebdf3e9d46b83770f06182abfbecdbbd51 --- /dev/null +++ b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.5255384435057461, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03418580070894041 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07289393265058683, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001221203882596679 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.39210555607501246, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004907654599821545 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11520145885524227, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0016544612830049232 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03333523314795858, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007290358506575526 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.19457648128168623, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003616303249775819 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.052942763106877684, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010297898091120672 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06752250125401707, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011086146136256522 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.35896707400416245, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004279012704408649 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10655328646588215, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014999277772535491 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.06949836967046885, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0011669918564828742 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3713609435685992, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004540169352829801 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10971668045319283, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001577124609883823 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..53e9ff66a968d2321e5e4eaed57d9949319777d3 --- /dev/null +++ b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.5823615010118224, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.037398869921054644 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07346269739111895, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0012187681049390564 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.4042470714837263, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005065518163952181 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11639260450950206, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0016495247557604747 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.033940599163428696, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007391477550345831 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.20304234761076909, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003764961452207978 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.054089458597439195, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010444166935940533 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06792933252591714, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011100797550413972 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3687266029366595, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004388331338140494 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10741878654019481, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015017472778460627 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.06991029736225668, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001173008019162227 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.38168151833425595, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00467124723267976 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.11057542829413115, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0015808583588389114 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a04ea560bcc0474d88bfdfa709e04e0292bb2c70 --- /dev/null +++ b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.1463073350919793, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001849976731473478 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.24687692205659445, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0026245416604448834 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.17104966216653122, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018388410541572498 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.02868070817665277, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007455100631059865 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.050970339370236116, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0013773850589296726 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.03393757157227001, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008395819605445009 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.1134975332744349, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013046086784248739 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.19882847469371265, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002160779528818444 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.13431002294362815, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001329252784566218 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.1341572569615249, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001680913399139742 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.22735010800139077, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002429214690741554 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.1570085078363825, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016723834304551517 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.4552750330972206, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05646997136025405 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..4051788c590cc0a7c36d38c9a6a2c792a2a1e236 --- /dev/null +++ b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.19104818364291273, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0022212767005973058 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.3165545579856358, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002818835296854103 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.21883314563862602, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001968035833203396 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.04737338234449901, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0010449765003741809 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.081573363995006, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0017297723073854117 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.05423182118294372, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010465141353409224 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.1352656471757331, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015360042939327185 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.23168675674466935, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022572191515858007 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.15600647601501733, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013358035921943341 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.17973117727215357, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0020830282164021902 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.29889898054403574, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0026791790500151685 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.20602296990458846, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018407304080033926 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.807870663958701, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08388649431195123 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..46982b7aead4fd0284246be5d8eccca95e63c4c6 --- /dev/null +++ b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.22091940935835946, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003036544780507811 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.28592540020450335, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0028268458929152375 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.213319730404769, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001947942927707738 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0625668775391695, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001863481800778566 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.07636442527094099, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016705370533154184 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.05619324678157442, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011378174233996204 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.16524622277086715, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002504977446892061 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.21312172953880804, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002230572391304009 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.15707830961714367, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001424704542577375 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.2088533869488438, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0029209986594073956 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.27031759901723496, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00269091513844755 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.20133540483239756, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018432855451969799 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.0758182963274967, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.07269865886048295 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..99553da8824ba6c5c1679c081d0fd3d2013df41a --- /dev/null +++ b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.21388175415191102, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0037807201763198858 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.22109179833036308, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00317048229219639 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.17690311487463797, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0023173541475324253 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.06310561248802647, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0021350241076741575 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.05983363177094545, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015915281660158853 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.04775424867054453, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011657814934947282 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.16656647324261953, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0032154715784240336 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.16788106886771248, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0024768275501905303 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.13377606619329538, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017521905460361976 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.2032660883469589, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003645991164048245 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.20988404520976878, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003033569663633178 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.16765421945999662, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002202375233222233 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.2676357062273516, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.12290136548273946 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..2b2a87b5768349046192aeab42c0361a3700bcd0 --- /dev/null +++ b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.07217489085431607, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.003030440715398035 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.06719640775054124, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0025953918642397556 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.05381265634572307, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019615926895963105 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.021665967083708774, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0015098116987345535 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.01860172806438717, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011134284906677621 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.014607128479951145, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0007893617726734669 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.057955761496930415, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002549676328226009 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.05288705994275681, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0020802147270799964 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.041897511040161206, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015198623386063986 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.06845672227370601, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0028945029688266562 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.06371644081837961, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0024692144864177654 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.05091825334557307, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001857333758746474 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.3207302415446247, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03951812995159553 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9a0e2ba9890b3190b5b0186c3ab526346123f263 --- /dev/null +++ b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.012746310259293975, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001498370110228753 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.01036719903184163, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0011041663951146658 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.008600982975105755, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.000871586481245996 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0046837043167768724, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000851048694914921 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0032041325852383415, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00048168285623642387 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0027143726441978717, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0004021894165602589 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.010658331542732118, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013171585867806077 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.008285275201527513, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0008892336234217646 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.006953217638177468, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0007188116843350712 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.01225616498642844, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014511703007000516 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.009942438567258063, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0010638347073576458 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.008236010332581721, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0008379564793264335 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.2490869778774144e-09, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 4.579875964652809e-08 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..56d0dedcd1ed121b8235ffd1c305aed18ad45525 --- /dev/null +++ b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 3.3868925634521596, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.07415023515490869 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.12781588961197554, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0016392738386146906 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.26686211195039367, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0030010672113457175 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.16968795558812888, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0020520636735140885 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.04754240795799523, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001014281868832704 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.09732440743600192, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002035877095135508 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.06289750165250287, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0013193385142581298 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.11843660338175163, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0014270824435811313 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.24837983245494402, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0026307193382184542 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.15745602394794062, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0017868944105710683 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.11138415788379845, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0014602010171592388 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.23362654109870873, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027185132812130603 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.14802696104042146, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018361037263776232 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7d243622640b268a7a53a159b3c4f3b272aa32b0 --- /dev/null +++ b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 5.656637998489558, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.06138640019754124 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.22448431958439818, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0015774519438982097 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.475937695786901, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002823854240730373 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.29857293628895076, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0017883517178428313 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.09278798103068853, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0010182785337770847 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2039364568886483, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021426616098330804 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.12449829406834531, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0012625501065363337 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.18571244490415678, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0011675141052374309 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3980265502028155, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023515010622215645 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.2478615356783164, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013407338809586194 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.18244634491896064, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.001437571751990845 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3876575760439037, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002699802672015727 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.24278851384210426, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016843478489220692 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4da23fa6cc7f14e6378d00a94042bd74683bc66a --- /dev/null +++ b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 6.856901680068561, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.08124041054587171 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.2401800477102809, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0014473576335462571 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.5163816757468545, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0027490552702758896 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.32161895837988735, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.00168146427395452 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.10780300493629737, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0010069472486850257 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2413921319510364, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002257493837016372 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.14591205568832014, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001294685527449689 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.20140298531801745, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0011006514010170898 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.43694912141494435, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002368493512084033 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.2704562548681343, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013038206676934103 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.19780238487026972, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0013643352859564666 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.42666941865224894, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027363306527661384 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.2651360409168281, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00165326573625161 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..27910da617f14dfdd93a4ab406fc9751b23d0368 --- /dev/null +++ b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 7.067980420028392, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.07674666618798165 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.24119330473410452, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.001436265091489619 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.520447176584629, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0027841731171524635 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.3234918087831591, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0016866852215600267 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.1098856604345649, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0010230282133462645 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.24765707981063853, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002344687079258296 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.14908085018598377, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001325863965517803 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.2028877822846739, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00110597560698619 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.44168375010036015, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024210245845319865 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.2728226991547497, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001319038570084664 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.19995751457276323, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0013597786817193271 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.4331317668864426, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002771288232621466 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.2684669151387224, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016553181450139245 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..8ba8e90298043de44fe37af04a720f8d4b029a15 --- /dev/null +++ b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 7.301331414189049, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.07282835065460666 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.24454894011699824, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0014458985189821688 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.5282893998840518, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0027239476703194553 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.3281997540286874, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0016842914784987548 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.11241096101927037, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0010296837842975505 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.25292679532334217, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023135968641909677 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.15250785714191883, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0013289295083122636 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.20493346554385022, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0011217276272122712 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.44669665680269593, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023872195322647088 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.27576809325399554, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001332292317195587 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.20253099841039804, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0013836573195200987 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.4389119231936933, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027715255759945875 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.2720277049182356, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016797663432208906 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..96df9356b975f0833e6b88a5d7814a52dfd449bd --- /dev/null +++ b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 7.391640493190723, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.07696132388022625 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.248561550528162, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0014842009296995168 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.5361907676530253, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0026923692664488426 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.33316361794611193, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0016721318654132375 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.11498051543992278, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0010486619107346665 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2578870983510302, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022815014259324113 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.15567663749325128, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001316099001704392 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.2074081838133947, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0011575590389617165 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.45169489347455444, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023905395426844995 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.27881193769275936, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0013297414200118412 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.20657408552804282, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0014132706167399262 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.4467577031168348, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027053770792765386 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.27704641367351746, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001654188879883823 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_gem_xsum_article_DOC_summary_0.json b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..40dbe2a913453b5856df1ff18db24181985c4017 --- /dev/null +++ b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.14714404834539455, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018923895784899547 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.35332808198249066, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004277854613117222 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.20448305051370513, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0024499517599214865 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.03290270141635618, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010989269503369368 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.08287973270989689, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002833032880998302 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04639360161894793, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015361212037212923 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.10929554255441323, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001387362394565063 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.26469649902339626, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033616815736178403 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.1522143356903949, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018364801701113477 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.11672705515751206, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001546854617689576 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.28315180080173324, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0037966298013545237 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.1627172387554665, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0020740152357133783 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.8973746751821576, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07374608873039461 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_gem_xsum_article_DOC_summary_1.json b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..3ef6e2385def93257392d36afcc74e8b301a6168 --- /dev/null +++ b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.1344416388362992, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0019127327308320389 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.33053933219336484, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00435936977132835 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.18885391905418497, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025584792794476653 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.02919021495874484, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.001083579043412117 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.07395642387380598, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0027833547930418757 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.041352212313184845, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015256985849036882 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.10283551822207533, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014281321188522025 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.25511273674645885, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0034307162021354593 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.1447818747001178, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001931305342910804 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.10637101855903888, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015675521812090556 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.26365396988763345, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003725007857599503 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.14973596762001254, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002121716066142779 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.6712614198546765, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.08029090892350033 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_gem_xsum_article_DOC_summary_2.json b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..c544bb0aef00fd0b7f0fe2a9de26adaccb203c0a --- /dev/null +++ b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.14045160525988198, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018675726895835078 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.3448073202698317, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004206053313996407 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.19729124660372196, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0024865996635105967 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.03250319865239163, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.001104582027493848 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.08285013633430294, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0028679116547561338 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04614704751240165, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001558612721206191 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.1076421486644076, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.00138815532226034 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.2669756747819945, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033509483772327935 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.15159285154298055, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018751369925771642 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.11165266260752228, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015420215686911798 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.2762348157117063, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0036302813408504828 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.1571758204423751, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002082270276853359 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.7895958187086474, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.04787428909067152 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_gem_xsum_article_DOC_summary_3.json b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f087b27de9cbef69bead4963663c017989d5a1bc --- /dev/null +++ b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.1397482550494329, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0021614980521941947 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.3314206756572503, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0048258842534713575 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.19247676178691825, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0027783344646224338 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.032034815897723355, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011234176352706835 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.08021224571691461, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00290937059770342 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04492583126972709, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015630404482977962 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.10664386827077653, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0016219261205998713 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.25469242016766824, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003747468811937111 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.14710040508558128, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0020844906061187624 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.11038994072863482, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001775109847716677 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.2637144833686383, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004087246305150146 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.1523907734490472, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0023066662456799336 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.799886255448494, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06241605343512123 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_gem_xsum_article_DOC_summary_4.json b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fe35a49e7a5c816f87cffb64105d5c095efd9eb8 --- /dev/null +++ b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.04293551268263482, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0025300915700631186 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.08251909123638941, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0047552295627311435 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.052109895215743926, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0029325961974541494 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.008902116582691397, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0007882814434287245 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.020355209333257882, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0018399757489791825 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.01203821663060757, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001058756941479336 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.03291018535700006, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001988611171780747 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.06245660895956642, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0036021201516783473 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.039428572953528446, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002210624192813922 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.03518063964656488, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0021155739741956746 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0671088243853192, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003920638757312349 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0423468971865807, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002398760878941016 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.912241430159217, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1508779538575514 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_gem_xsum_article_DOC_summary_5.json b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..88046878533f42e9f3345a467e4923023eebffcb --- /dev/null +++ b/4b284b21bc4/evaluation/generation/slim.4b284b21bc4_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.002739968547331711, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0008278012212952025 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.0022134599272398445, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0006273422111807437 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.0023684978392920267, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.000684399718764436 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.000376696230153845, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00016337393027254123 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.00030314025597044463, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00012776302473939982 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0003256051958251534, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00013752393773970521 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0021741866307845633, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0006503820290684123 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0018275205470825698, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0005304151694392347 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.0019272015023736237, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0005653230599258551 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.002299258100195655, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0006916877058747434 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.0019069310038515171, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0005514157386454834 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0020216316744952494, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0005921597049542807 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 3.097815638153428e-39, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 5.018895149426352e-34 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b21bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/4b284b28bc4_0.json b/4b284b28bc4/evaluation/4b284b28bc4_0.json new file mode 100644 index 0000000000000000000000000000000000000000..092d1b898067aa5e148ebd915a430c3d5464c4bd --- /dev/null +++ b/4b284b28bc4/evaluation/4b284b28bc4_0.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.344, + "acc_stderr": 0.015029633724408947 + }, + "anli_r2": { + "acc": 0.321, + "acc_stderr": 0.01477082181793464 + }, + "anli_r3": { + "acc": 0.34833333333333333, + "acc_stderr": 0.01375943749887408 + }, + "cb": { + "acc": 0.35714285714285715, + "acc_stderr": 0.06460957383809221, + "f1": 0.1754385964912281 + }, + "copa": { + "acc": 0.8, + "acc_stderr": 0.040201512610368445 + }, + "hellaswag": { + "acc": 0.4792869946225851, + "acc_stderr": 0.004985498055190357, + "acc_norm": 0.6265684126667994, + "acc_norm_stderr": 0.004827266662144035 + }, + "rte": { + "acc": 0.5342960288808665, + "acc_stderr": 0.030025579819366422 + }, + "winogrande": { + "acc": 0.5753749013417522, + "acc_stderr": 0.013891893150264213 + }, + "storycloze_2016": { + "acc": 0.7231427044361304, + "acc_stderr": 0.01034711289027692 + }, + "boolq": { + "acc": 0.5700305810397553, + "acc_stderr": 0.008658853690729254 + }, + "arc_easy": { + "acc": 0.5984848484848485, + "acc_stderr": 0.010058790020755567, + "acc_norm": 0.5395622895622896, + "acc_norm_stderr": 0.01022761638628902 + }, + "arc_challenge": { + "acc": 0.27986348122866894, + "acc_stderr": 0.013119040897725922, + "acc_norm": 0.31143344709897613, + "acc_norm_stderr": 0.013532472099850942 + }, + "sciq": { + "acc": 0.848, + "acc_stderr": 0.011358918303475274, + "acc_norm": 0.769, + "acc_norm_stderr": 0.013334797216936438 + }, + "piqa": { + "acc": 0.7584330794341676, + "acc_stderr": 0.009986718001804467, + "acc_norm": 0.7633297062023939, + "acc_norm_stderr": 0.009916841655042809 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/4b284b28bc4_0_lm-eval_global_step80108_2023-01-30-11-26-39_0shots_backup.json b/4b284b28bc4/evaluation/4b284b28bc4_0_lm-eval_global_step80108_2023-01-30-11-26-39_0shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..092d1b898067aa5e148ebd915a430c3d5464c4bd --- /dev/null +++ b/4b284b28bc4/evaluation/4b284b28bc4_0_lm-eval_global_step80108_2023-01-30-11-26-39_0shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.344, + "acc_stderr": 0.015029633724408947 + }, + "anli_r2": { + "acc": 0.321, + "acc_stderr": 0.01477082181793464 + }, + "anli_r3": { + "acc": 0.34833333333333333, + "acc_stderr": 0.01375943749887408 + }, + "cb": { + "acc": 0.35714285714285715, + "acc_stderr": 0.06460957383809221, + "f1": 0.1754385964912281 + }, + "copa": { + "acc": 0.8, + "acc_stderr": 0.040201512610368445 + }, + "hellaswag": { + "acc": 0.4792869946225851, + "acc_stderr": 0.004985498055190357, + "acc_norm": 0.6265684126667994, + "acc_norm_stderr": 0.004827266662144035 + }, + "rte": { + "acc": 0.5342960288808665, + "acc_stderr": 0.030025579819366422 + }, + "winogrande": { + "acc": 0.5753749013417522, + "acc_stderr": 0.013891893150264213 + }, + "storycloze_2016": { + "acc": 0.7231427044361304, + "acc_stderr": 0.01034711289027692 + }, + "boolq": { + "acc": 0.5700305810397553, + "acc_stderr": 0.008658853690729254 + }, + "arc_easy": { + "acc": 0.5984848484848485, + "acc_stderr": 0.010058790020755567, + "acc_norm": 0.5395622895622896, + "acc_norm_stderr": 0.01022761638628902 + }, + "arc_challenge": { + "acc": 0.27986348122866894, + "acc_stderr": 0.013119040897725922, + "acc_norm": 0.31143344709897613, + "acc_norm_stderr": 0.013532472099850942 + }, + "sciq": { + "acc": 0.848, + "acc_stderr": 0.011358918303475274, + "acc_norm": 0.769, + "acc_norm_stderr": 0.013334797216936438 + }, + "piqa": { + "acc": 0.7584330794341676, + "acc_stderr": 0.009986718001804467, + "acc_norm": 0.7633297062023939, + "acc_norm_stderr": 0.009916841655042809 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/4b284b28bc4_1.json b/4b284b28bc4/evaluation/4b284b28bc4_1.json new file mode 100644 index 0000000000000000000000000000000000000000..564bd37debe84193e51b636c92f826bee997af45 --- /dev/null +++ b/4b284b28bc4/evaluation/4b284b28bc4_1.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.34, + "acc_stderr": 0.014987482264363937 + }, + "anli_r2": { + "acc": 0.321, + "acc_stderr": 0.014770821817934644 + }, + "anli_r3": { + "acc": 0.34, + "acc_stderr": 0.013680495725767803 + }, + "cb": { + "acc": 0.375, + "acc_stderr": 0.06527912098338669, + "f1": 0.32099491681373216 + }, + "copa": { + "acc": 0.77, + "acc_stderr": 0.04229525846816506 + }, + "hellaswag": { + "acc": 0.48078072097191793, + "acc_stderr": 0.004986093791041653, + "acc_norm": 0.6337382991435969, + "acc_norm_stderr": 0.004807975515446487 + }, + "rte": { + "acc": 0.5740072202166066, + "acc_stderr": 0.029764956741777645 + }, + "winogrande": { + "acc": 0.590370955011839, + "acc_stderr": 0.013821049109655453 + }, + "storycloze_2016": { + "acc": 0.7204703367183325, + "acc_stderr": 0.01037770209970486 + }, + "boolq": { + "acc": 0.5948012232415902, + "acc_stderr": 0.008586427929715515 + }, + "arc_easy": { + "acc": 0.6262626262626263, + "acc_stderr": 0.009927267058259628, + "acc_norm": 0.5917508417508418, + "acc_norm_stderr": 0.01008556619579125 + }, + "arc_challenge": { + "acc": 0.29266211604095566, + "acc_stderr": 0.013295916103619417, + "acc_norm": 0.32337883959044367, + "acc_norm_stderr": 0.013669421630012132 + }, + "sciq": { + "acc": 0.904, + "acc_stderr": 0.009320454434783227, + "acc_norm": 0.885, + "acc_norm_stderr": 0.01009340759490462 + }, + "piqa": { + "acc": 0.7622415669205659, + "acc_stderr": 0.009932525779525489, + "acc_norm": 0.763873775843308, + "acc_norm_stderr": 0.009908965890558218 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/4b284b28bc4_1_lm-eval_global_step80108_2023-01-30-11-26-39_1shots_backup.json b/4b284b28bc4/evaluation/4b284b28bc4_1_lm-eval_global_step80108_2023-01-30-11-26-39_1shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..564bd37debe84193e51b636c92f826bee997af45 --- /dev/null +++ b/4b284b28bc4/evaluation/4b284b28bc4_1_lm-eval_global_step80108_2023-01-30-11-26-39_1shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.34, + "acc_stderr": 0.014987482264363937 + }, + "anli_r2": { + "acc": 0.321, + "acc_stderr": 0.014770821817934644 + }, + "anli_r3": { + "acc": 0.34, + "acc_stderr": 0.013680495725767803 + }, + "cb": { + "acc": 0.375, + "acc_stderr": 0.06527912098338669, + "f1": 0.32099491681373216 + }, + "copa": { + "acc": 0.77, + "acc_stderr": 0.04229525846816506 + }, + "hellaswag": { + "acc": 0.48078072097191793, + "acc_stderr": 0.004986093791041653, + "acc_norm": 0.6337382991435969, + "acc_norm_stderr": 0.004807975515446487 + }, + "rte": { + "acc": 0.5740072202166066, + "acc_stderr": 0.029764956741777645 + }, + "winogrande": { + "acc": 0.590370955011839, + "acc_stderr": 0.013821049109655453 + }, + "storycloze_2016": { + "acc": 0.7204703367183325, + "acc_stderr": 0.01037770209970486 + }, + "boolq": { + "acc": 0.5948012232415902, + "acc_stderr": 0.008586427929715515 + }, + "arc_easy": { + "acc": 0.6262626262626263, + "acc_stderr": 0.009927267058259628, + "acc_norm": 0.5917508417508418, + "acc_norm_stderr": 0.01008556619579125 + }, + "arc_challenge": { + "acc": 0.29266211604095566, + "acc_stderr": 0.013295916103619417, + "acc_norm": 0.32337883959044367, + "acc_norm_stderr": 0.013669421630012132 + }, + "sciq": { + "acc": 0.904, + "acc_stderr": 0.009320454434783227, + "acc_norm": 0.885, + "acc_norm_stderr": 0.01009340759490462 + }, + "piqa": { + "acc": 0.7622415669205659, + "acc_stderr": 0.009932525779525489, + "acc_norm": 0.763873775843308, + "acc_norm_stderr": 0.009908965890558218 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/4b284b28bc4_2.json b/4b284b28bc4/evaluation/4b284b28bc4_2.json new file mode 100644 index 0000000000000000000000000000000000000000..25d172096cee2a6033e16c51c520c23abfe04837 --- /dev/null +++ b/4b284b28bc4/evaluation/4b284b28bc4_2.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.325, + "acc_stderr": 0.014818724459095526 + }, + "anli_r2": { + "acc": 0.325, + "acc_stderr": 0.014818724459095526 + }, + "anli_r3": { + "acc": 0.3233333333333333, + "acc_stderr": 0.013508372867300217 + }, + "cb": { + "acc": 0.25, + "acc_stderr": 0.058387420812114225, + "f1": 0.22987012987012986 + }, + "copa": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256 + }, + "hellaswag": { + "acc": 0.47988448516231824, + "acc_stderr": 0.004985741706385727, + "acc_norm": 0.6363274248157738, + "acc_norm_stderr": 0.004800728138792371 + }, + "rte": { + "acc": 0.5631768953068592, + "acc_stderr": 0.02985524739031495 + }, + "winogrande": { + "acc": 0.5824782951854776, + "acc_stderr": 0.013859978264440248 + }, + "storycloze_2016": { + "acc": 0.7177979690005345, + "acc_stderr": 0.010407834479647673 + }, + "boolq": { + "acc": 0.627217125382263, + "acc_stderr": 0.008457255867914694 + }, + "arc_easy": { + "acc": 0.6308922558922558, + "acc_stderr": 0.009901987410242742, + "acc_norm": 0.6123737373737373, + "acc_norm_stderr": 0.009997307914447612 + }, + "arc_challenge": { + "acc": 0.30204778156996587, + "acc_stderr": 0.01341751914471642, + "acc_norm": 0.3216723549488055, + "acc_norm_stderr": 0.013650488084494162 + }, + "sciq": { + "acc": 0.914, + "acc_stderr": 0.008870325962594766, + "acc_norm": 0.883, + "acc_norm_stderr": 0.010169287802713329 + }, + "piqa": { + "acc": 0.7606093579978237, + "acc_stderr": 0.009955884250291681, + "acc_norm": 0.76550598476605, + "acc_norm_stderr": 0.009885203143240543 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/4b284b28bc4_2_lm-eval_global_step80108_2023-01-30-11-26-39_2shots_backup.json b/4b284b28bc4/evaluation/4b284b28bc4_2_lm-eval_global_step80108_2023-01-30-11-26-39_2shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..25d172096cee2a6033e16c51c520c23abfe04837 --- /dev/null +++ b/4b284b28bc4/evaluation/4b284b28bc4_2_lm-eval_global_step80108_2023-01-30-11-26-39_2shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.325, + "acc_stderr": 0.014818724459095526 + }, + "anli_r2": { + "acc": 0.325, + "acc_stderr": 0.014818724459095526 + }, + "anli_r3": { + "acc": 0.3233333333333333, + "acc_stderr": 0.013508372867300217 + }, + "cb": { + "acc": 0.25, + "acc_stderr": 0.058387420812114225, + "f1": 0.22987012987012986 + }, + "copa": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256 + }, + "hellaswag": { + "acc": 0.47988448516231824, + "acc_stderr": 0.004985741706385727, + "acc_norm": 0.6363274248157738, + "acc_norm_stderr": 0.004800728138792371 + }, + "rte": { + "acc": 0.5631768953068592, + "acc_stderr": 0.02985524739031495 + }, + "winogrande": { + "acc": 0.5824782951854776, + "acc_stderr": 0.013859978264440248 + }, + "storycloze_2016": { + "acc": 0.7177979690005345, + "acc_stderr": 0.010407834479647673 + }, + "boolq": { + "acc": 0.627217125382263, + "acc_stderr": 0.008457255867914694 + }, + "arc_easy": { + "acc": 0.6308922558922558, + "acc_stderr": 0.009901987410242742, + "acc_norm": 0.6123737373737373, + "acc_norm_stderr": 0.009997307914447612 + }, + "arc_challenge": { + "acc": 0.30204778156996587, + "acc_stderr": 0.01341751914471642, + "acc_norm": 0.3216723549488055, + "acc_norm_stderr": 0.013650488084494162 + }, + "sciq": { + "acc": 0.914, + "acc_stderr": 0.008870325962594766, + "acc_norm": 0.883, + "acc_norm_stderr": 0.010169287802713329 + }, + "piqa": { + "acc": 0.7606093579978237, + "acc_stderr": 0.009955884250291681, + "acc_norm": 0.76550598476605, + "acc_norm_stderr": 0.009885203143240543 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/4b284b28bc4_3.json b/4b284b28bc4/evaluation/4b284b28bc4_3.json new file mode 100644 index 0000000000000000000000000000000000000000..49db01008785eeb84ede436c087b77082fb99bee --- /dev/null +++ b/4b284b28bc4/evaluation/4b284b28bc4_3.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.325, + "acc_stderr": 0.014818724459095524 + }, + "anli_r2": { + "acc": 0.336, + "acc_stderr": 0.014944140233795021 + }, + "anli_r3": { + "acc": 0.3233333333333333, + "acc_stderr": 0.013508372867300212 + }, + "cb": { + "acc": 0.39285714285714285, + "acc_stderr": 0.0658538889806635, + "f1": 0.3565868967138097 + }, + "copa": { + "acc": 0.81, + "acc_stderr": 0.03942772444036623 + }, + "hellaswag": { + "acc": 0.4790878311093408, + "acc_stderr": 0.004985415250690914, + "acc_norm": 0.634833698466441, + "acc_norm_stderr": 0.004804927608773137 + }, + "rte": { + "acc": 0.6064981949458483, + "acc_stderr": 0.029405839314203194 + }, + "winogrande": { + "acc": 0.585635359116022, + "acc_stderr": 0.013844846232268563 + }, + "storycloze_2016": { + "acc": 0.7295563869588455, + "acc_stderr": 0.010271810373331027 + }, + "boolq": { + "acc": 0.6241590214067279, + "acc_stderr": 0.008471147248160107 + }, + "arc_easy": { + "acc": 0.6372053872053872, + "acc_stderr": 0.009865936757013942, + "acc_norm": 0.6186868686868687, + "acc_norm_stderr": 0.009966542497171021 + }, + "arc_challenge": { + "acc": 0.30119453924914674, + "acc_stderr": 0.013406741767847624, + "acc_norm": 0.32337883959044367, + "acc_norm_stderr": 0.01366942163001213 + }, + "sciq": { + "acc": 0.91, + "acc_stderr": 0.00905439020486644, + "acc_norm": 0.897, + "acc_norm_stderr": 0.009616833339695796 + }, + "piqa": { + "acc": 0.7540805223068553, + "acc_stderr": 0.01004733186562519, + "acc_norm": 0.7687704026115343, + "acc_norm_stderr": 0.009837063180625334 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/4b284b28bc4_3_lm-eval_global_step80108_2023-01-30-11-26-39_3shots_backup.json b/4b284b28bc4/evaluation/4b284b28bc4_3_lm-eval_global_step80108_2023-01-30-11-26-39_3shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..49db01008785eeb84ede436c087b77082fb99bee --- /dev/null +++ b/4b284b28bc4/evaluation/4b284b28bc4_3_lm-eval_global_step80108_2023-01-30-11-26-39_3shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.325, + "acc_stderr": 0.014818724459095524 + }, + "anli_r2": { + "acc": 0.336, + "acc_stderr": 0.014944140233795021 + }, + "anli_r3": { + "acc": 0.3233333333333333, + "acc_stderr": 0.013508372867300212 + }, + "cb": { + "acc": 0.39285714285714285, + "acc_stderr": 0.0658538889806635, + "f1": 0.3565868967138097 + }, + "copa": { + "acc": 0.81, + "acc_stderr": 0.03942772444036623 + }, + "hellaswag": { + "acc": 0.4790878311093408, + "acc_stderr": 0.004985415250690914, + "acc_norm": 0.634833698466441, + "acc_norm_stderr": 0.004804927608773137 + }, + "rte": { + "acc": 0.6064981949458483, + "acc_stderr": 0.029405839314203194 + }, + "winogrande": { + "acc": 0.585635359116022, + "acc_stderr": 0.013844846232268563 + }, + "storycloze_2016": { + "acc": 0.7295563869588455, + "acc_stderr": 0.010271810373331027 + }, + "boolq": { + "acc": 0.6241590214067279, + "acc_stderr": 0.008471147248160107 + }, + "arc_easy": { + "acc": 0.6372053872053872, + "acc_stderr": 0.009865936757013942, + "acc_norm": 0.6186868686868687, + "acc_norm_stderr": 0.009966542497171021 + }, + "arc_challenge": { + "acc": 0.30119453924914674, + "acc_stderr": 0.013406741767847624, + "acc_norm": 0.32337883959044367, + "acc_norm_stderr": 0.01366942163001213 + }, + "sciq": { + "acc": 0.91, + "acc_stderr": 0.00905439020486644, + "acc_norm": 0.897, + "acc_norm_stderr": 0.009616833339695796 + }, + "piqa": { + "acc": 0.7540805223068553, + "acc_stderr": 0.01004733186562519, + "acc_norm": 0.7687704026115343, + "acc_norm_stderr": 0.009837063180625334 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/4b284b28bc4_4.json b/4b284b28bc4/evaluation/4b284b28bc4_4.json new file mode 100644 index 0000000000000000000000000000000000000000..5a0d62eba9e09ae1ee6783fa2e6d5d765560dd65 --- /dev/null +++ b/4b284b28bc4/evaluation/4b284b28bc4_4.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.345, + "acc_stderr": 0.015039986742055235 + }, + "anli_r2": { + "acc": 0.325, + "acc_stderr": 0.014818724459095526 + }, + "anli_r3": { + "acc": 0.31416666666666665, + "acc_stderr": 0.013405399314984096 + }, + "cb": { + "acc": 0.39285714285714285, + "acc_stderr": 0.0658538889806635, + "f1": 0.3647495361781076 + }, + "copa": { + "acc": 0.82, + "acc_stderr": 0.038612291966536955 + }, + "hellaswag": { + "acc": 0.4819757020513842, + "acc_stderr": 0.004986538243846636, + "acc_norm": 0.6387173869747063, + "acc_norm_stderr": 0.004793904922401888 + }, + "rte": { + "acc": 0.48736462093862815, + "acc_stderr": 0.030086851767188564 + }, + "winogrande": { + "acc": 0.5832675611681136, + "acc_stderr": 0.013856250072796322 + }, + "storycloze_2016": { + "acc": 0.7386424371993586, + "acc_stderr": 0.010160471460690485 + }, + "boolq": { + "acc": 0.6275229357798165, + "acc_stderr": 0.008455846866956085 + }, + "arc_easy": { + "acc": 0.6405723905723906, + "acc_stderr": 0.009845958893373766, + "acc_norm": 0.6212121212121212, + "acc_norm_stderr": 0.00995373765654204 + }, + "arc_challenge": { + "acc": 0.30204778156996587, + "acc_stderr": 0.01341751914471642, + "acc_norm": 0.32764505119453924, + "acc_norm_stderr": 0.013715847940719344 + }, + "sciq": { + "acc": 0.92, + "acc_stderr": 0.008583336977753653, + "acc_norm": 0.907, + "acc_norm_stderr": 0.009188875634996702 + }, + "piqa": { + "acc": 0.7551686615886833, + "acc_stderr": 0.01003230910556879, + "acc_norm": 0.76550598476605, + "acc_norm_stderr": 0.00988520314324054 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/4b284b28bc4_4_lm-eval_global_step80108_2023-01-30-11-26-39_4shots_backup.json b/4b284b28bc4/evaluation/4b284b28bc4_4_lm-eval_global_step80108_2023-01-30-11-26-39_4shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..5a0d62eba9e09ae1ee6783fa2e6d5d765560dd65 --- /dev/null +++ b/4b284b28bc4/evaluation/4b284b28bc4_4_lm-eval_global_step80108_2023-01-30-11-26-39_4shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.345, + "acc_stderr": 0.015039986742055235 + }, + "anli_r2": { + "acc": 0.325, + "acc_stderr": 0.014818724459095526 + }, + "anli_r3": { + "acc": 0.31416666666666665, + "acc_stderr": 0.013405399314984096 + }, + "cb": { + "acc": 0.39285714285714285, + "acc_stderr": 0.0658538889806635, + "f1": 0.3647495361781076 + }, + "copa": { + "acc": 0.82, + "acc_stderr": 0.038612291966536955 + }, + "hellaswag": { + "acc": 0.4819757020513842, + "acc_stderr": 0.004986538243846636, + "acc_norm": 0.6387173869747063, + "acc_norm_stderr": 0.004793904922401888 + }, + "rte": { + "acc": 0.48736462093862815, + "acc_stderr": 0.030086851767188564 + }, + "winogrande": { + "acc": 0.5832675611681136, + "acc_stderr": 0.013856250072796322 + }, + "storycloze_2016": { + "acc": 0.7386424371993586, + "acc_stderr": 0.010160471460690485 + }, + "boolq": { + "acc": 0.6275229357798165, + "acc_stderr": 0.008455846866956085 + }, + "arc_easy": { + "acc": 0.6405723905723906, + "acc_stderr": 0.009845958893373766, + "acc_norm": 0.6212121212121212, + "acc_norm_stderr": 0.00995373765654204 + }, + "arc_challenge": { + "acc": 0.30204778156996587, + "acc_stderr": 0.01341751914471642, + "acc_norm": 0.32764505119453924, + "acc_norm_stderr": 0.013715847940719344 + }, + "sciq": { + "acc": 0.92, + "acc_stderr": 0.008583336977753653, + "acc_norm": 0.907, + "acc_norm_stderr": 0.009188875634996702 + }, + "piqa": { + "acc": 0.7551686615886833, + "acc_stderr": 0.01003230910556879, + "acc_norm": 0.76550598476605, + "acc_norm_stderr": 0.00988520314324054 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/4b284b28bc4_5.json b/4b284b28bc4/evaluation/4b284b28bc4_5.json new file mode 100644 index 0000000000000000000000000000000000000000..63413370f8d5a712578417c5c52021fcc4999356 --- /dev/null +++ b/4b284b28bc4/evaluation/4b284b28bc4_5.json @@ -0,0 +1,59 @@ +{ + "results": { + "anli_r1": { + "acc": 0.332, + "acc_stderr": 0.014899597242811475 + }, + "anli_r2": { + "acc": 0.316, + "acc_stderr": 0.014709193056057106 + }, + "anli_r3": { + "acc": 0.31666666666666665, + "acc_stderr": 0.013434078660827384 + }, + "cb": { + "acc": 0.30357142857142855, + "acc_stderr": 0.06199938655510754, + "f1": 0.2503507986266607 + }, + "copa": { + "acc": 0.8, + "acc_stderr": 0.040201512610368445 + }, + "hellaswag": { + "acc": 0.4788886675960964, + "acc_stderr": 0.004985331652408345, + "acc_norm": 0.6412069308902609, + "acc_norm_stderr": 0.004786660691181937 + }, + "rte": { + "acc": 0.5740072202166066, + "acc_stderr": 0.02976495674177765 + }, + "winogrande": { + "acc": 0.5911602209944752, + "acc_stderr": 0.013816954295135684 + }, + "storycloze_2016": { + "acc": 0.7279529663281668, + "acc_stderr": 0.010290888060871242 + }, + "boolq": { + "acc": 0.6275229357798165, + "acc_stderr": 0.008455846866956086 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/4b284b28bc4_5_lm-eval_global_step80108_2023-01-30-11-26-39_5shots_backup.json b/4b284b28bc4/evaluation/4b284b28bc4_5_lm-eval_global_step80108_2023-01-30-11-26-39_5shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..63413370f8d5a712578417c5c52021fcc4999356 --- /dev/null +++ b/4b284b28bc4/evaluation/4b284b28bc4_5_lm-eval_global_step80108_2023-01-30-11-26-39_5shots_backup.json @@ -0,0 +1,59 @@ +{ + "results": { + "anli_r1": { + "acc": 0.332, + "acc_stderr": 0.014899597242811475 + }, + "anli_r2": { + "acc": 0.316, + "acc_stderr": 0.014709193056057106 + }, + "anli_r3": { + "acc": 0.31666666666666665, + "acc_stderr": 0.013434078660827384 + }, + "cb": { + "acc": 0.30357142857142855, + "acc_stderr": 0.06199938655510754, + "f1": 0.2503507986266607 + }, + "copa": { + "acc": 0.8, + "acc_stderr": 0.040201512610368445 + }, + "hellaswag": { + "acc": 0.4788886675960964, + "acc_stderr": 0.004985331652408345, + "acc_norm": 0.6412069308902609, + "acc_norm_stderr": 0.004786660691181937 + }, + "rte": { + "acc": 0.5740072202166066, + "acc_stderr": 0.02976495674177765 + }, + "winogrande": { + "acc": 0.5911602209944752, + "acc_stderr": 0.013816954295135684 + }, + "storycloze_2016": { + "acc": 0.7279529663281668, + "acc_stderr": 0.010290888060871242 + }, + "boolq": { + "acc": 0.6275229357798165, + "acc_stderr": 0.008455846866956086 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..a02cdeb2d0fb2e83cfec200ea2a861e16a83658d --- /dev/null +++ b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.2871925225988394, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.02880794237734816 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07031750338322859, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015362201736309874 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3012155634284117, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004546846231718025 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.10707093959955763, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019099253416430774 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03252619427180376, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009041180535715348 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.14439430798437106, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0030595449553106713 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.049917192299013896, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012014538250113653 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06757287768017098, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001414062975537199 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.2931917692240097, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0044525581749204875 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10327080072990603, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017814345648663893 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.06720725044881726, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014407633141713585 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.2882066628071488, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0042625595600572705 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10238598559666155, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.001793711398293261 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..49c23c28bb02078cc90f234ddd7a4b49c80e6ad1 --- /dev/null +++ b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.4674006237665374, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03675015156688127 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07708396436923028, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001529013971135644 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.36235242098066, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00504305962540817 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11843411039548267, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001895876971489225 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.035725346847754684, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008827277116185362 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.17866943863024684, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003599106083841207 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05553061893758205, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012200850248834274 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07294152863590639, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014064426998360536 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3437168036630356, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004711682138202734 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.11228525846628372, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017420331023114827 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07337404630465882, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014496612765965426 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3436003452114997, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004616609203187745 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.11267135213673385, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017806522868172628 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..63106064157d5ac66ca23beaaf4e88de18979ac2 --- /dev/null +++ b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.5250778439407279, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.025625496064299234 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.0786831611862841, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0014969718758498687 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3956998500897082, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005131459404981971 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.1222441134481138, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019020155784340502 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03682232020364324, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009265440871287532 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.19779728011829195, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0038163400420330356 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.057331612844470456, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001226447389142751 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07302274174131225, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001338647584797816 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.36618568072469376, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004622159858156629 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.11353522248821303, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001703018837429438 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07488527729009754, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001411146414679001 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3754608054763451, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004798151749531035 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.11628410210009521, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017922853862293046 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..07e9956ca7a2cb31170aa39b45a144fe6a0e2847 --- /dev/null +++ b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.6240971401779115, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03840020245332954 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07946921950052364, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015734983149428565 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.41443709705557313, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0052063482293464285 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.12413425365513392, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018801730074252724 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03727374629297188, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0010068745541143666 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.20933728572588564, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0039729036741121115 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05836966723015618, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012404189838209753 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07340984582992345, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014110667213403662 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3803567082991153, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004624202286888448 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.11463438747216832, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016925693943895671 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07570628299965847, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0015124490142500056 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.39213432148715394, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004828378800565317 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.11802649327609507, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017907209912884955 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..fd84e71fa6d914f037f3487bf93a3f1668708829 --- /dev/null +++ b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.6460958847523566, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03922360003785139 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07760008602513783, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001320690166501572 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.4207626130570328, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0051343858175420766 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.12269659017119199, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0017597424238451932 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03635274690781486, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008330185169416092 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.2129557453468375, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003851289229933982 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.0577700863367864, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001158809400479912 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07107029230772488, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001171387599016451 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.38200909854824977, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004464667845112728 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.11231459598044416, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015694568258752962 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07356474812940257, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0012438027129333135 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.39729543398826606, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004755911479219925 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.11625392090888889, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016649015391818642 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2ec52c119850d2885b1115f3a8d8c4b9892b4949 --- /dev/null +++ b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.7283147266727299, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03506630990313516 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.0800529618846568, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0014030415087213262 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.4360056147654019, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005237161872126708 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.12670702220068714, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018353031821599214 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.037349148159645094, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000853308549188945 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.2217598521275178, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003994885901439153 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05951196634046783, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011799130523628795 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07255538123365188, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012004887370386942 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.39357574348180674, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004547413362244599 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.11485528722019342, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001589365738462336 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07578798351325475, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0012955589973183616 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.4104972172441367, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0048123434799543785 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.11989099231027077, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017190774588667075 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..544a4023081840913b672133e8fbbd03bee891a9 --- /dev/null +++ b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.15222794945796614, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001854391491208858 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.259279440153063, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0027052349535794085 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.17809655126532312, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018236388559653763 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.030497998250497895, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007413435249853986 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.05425261740808977, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001399269212332445 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.03601951697280678, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008300101222283796 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.11697977840134756, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012794261124303204 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.2062333696623697, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0021597198817625285 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.1384741786574652, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0012894950527783321 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.1389877568936686, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0016763555392295747 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.23808538118300834, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0025032807703488534 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.1628530750095549, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016530415994811475 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.6192080325529026, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04050520870343064 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..82d4a736d1305850870d6ee981a1fc707d75ff8e --- /dev/null +++ b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.17909391212556103, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0021684904700656395 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.2986428547454232, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002901154619840759 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.20484942755932814, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001960405666708866 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.04098672752953724, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009402949935154375 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.07250823843480829, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00171407343966866 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.04757609861819433, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0009961877982725383 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.12586154047693615, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001447455908294373 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.21783174447787615, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002284390945146576 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.14525233898326748, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001303267897245817 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.16826083836269604, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0020324268022024173 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.28157870760475356, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002752878166728396 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.19265194446101522, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018360173432108782 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.545643626821724, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.048869510836124584 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..4e1d923facc8e12c1c46d014d05f1ebf67029d08 --- /dev/null +++ b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.2332474242904849, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0031768172038711467 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.29410268890498986, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002958828414438297 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.21836409386191818, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001989518194370473 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.06566711930000235, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0019413943466877388 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.07670871046547496, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016787739182338268 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.05650249608530642, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011487340298214372 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.1752085713341631, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0026584779502129303 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.21819240455646285, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002276168106614223 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.1603606067900483, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014157142739287894 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.22112816157170134, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0030769042746772593 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.2776979400072693, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.002813335835445409 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.20617794726883687, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018867289872942854 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.123719853905042, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09435758546693512 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..24a0440a29d3b7011aa3dfade5bbbc3360aa00a4 --- /dev/null +++ b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.22344987630747953, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0038946320752308397 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.228687908011972, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0033634865916312945 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.1819863863181494, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0024282963824572346 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.06821362116644611, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0023091245896002433 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.06243742554002245, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.001665626169617889 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.05005886014366939, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012507484027525577 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.17380086539073145, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.003294779021368952 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.1728818390227638, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0026287282834976822 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.13680266609806765, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001821525871477116 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.21200019749157653, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.003735810714751958 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.21587781354640423, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003186682013692883 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.17195845477450572, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023005495305984053 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.5006949715383118, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.08071997944617754 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..b55697c96f61323636f982ca24b521715e03d09c --- /dev/null +++ b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.06907034391802865, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002842925157819051 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.07641790711530233, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0028071608967584296 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.05800966504497334, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002055695120200674 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.020323118606363738, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0013765538659170954 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.020689214217856734, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0011098261782535187 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.015594437236270214, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008033551932234715 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.05326819078634553, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002300594360643888 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.05823666899257084, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0021806250615139192 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.04363217232305618, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015475509311173715 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.06512227133445396, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002711627122094951 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.07184593801381658, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0026558284841563095 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.05441077080277394, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019346325978603714 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.4203638205606742, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04062156232814086 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..923867ce2dbfa74c3afbdc5413760609875713e8 --- /dev/null +++ b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.011141473368752484, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0012984382630292146 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.011356368661606248, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.001166446625113568 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.008814606254748622, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0008820141996097188 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.0036979579695363266, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00069734856115032 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0030116660387216835, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00039145176963269976 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0024833328621297794, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0003381158590391299 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.008749076310088717, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0010822814361618575 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.008702418570223684, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0008937526565969341 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.0067361341505456805, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0006778741138953477 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.010561687151292019, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0012526496630397305 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.010693942253000091, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.001098581608402249 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.008282398875850052, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0008304343325046589 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.610620675193976e-08, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 6.336079313722214e-08 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..bf4cdde76e1054220e9f8fea9052d90ababaf23b --- /dev/null +++ b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 0.8175837539659996, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.06903661804515053 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.067410137720777, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0015267778764287776 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.11905569111444682, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002272837057404917 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.07913875454413057, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0015242869094641899 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.00855571479752271, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.000610070681102783 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.01452701665726237, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0008619364610158222 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.010022915068112901, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0005999619167956302 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.06588530481318243, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.001456682483397516 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.11730596366118605, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0022143035280371703 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.07774011765647057, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0014739968390031794 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.053281568752607505, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0013584259494467741 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.0896663423829268, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0017758029746831937 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.06060524038626222, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0012344605903209444 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..7f1d2848222c62dd4757ddc80cec70c0e7f6a883 --- /dev/null +++ b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 11.360023014480657, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.14523312599938573 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5368002507342822, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003197714150333534 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4204347436029918, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029932330301144583 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4457059970163705, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023475038584115206 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.25010730577128015, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0025416736419787912 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.19320140085672308, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021059841419488645 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.20489136085595536, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0019458972766888125 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.390181985500488, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002842772643373759 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.30202346410754144, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002382966736707436 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.321161092281206, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001995161634007105 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4390382418075218, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003082038398447118 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.34172069570320107, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0026874775375555484 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.3629740875330838, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00225666081685001 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..d688d2017c5466f667feb29de1b92e60bd0bde12 --- /dev/null +++ b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 13.284050272128148, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1770460818661209 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5714641277328295, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032799480051059426 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4468451656698964, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029010255062063995 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.47571760194079554, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002265550724177173 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.28407643182119585, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002743346128512875 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.21834852344926514, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021773726271045162 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2325284196471626, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0020182406639818696 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4201198181536731, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002974299270315645 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.32583054256145155, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002409815129643757 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3475057918065784, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0020325138450125527 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4715445246123803, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031935178217488633 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.36737757013379746, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002675859339354407 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.391612619922272, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002268675240750965 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..a2d240f85885197a72f00dbc3e1dc75d6f2d73de --- /dev/null +++ b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.26351682307982, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.16518466746497154 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5781766953393467, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003232493897043414 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.45846733751843627, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002862839752183954 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4869136869814224, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002252377471622215 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2936437583754312, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002766357156065721 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2296605325605603, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002239048819483412 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.24388713793667496, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002081168246828599 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.42323041568862063, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00299893377825116 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3336509571396467, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002463831973937884 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.35476450327695513, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002131136688683226 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.47879240286661645, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0032211942725843233 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3788603945042402, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.00272506620717064 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.40259898572566205, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023428037926685654 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..f551805844e34b1e07ab4049e2ab76e2e8ccfc7d --- /dev/null +++ b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.568456206535862, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.22714934644025087 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5824109780846385, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032128656177573896 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.46303711017335764, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028550395782162385 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.49248119559722, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002261462995557444 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2975497604665679, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002721358209608908 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2337393664335435, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022535476965624083 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.24852828649672148, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0020894492308823178 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4258973373500384, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029570811260566745 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.33738654578439453, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002480938975354226 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.35908240651080353, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021535460141111546 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.48186572666282207, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031731113183941597 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3829876385027774, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027332507970680904 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.4073362598730636, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002350552518906019 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c4105fd18633c49f0cc7b71f3924ed30452733b8 --- /dev/null +++ b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.41705650902542, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.25051935957566557 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5768002584328835, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.003210936196026848 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4663927464698937, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002827189059265761 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4912358527300658, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002175083911938509 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2929623409468226, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0026747147184981385 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2340255710785021, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002212699322566578 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.24634621400768708, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002025833005255322 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.41839619423807317, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0028660522019280242 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3383776885015685, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024683861897523925 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.35597711356903294, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002065825901549598 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.47696170422006223, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003098280726576402 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3864090320075692, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027224907209281047 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.4066143622258549, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002273545252564866 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_gem_xsum_article_DOC_summary_0.json b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..104c97b38d065588316e54576125d512728d685e --- /dev/null +++ b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.17908873736938333, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0025998582918028913 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.3401581574130041, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00466124187806079 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.22142206258867178, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002657386115409225 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.04001978076164408, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00143853172997629 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.08351989855551167, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002908378017045409 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0511787638415587, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0017012685833384458 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.13280826032727655, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0019720134198053826 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.2547013277658435, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0036980006484927316 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.16457376114227693, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002030624196291875 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.13820058290248866, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0020236107693674434 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.267820674383363, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.00406330233863675 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.1721074292499855, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002183282479578269 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.242352554457821, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06303290180001715 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_gem_xsum_article_DOC_summary_1.json b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..d8e37fcafacf7bef1d44114e384bda9403b15ac2 --- /dev/null +++ b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.14242240386934715, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018474131654211898 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.34972417416826374, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00425304183688617 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.19996151191164502, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0024642136191649943 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.03180784324852304, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010605401760687887 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.08122973807701163, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0027530425274732025 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04515071736102295, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0014958437813087835 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.10785508103994479, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001342675196432051 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.26778299938242467, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0033228224655886393 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.15185456372717573, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0018177177985044526 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.11366701902108305, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0014987342312601044 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.28182954712893066, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0036825745017303583 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.16002746501050483, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002040753526176795 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.8494036251126653, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06618274112679547 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_gem_xsum_article_DOC_summary_2.json b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..0eba155cb790e4d91f53d19f13efb1cf9337f415 --- /dev/null +++ b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.14299555153351842, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0019414837646538163 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.35054680023331225, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004405959066068025 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.20066139374359412, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002583892537384957 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.03372540037348309, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011433968826317544 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.08536452458478411, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002896468085770253 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.047730927310845786, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0016012182287873963 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.1096711314096099, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014240979490601532 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.2713048964277041, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003435168695225616 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.15423466735433622, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0019141237080501433 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.11432354351222646, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0016135722781519138 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.282399261929163, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0038458320187164767 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.16073844532804232, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0021770573542344426 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.9553979785993556, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.08114119801334493 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_gem_xsum_article_DOC_summary_3.json b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7c6852c10a3e9933a50de1316b24487e0e8c42b0 --- /dev/null +++ b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.14095312565957024, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0021252189798371952 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.3355112558235932, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00482038827448099 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.19424816162568462, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002758326610864678 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.03309503575773881, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011583105646001772 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.08361606756802129, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0030274265177690917 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04656621187743751, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001620760377451161 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.10750740692234483, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001617470504570499 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.25822344513710505, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0038342113677037708 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.14844786782549954, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002106590246901777 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.11272654966890937, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0017739663038892394 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.2704314687293562, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0041899251242752955 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.15564298400864224, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002325750760252049 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.9899134724216982, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.08189694158167818 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_gem_xsum_article_DOC_summary_4.json b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..0eba562ddc392772bb4e39debfb4656f1f790041 --- /dev/null +++ b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.04182867272383347, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002516912133041032 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.08068317129523372, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004630110172525636 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.05061277872693584, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0028337392524855837 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.00821242826897234, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.000780085704496626 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.018229654639556718, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0016649615737807282 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.010817994039374855, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0009898135477985153 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.03237406484505732, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0020316037823294927 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.06169464366711305, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035477374875163793 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.038452182498697945, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002124389719631779 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.03434532775935572, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0021605467450954603 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.06559190633881043, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0038270052788935026 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.040956310132902136, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0023102690323821627 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.851414956048288, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.10142038717157466 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_gem_xsum_article_DOC_summary_5.json b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e7e59cd1fb12b966e7a1b35d969e4fc2db9c8379 --- /dev/null +++ b/4b284b28bc4/evaluation/generation/slim.4b284b28bc4_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.0021529907822185413, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0006324831746531657 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.001764574432238271, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0005172389485732619 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.0019051009413407058, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0005593343846105789 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0001457415441877151, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 8.410282821934284e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.00011802662746058974, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 6.837858900511585e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0001299594149643802, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 7.503370260000825e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0017522974182971845, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.000503946684193751 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0014399120468157657, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0004056116387189239 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.0015504765957310557, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0004402385576724586 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.0018058994766162238, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0005178578773672206 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.001474217364139951, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0004127036649374616 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0015923123485654278, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00044995087651658854 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.3013943780107486e-40, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 6.513754776072693e-35 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b28bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/4b284b42bc4_0.json b/4b284b42bc4/evaluation/4b284b42bc4_0.json new file mode 100644 index 0000000000000000000000000000000000000000..3296a9419420e5ec52b95fea7b62c31a9f88794e --- /dev/null +++ b/4b284b42bc4/evaluation/4b284b42bc4_0.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.331, + "acc_stderr": 0.014888272588203931 + }, + "anli_r2": { + "acc": 0.342, + "acc_stderr": 0.01500870618212173 + }, + "anli_r3": { + "acc": 0.34, + "acc_stderr": 0.013680495725767784 + }, + "cb": { + "acc": 0.5357142857142857, + "acc_stderr": 0.06724777654937658, + "f1": 0.45393112410656267 + }, + "copa": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446 + }, + "hellaswag": { + "acc": 0.4833698466440948, + "acc_stderr": 0.004987020679861267, + "acc_norm": 0.63433578968333, + "acc_norm_stderr": 0.004806316342709393 + }, + "rte": { + "acc": 0.5776173285198556, + "acc_stderr": 0.029731622646495887 + }, + "winogrande": { + "acc": 0.5864246250986582, + "acc_stderr": 0.013840971763195303 + }, + "storycloze_2016": { + "acc": 0.7204703367183325, + "acc_stderr": 0.01037770209970486 + }, + "boolq": { + "acc": 0.5253822629969419, + "acc_stderr": 0.0087337795418535 + }, + "arc_easy": { + "acc": 0.6224747474747475, + "acc_stderr": 0.00994722783346943, + "acc_norm": 0.5462962962962963, + "acc_norm_stderr": 0.010215708295494117 + }, + "arc_challenge": { + "acc": 0.27986348122866894, + "acc_stderr": 0.013119040897725922, + "acc_norm": 0.29266211604095566, + "acc_norm_stderr": 0.01329591610361942 + }, + "sciq": { + "acc": 0.837, + "acc_stderr": 0.011686212712746849, + "acc_norm": 0.757, + "acc_norm_stderr": 0.013569640199177458 + }, + "piqa": { + "acc": 0.7448313384113167, + "acc_stderr": 0.010171571592521822, + "acc_norm": 0.76550598476605, + "acc_norm_stderr": 0.00988520314324054 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/4b284b42bc4_0_lm-eval_global_step80108_2023-01-30-11-26-38_0shots_backup.json b/4b284b42bc4/evaluation/4b284b42bc4_0_lm-eval_global_step80108_2023-01-30-11-26-38_0shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..3296a9419420e5ec52b95fea7b62c31a9f88794e --- /dev/null +++ b/4b284b42bc4/evaluation/4b284b42bc4_0_lm-eval_global_step80108_2023-01-30-11-26-38_0shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.331, + "acc_stderr": 0.014888272588203931 + }, + "anli_r2": { + "acc": 0.342, + "acc_stderr": 0.01500870618212173 + }, + "anli_r3": { + "acc": 0.34, + "acc_stderr": 0.013680495725767784 + }, + "cb": { + "acc": 0.5357142857142857, + "acc_stderr": 0.06724777654937658, + "f1": 0.45393112410656267 + }, + "copa": { + "acc": 0.75, + "acc_stderr": 0.04351941398892446 + }, + "hellaswag": { + "acc": 0.4833698466440948, + "acc_stderr": 0.004987020679861267, + "acc_norm": 0.63433578968333, + "acc_norm_stderr": 0.004806316342709393 + }, + "rte": { + "acc": 0.5776173285198556, + "acc_stderr": 0.029731622646495887 + }, + "winogrande": { + "acc": 0.5864246250986582, + "acc_stderr": 0.013840971763195303 + }, + "storycloze_2016": { + "acc": 0.7204703367183325, + "acc_stderr": 0.01037770209970486 + }, + "boolq": { + "acc": 0.5253822629969419, + "acc_stderr": 0.0087337795418535 + }, + "arc_easy": { + "acc": 0.6224747474747475, + "acc_stderr": 0.00994722783346943, + "acc_norm": 0.5462962962962963, + "acc_norm_stderr": 0.010215708295494117 + }, + "arc_challenge": { + "acc": 0.27986348122866894, + "acc_stderr": 0.013119040897725922, + "acc_norm": 0.29266211604095566, + "acc_norm_stderr": 0.01329591610361942 + }, + "sciq": { + "acc": 0.837, + "acc_stderr": 0.011686212712746849, + "acc_norm": 0.757, + "acc_norm_stderr": 0.013569640199177458 + }, + "piqa": { + "acc": 0.7448313384113167, + "acc_stderr": 0.010171571592521822, + "acc_norm": 0.76550598476605, + "acc_norm_stderr": 0.00988520314324054 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/4b284b42bc4_1.json b/4b284b42bc4/evaluation/4b284b42bc4_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cf339e43bef0da4ef04f37f057d335bb54e5bfc7 --- /dev/null +++ b/4b284b42bc4/evaluation/4b284b42bc4_1.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.31, + "acc_stderr": 0.014632638658632902 + }, + "anli_r2": { + "acc": 0.31, + "acc_stderr": 0.014632638658632905 + }, + "anli_r3": { + "acc": 0.3283333333333333, + "acc_stderr": 0.013562032919529017 + }, + "cb": { + "acc": 0.3392857142857143, + "acc_stderr": 0.06384226561930825, + "f1": 0.29749748849204566 + }, + "copa": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256 + }, + "hellaswag": { + "acc": 0.4803823939454292, + "acc_stderr": 0.004985939292819582, + "acc_norm": 0.6294562836088429, + "acc_norm_stderr": 0.004819633668832538 + }, + "rte": { + "acc": 0.44765342960288806, + "acc_stderr": 0.02993107036293953 + }, + "winogrande": { + "acc": 0.5887924230465666, + "acc_stderr": 0.013829128358676874 + }, + "storycloze_2016": { + "acc": 0.7049706039551042, + "acc_stderr": 0.010546232606962289 + }, + "boolq": { + "acc": 0.5522935779816514, + "acc_stderr": 0.008697094687974059 + }, + "arc_easy": { + "acc": 0.6262626262626263, + "acc_stderr": 0.009927267058259621, + "acc_norm": 0.5934343434343434, + "acc_norm_stderr": 0.010079056419223527 + }, + "arc_challenge": { + "acc": 0.2883959044368601, + "acc_stderr": 0.013238394422428173, + "acc_norm": 0.3148464163822526, + "acc_norm_stderr": 0.01357265770308495 + }, + "sciq": { + "acc": 0.892, + "acc_stderr": 0.0098200016513457, + "acc_norm": 0.869, + "acc_norm_stderr": 0.010674874844837954 + }, + "piqa": { + "acc": 0.7486398258977149, + "acc_stderr": 0.010121156016819259, + "acc_norm": 0.7633297062023939, + "acc_norm_stderr": 0.009916841655042809 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/4b284b42bc4_1_lm-eval_global_step80108_2023-01-30-11-26-38_1shots_backup.json b/4b284b42bc4/evaluation/4b284b42bc4_1_lm-eval_global_step80108_2023-01-30-11-26-38_1shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..cf339e43bef0da4ef04f37f057d335bb54e5bfc7 --- /dev/null +++ b/4b284b42bc4/evaluation/4b284b42bc4_1_lm-eval_global_step80108_2023-01-30-11-26-38_1shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.31, + "acc_stderr": 0.014632638658632902 + }, + "anli_r2": { + "acc": 0.31, + "acc_stderr": 0.014632638658632905 + }, + "anli_r3": { + "acc": 0.3283333333333333, + "acc_stderr": 0.013562032919529017 + }, + "cb": { + "acc": 0.3392857142857143, + "acc_stderr": 0.06384226561930825, + "f1": 0.29749748849204566 + }, + "copa": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256 + }, + "hellaswag": { + "acc": 0.4803823939454292, + "acc_stderr": 0.004985939292819582, + "acc_norm": 0.6294562836088429, + "acc_norm_stderr": 0.004819633668832538 + }, + "rte": { + "acc": 0.44765342960288806, + "acc_stderr": 0.02993107036293953 + }, + "winogrande": { + "acc": 0.5887924230465666, + "acc_stderr": 0.013829128358676874 + }, + "storycloze_2016": { + "acc": 0.7049706039551042, + "acc_stderr": 0.010546232606962289 + }, + "boolq": { + "acc": 0.5522935779816514, + "acc_stderr": 0.008697094687974059 + }, + "arc_easy": { + "acc": 0.6262626262626263, + "acc_stderr": 0.009927267058259621, + "acc_norm": 0.5934343434343434, + "acc_norm_stderr": 0.010079056419223527 + }, + "arc_challenge": { + "acc": 0.2883959044368601, + "acc_stderr": 0.013238394422428173, + "acc_norm": 0.3148464163822526, + "acc_norm_stderr": 0.01357265770308495 + }, + "sciq": { + "acc": 0.892, + "acc_stderr": 0.0098200016513457, + "acc_norm": 0.869, + "acc_norm_stderr": 0.010674874844837954 + }, + "piqa": { + "acc": 0.7486398258977149, + "acc_stderr": 0.010121156016819259, + "acc_norm": 0.7633297062023939, + "acc_norm_stderr": 0.009916841655042809 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/4b284b42bc4_2.json b/4b284b42bc4/evaluation/4b284b42bc4_2.json new file mode 100644 index 0000000000000000000000000000000000000000..22cf079e8ba914e5c154466c57a2d14c743145c3 --- /dev/null +++ b/4b284b42bc4/evaluation/4b284b42bc4_2.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.306, + "acc_stderr": 0.014580006055436969 + }, + "anli_r2": { + "acc": 0.33, + "acc_stderr": 0.014876872027456734 + }, + "anli_r3": { + "acc": 0.3308333333333333, + "acc_stderr": 0.013588208070709007 + }, + "cb": { + "acc": 0.25, + "acc_stderr": 0.058387420812114225, + "f1": 0.2376010151606224 + }, + "copa": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256 + }, + "hellaswag": { + "acc": 0.47679745070703045, + "acc_stderr": 0.004984405935541087, + "acc_norm": 0.6308504282015535, + "acc_norm_stderr": 0.004815882719278393 + }, + "rte": { + "acc": 0.48736462093862815, + "acc_stderr": 0.030086851767188564 + }, + "winogrande": { + "acc": 0.6101026045777427, + "acc_stderr": 0.013707547317008462 + }, + "storycloze_2016": { + "acc": 0.7199358631747729, + "acc_stderr": 0.01038376499392048 + }, + "boolq": { + "acc": 0.5889908256880734, + "acc_stderr": 0.008605429733982185 + }, + "arc_easy": { + "acc": 0.6388888888888888, + "acc_stderr": 0.00985601342581124, + "acc_norm": 0.6182659932659933, + "acc_norm_stderr": 0.009968648851839672 + }, + "arc_challenge": { + "acc": 0.29948805460750855, + "acc_stderr": 0.013385021637313565, + "acc_norm": 0.3148464163822526, + "acc_norm_stderr": 0.01357265770308495 + }, + "sciq": { + "acc": 0.902, + "acc_stderr": 0.009406619184621238, + "acc_norm": 0.89, + "acc_norm_stderr": 0.009899393819724444 + }, + "piqa": { + "acc": 0.750272034820457, + "acc_stderr": 0.010099232969867488, + "acc_norm": 0.763873775843308, + "acc_norm_stderr": 0.009908965890558218 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/4b284b42bc4_2_lm-eval_global_step80108_2023-01-30-11-26-38_2shots_backup.json b/4b284b42bc4/evaluation/4b284b42bc4_2_lm-eval_global_step80108_2023-01-30-11-26-38_2shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..22cf079e8ba914e5c154466c57a2d14c743145c3 --- /dev/null +++ b/4b284b42bc4/evaluation/4b284b42bc4_2_lm-eval_global_step80108_2023-01-30-11-26-38_2shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.306, + "acc_stderr": 0.014580006055436969 + }, + "anli_r2": { + "acc": 0.33, + "acc_stderr": 0.014876872027456734 + }, + "anli_r3": { + "acc": 0.3308333333333333, + "acc_stderr": 0.013588208070709007 + }, + "cb": { + "acc": 0.25, + "acc_stderr": 0.058387420812114225, + "f1": 0.2376010151606224 + }, + "copa": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256 + }, + "hellaswag": { + "acc": 0.47679745070703045, + "acc_stderr": 0.004984405935541087, + "acc_norm": 0.6308504282015535, + "acc_norm_stderr": 0.004815882719278393 + }, + "rte": { + "acc": 0.48736462093862815, + "acc_stderr": 0.030086851767188564 + }, + "winogrande": { + "acc": 0.6101026045777427, + "acc_stderr": 0.013707547317008462 + }, + "storycloze_2016": { + "acc": 0.7199358631747729, + "acc_stderr": 0.01038376499392048 + }, + "boolq": { + "acc": 0.5889908256880734, + "acc_stderr": 0.008605429733982185 + }, + "arc_easy": { + "acc": 0.6388888888888888, + "acc_stderr": 0.00985601342581124, + "acc_norm": 0.6182659932659933, + "acc_norm_stderr": 0.009968648851839672 + }, + "arc_challenge": { + "acc": 0.29948805460750855, + "acc_stderr": 0.013385021637313565, + "acc_norm": 0.3148464163822526, + "acc_norm_stderr": 0.01357265770308495 + }, + "sciq": { + "acc": 0.902, + "acc_stderr": 0.009406619184621238, + "acc_norm": 0.89, + "acc_norm_stderr": 0.009899393819724444 + }, + "piqa": { + "acc": 0.750272034820457, + "acc_stderr": 0.010099232969867488, + "acc_norm": 0.763873775843308, + "acc_norm_stderr": 0.009908965890558218 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/4b284b42bc4_3.json b/4b284b42bc4/evaluation/4b284b42bc4_3.json new file mode 100644 index 0000000000000000000000000000000000000000..61b8c23bda963770070ae812b469efca25c03862 --- /dev/null +++ b/4b284b42bc4/evaluation/4b284b42bc4_3.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.314, + "acc_stderr": 0.014683991951087962 + }, + "anli_r2": { + "acc": 0.342, + "acc_stderr": 0.015008706182121734 + }, + "anli_r3": { + "acc": 0.32416666666666666, + "acc_stderr": 0.013517438120881636 + }, + "cb": { + "acc": 0.4107142857142857, + "acc_stderr": 0.0663363415035954, + "f1": 0.37437732746529967 + }, + "copa": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256 + }, + "hellaswag": { + "acc": 0.4823740290778729, + "acc_stderr": 0.004986680048438317, + "acc_norm": 0.6320454092810197, + "acc_norm_stderr": 0.004812633280078256 + }, + "rte": { + "acc": 0.5306859205776173, + "acc_stderr": 0.030039730592197812 + }, + "winogrande": { + "acc": 0.5887924230465666, + "acc_stderr": 0.013829128358676878 + }, + "storycloze_2016": { + "acc": 0.7215392838054516, + "acc_stderr": 0.010365521460604417 + }, + "boolq": { + "acc": 0.599388379204893, + "acc_stderr": 0.008570545612096372 + }, + "arc_easy": { + "acc": 0.6342592592592593, + "acc_stderr": 0.00988298806941883, + "acc_norm": 0.6212121212121212, + "acc_norm_stderr": 0.00995373765654204 + }, + "arc_challenge": { + "acc": 0.29180887372013653, + "acc_stderr": 0.013284525292403503, + "acc_norm": 0.3046075085324232, + "acc_norm_stderr": 0.01344952210993249 + }, + "sciq": { + "acc": 0.917, + "acc_stderr": 0.00872852720607479, + "acc_norm": 0.902, + "acc_norm_stderr": 0.009406619184621236 + }, + "piqa": { + "acc": 0.7600652883569097, + "acc_stderr": 0.009963625892809544, + "acc_norm": 0.7633297062023939, + "acc_norm_stderr": 0.009916841655042809 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/4b284b42bc4_3_lm-eval_global_step80108_2023-01-30-11-26-38_3shots_backup.json b/4b284b42bc4/evaluation/4b284b42bc4_3_lm-eval_global_step80108_2023-01-30-11-26-38_3shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..61b8c23bda963770070ae812b469efca25c03862 --- /dev/null +++ b/4b284b42bc4/evaluation/4b284b42bc4_3_lm-eval_global_step80108_2023-01-30-11-26-38_3shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.314, + "acc_stderr": 0.014683991951087962 + }, + "anli_r2": { + "acc": 0.342, + "acc_stderr": 0.015008706182121734 + }, + "anli_r3": { + "acc": 0.32416666666666666, + "acc_stderr": 0.013517438120881636 + }, + "cb": { + "acc": 0.4107142857142857, + "acc_stderr": 0.0663363415035954, + "f1": 0.37437732746529967 + }, + "copa": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256 + }, + "hellaswag": { + "acc": 0.4823740290778729, + "acc_stderr": 0.004986680048438317, + "acc_norm": 0.6320454092810197, + "acc_norm_stderr": 0.004812633280078256 + }, + "rte": { + "acc": 0.5306859205776173, + "acc_stderr": 0.030039730592197812 + }, + "winogrande": { + "acc": 0.5887924230465666, + "acc_stderr": 0.013829128358676878 + }, + "storycloze_2016": { + "acc": 0.7215392838054516, + "acc_stderr": 0.010365521460604417 + }, + "boolq": { + "acc": 0.599388379204893, + "acc_stderr": 0.008570545612096372 + }, + "arc_easy": { + "acc": 0.6342592592592593, + "acc_stderr": 0.00988298806941883, + "acc_norm": 0.6212121212121212, + "acc_norm_stderr": 0.00995373765654204 + }, + "arc_challenge": { + "acc": 0.29180887372013653, + "acc_stderr": 0.013284525292403503, + "acc_norm": 0.3046075085324232, + "acc_norm_stderr": 0.01344952210993249 + }, + "sciq": { + "acc": 0.917, + "acc_stderr": 0.00872852720607479, + "acc_norm": 0.902, + "acc_norm_stderr": 0.009406619184621236 + }, + "piqa": { + "acc": 0.7600652883569097, + "acc_stderr": 0.009963625892809544, + "acc_norm": 0.7633297062023939, + "acc_norm_stderr": 0.009916841655042809 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/4b284b42bc4_4.json b/4b284b42bc4/evaluation/4b284b42bc4_4.json new file mode 100644 index 0000000000000000000000000000000000000000..758e16da35807d91bcb07151f22b10f3a8b3d60b --- /dev/null +++ b/4b284b42bc4/evaluation/4b284b42bc4_4.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.335, + "acc_stderr": 0.014933117490932572 + }, + "anli_r2": { + "acc": 0.352, + "acc_stderr": 0.015110404505648663 + }, + "anli_r3": { + "acc": 0.3233333333333333, + "acc_stderr": 0.013508372867300215 + }, + "cb": { + "acc": 0.4107142857142857, + "acc_stderr": 0.06633634150359541, + "f1": 0.3098047785547785 + }, + "copa": { + "acc": 0.78, + "acc_stderr": 0.04163331998932262 + }, + "hellaswag": { + "acc": 0.47849034056960765, + "acc_stderr": 0.00498516207433611, + "acc_norm": 0.6403106950806612, + "acc_norm_stderr": 0.00478928472395585 + }, + "rte": { + "acc": 0.4729241877256318, + "acc_stderr": 0.030052303463143706 + }, + "winogrande": { + "acc": 0.595895816890292, + "acc_stderr": 0.01379161066467086 + }, + "storycloze_2016": { + "acc": 0.7279529663281668, + "acc_stderr": 0.01029088806087124 + }, + "boolq": { + "acc": 0.6143730886850153, + "acc_stderr": 0.008513189460768057 + }, + "arc_easy": { + "acc": 0.6447811447811448, + "acc_stderr": 0.009820245899287119, + "acc_norm": 0.6195286195286195, + "acc_norm_stderr": 0.009962305992058567 + }, + "arc_challenge": { + "acc": 0.295221843003413, + "acc_stderr": 0.013329750293382316, + "acc_norm": 0.3046075085324232, + "acc_norm_stderr": 0.013449522109932487 + }, + "sciq": { + "acc": 0.918, + "acc_stderr": 0.008680515615523705, + "acc_norm": 0.902, + "acc_norm_stderr": 0.009406619184621224 + }, + "piqa": { + "acc": 0.7562568008705114, + "acc_stderr": 0.010017199471500619, + "acc_norm": 0.7622415669205659, + "acc_norm_stderr": 0.009932525779525492 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/4b284b42bc4_4_lm-eval_global_step80108_2023-01-30-11-26-38_4shots_backup.json b/4b284b42bc4/evaluation/4b284b42bc4_4_lm-eval_global_step80108_2023-01-30-11-26-38_4shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..758e16da35807d91bcb07151f22b10f3a8b3d60b --- /dev/null +++ b/4b284b42bc4/evaluation/4b284b42bc4_4_lm-eval_global_step80108_2023-01-30-11-26-38_4shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.335, + "acc_stderr": 0.014933117490932572 + }, + "anli_r2": { + "acc": 0.352, + "acc_stderr": 0.015110404505648663 + }, + "anli_r3": { + "acc": 0.3233333333333333, + "acc_stderr": 0.013508372867300215 + }, + "cb": { + "acc": 0.4107142857142857, + "acc_stderr": 0.06633634150359541, + "f1": 0.3098047785547785 + }, + "copa": { + "acc": 0.78, + "acc_stderr": 0.04163331998932262 + }, + "hellaswag": { + "acc": 0.47849034056960765, + "acc_stderr": 0.00498516207433611, + "acc_norm": 0.6403106950806612, + "acc_norm_stderr": 0.00478928472395585 + }, + "rte": { + "acc": 0.4729241877256318, + "acc_stderr": 0.030052303463143706 + }, + "winogrande": { + "acc": 0.595895816890292, + "acc_stderr": 0.01379161066467086 + }, + "storycloze_2016": { + "acc": 0.7279529663281668, + "acc_stderr": 0.01029088806087124 + }, + "boolq": { + "acc": 0.6143730886850153, + "acc_stderr": 0.008513189460768057 + }, + "arc_easy": { + "acc": 0.6447811447811448, + "acc_stderr": 0.009820245899287119, + "acc_norm": 0.6195286195286195, + "acc_norm_stderr": 0.009962305992058567 + }, + "arc_challenge": { + "acc": 0.295221843003413, + "acc_stderr": 0.013329750293382316, + "acc_norm": 0.3046075085324232, + "acc_norm_stderr": 0.013449522109932487 + }, + "sciq": { + "acc": 0.918, + "acc_stderr": 0.008680515615523705, + "acc_norm": 0.902, + "acc_norm_stderr": 0.009406619184621224 + }, + "piqa": { + "acc": 0.7562568008705114, + "acc_stderr": 0.010017199471500619, + "acc_norm": 0.7622415669205659, + "acc_norm_stderr": 0.009932525779525492 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/4b284b42bc4_5.json b/4b284b42bc4/evaluation/4b284b42bc4_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ddad2a41639e40b03f98968f1bc0776dfbd23de4 --- /dev/null +++ b/4b284b42bc4/evaluation/4b284b42bc4_5.json @@ -0,0 +1,66 @@ +{ + "results": { + "anli_r1": { + "acc": 0.323, + "acc_stderr": 0.014794927843348639 + }, + "anli_r2": { + "acc": 0.332, + "acc_stderr": 0.014899597242811475 + }, + "anli_r3": { + "acc": 0.3275, + "acc_stderr": 0.013553211167251961 + }, + "cb": { + "acc": 0.39285714285714285, + "acc_stderr": 0.0658538889806635, + "f1": 0.32470238095238096 + }, + "copa": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256 + }, + "hellaswag": { + "acc": 0.4792869946225851, + "acc_stderr": 0.004985498055190358, + "acc_norm": 0.6384186417048396, + "acc_norm_stderr": 0.004794764843685288 + }, + "rte": { + "acc": 0.5054151624548736, + "acc_stderr": 0.030094698123239966 + }, + "winogrande": { + "acc": 0.5880031570639306, + "acc_stderr": 0.013833112857645937 + }, + "storycloze_2016": { + "acc": 0.7306253340459647, + "acc_stderr": 0.010258997754057014 + }, + "boolq": { + "acc": 0.618960244648318, + "acc_stderr": 0.008493937524439337 + }, + "arc_easy": { + "acc": 0.6426767676767676, + "acc_stderr": 0.00983320561246312, + "acc_norm": 0.625, + "acc_norm_stderr": 0.009933992677987828 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/4b284b42bc4_5_lm-eval_global_step80108_2023-01-30-11-26-38_5shots_backup.json b/4b284b42bc4/evaluation/4b284b42bc4_5_lm-eval_global_step80108_2023-01-30-11-26-38_5shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..ddad2a41639e40b03f98968f1bc0776dfbd23de4 --- /dev/null +++ b/4b284b42bc4/evaluation/4b284b42bc4_5_lm-eval_global_step80108_2023-01-30-11-26-38_5shots_backup.json @@ -0,0 +1,66 @@ +{ + "results": { + "anli_r1": { + "acc": 0.323, + "acc_stderr": 0.014794927843348639 + }, + "anli_r2": { + "acc": 0.332, + "acc_stderr": 0.014899597242811475 + }, + "anli_r3": { + "acc": 0.3275, + "acc_stderr": 0.013553211167251961 + }, + "cb": { + "acc": 0.39285714285714285, + "acc_stderr": 0.0658538889806635, + "f1": 0.32470238095238096 + }, + "copa": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256 + }, + "hellaswag": { + "acc": 0.4792869946225851, + "acc_stderr": 0.004985498055190358, + "acc_norm": 0.6384186417048396, + "acc_norm_stderr": 0.004794764843685288 + }, + "rte": { + "acc": 0.5054151624548736, + "acc_stderr": 0.030094698123239966 + }, + "winogrande": { + "acc": 0.5880031570639306, + "acc_stderr": 0.013833112857645937 + }, + "storycloze_2016": { + "acc": 0.7306253340459647, + "acc_stderr": 0.010258997754057014 + }, + "boolq": { + "acc": 0.618960244648318, + "acc_stderr": 0.008493937524439337 + }, + "arc_easy": { + "acc": 0.6426767676767676, + "acc_stderr": 0.00983320561246312, + "acc_norm": 0.625, + "acc_norm_stderr": 0.009933992677987828 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..d9f096a071d0dd995c102145ae7db0dea85b6a15 --- /dev/null +++ b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.4273206525263921, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05364575256139351 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07516332695488044, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0017494053619516534 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3000002487080154, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004693559983294075 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11230037179769856, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0021352190368007454 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03590952594711019, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00111479086151588 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.14620648751654972, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003234756290211622 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05344453588119793, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013515518560690634 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07168459834878929, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015668817269601622 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.29090649446634975, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004555120095447256 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10771019147913463, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0019671591552995436 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07152558346509422, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001609325107400611 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.28799432197952096, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004451937577964067 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10717155022915739, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019927955387855968 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..cc118b9411af385ee148eb385316ba961a52dc0f --- /dev/null +++ b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.5663724921835591, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03793270967595185 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.08217512985928499, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0015069260165856669 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.3920117360421087, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005374085547490371 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.12756709737220784, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002030468381881967 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03886435424085205, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009357648884166175 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1952944224324245, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003818475633981706 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.0604895960614538, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0013006193696080512 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07729829270693146, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013303444161253287 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3723170212800364, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.005024550440352146 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.12035361435811785, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.00181560137606144 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07779518642109406, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0014090714637609509 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3702218835789373, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004905029541255394 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.1206781467869011, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018904175388126346 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..021361d3ef48697e4dfb923d8cc43941aec508bc --- /dev/null +++ b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.6313130510239234, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.02727704631142144 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.08219107059651697, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0013422258523153536 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.4151352004015792, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005332508949374494 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.12940251519432414, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018563230360622849 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03845495568474405, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008134611533833349 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.20877882384960775, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003907086651198888 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.06086364336249341, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011655118907789416 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07626492600480415, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011790764283155961 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.38715675983971815, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004836346504741043 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.12014889628673367, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0016294871735385334 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07790995067901116, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0012592690527861467 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3934974316637387, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004930425413445172 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.12261763705509457, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017345722248776798 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..edb5a2a838c5c10597ca0aaef61be96e63b956b8 --- /dev/null +++ b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.6636681020720647, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03135011211987113 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.08335165133186997, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0013566594758271408 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.4363611958728497, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005466144892220358 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.13179598729950448, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018608653117238654 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.038821951607757095, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008309945118711307 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.21925719264824975, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003975211126463013 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.06172653863702163, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011849569250187196 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07607458853736868, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011617831771301823 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3989579329521947, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004797365715247612 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.1203852422828686, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015949578691764172 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07866244698502353, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0012631336576740743 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.41007264952780736, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004984839846868383 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.12430003630857168, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017268968711559307 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..62279fb027da1d419d2d529f65279a0019c538dd --- /dev/null +++ b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.7445914925255956, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04471373508927592 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.08335134599012016, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0013601845542222193 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.4369343435538318, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00545111390697167 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.13191715691415712, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001848451971487058 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03875848226958462, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008040225568103941 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.22226421540491542, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004032909658463521 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.061883789388597316, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011527436830992247 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.0754476375575414, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001174153361661276 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3964743307100715, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004828941243739242 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.11940663728897792, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015860574558582763 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07858636401791905, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0012777097204336968 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.41038517941804836, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.005000143163857465 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.12423777278329896, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017238674224559544 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2f80ba9d717785605111df0ef725be26559902ea --- /dev/null +++ b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.8005310739494581, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.033724293174082695 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.08338013300594908, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001260130864731573 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.44613389749037924, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0053462691621041685 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.13259315662099147, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0017356987872365276 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03871058510533021, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007593773025926396 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.22607554732619956, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004027973657557073 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.06197974009288303, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001091686612001906 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07523239671078998, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011063220739786807 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.40282994585844645, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0046937441735825924 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.11956303513601427, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001512266881380029 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07851137617969797, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001188072848467214 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.41817890606333163, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004881701441444398 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.12469867862475097, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016220089487202947 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..867a2ed389b8632ce6dc228169bdabd95f07b1b2 --- /dev/null +++ b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.14692825480864516, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0018425931958625118 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.2512970070836738, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002648136861575683 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.172029086258773, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018269882294786313 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.027966624066931015, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007198127679157396 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.050572163826476904, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00137135380680017 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.03327297097578151, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008291280040639999 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.11468189898145331, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012927912945671872 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.20361426191617446, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002161893564032071 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.13597766469494962, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013133399966305693 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.1348221046642956, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0016706708007563362 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.23199038101436165, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0024449378448124903 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.15820053127675784, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0016635461401018714 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.414984862410896, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05940234677879655 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..1bbc339db0b865a801616c82a014714cb2f4ed57 --- /dev/null +++ b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.20096070137159358, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0024183404012715405 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.31553844384377067, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002899492197080727 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.2221915237870407, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019942889407287214 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.05245104495325915, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0012835508781176563 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0826631709797398, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0017188300283128646 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.056766090400891124, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010805710237233974 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.14393356016704015, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001799791687854001 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.23074840768589486, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022312022411283892 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.15912396957362257, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013601645244304275 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.18819591770649413, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002281534419296507 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.2963346702924994, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0027562668344971878 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.20821150035815492, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018771242477053024 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 2.836407401710481, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04043755617370114 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..9a6677c8754802b47fdb3afc33df242e402e7a72 --- /dev/null +++ b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.20680907589782577, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002488502986523018 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.30989444650796616, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002753786143548591 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.22254451379263313, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001915395486050023 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.05499744970071221, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.001313828428933835 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.08212190745009494, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016893036964833342 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.057748452491246806, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001071605656207478 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.14843146397136125, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0018718282405222096 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.22714968032490238, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0022095657686829933 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.15964814622161871, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013347717397906025 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.19471899528471098, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002355009451192118 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.292336561157753, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0026262120649372398 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.20960062484582434, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018046035863626521 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.0672692499706633, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.049106354646547744 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..f36f38e829da374795de7f700da6f79cf2d35f36 --- /dev/null +++ b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.17474019948410954, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0027474429834987804 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.25521371457329683, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0033152648455079072 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.1820115847047415, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002231556986150065 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.045362687902130126, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0012564421439368422 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.06804888392817302, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0016992089106080323 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.04689779702656875, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001034952296288115 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.1265226291981258, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.002108936479136444 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.18709591094229805, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002591170589040495 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.13086000378258256, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015951624748139174 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.1650854923795236, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0026195222440337307 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.24111486429370707, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.003150424029915772 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.17171382413552935, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021029154857551075 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.0771001660724235, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.10166627860233955 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..ab78bda4e0e9da3accb538a0a861fd30e75a911f --- /dev/null +++ b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.05686654081896349, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002233440497229162 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.08353923265042008, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0029077074042461065 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.05660115935642753, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019128121580337133 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.012648534117560613, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.000714480304383431 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.02199374005627044, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0012054249033507543 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.013741746630537094, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0006742800088300104 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.04243723633575883, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0017020140807939923 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.0631413785577323, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002247650978929538 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.04194122086488744, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014016615434499398 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.05323619772247561, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0020619540523883185 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.07914684184702428, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0027725917227984414 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.05334942672891278, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018012620317959791 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.6074700254086013, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05423743996990111 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3cc280e787cfcdf46447ec8af510ab35f02550d7 --- /dev/null +++ b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.008781294870051274, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0009050499890841822 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.013677582922976033, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0013193791852402054 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.009252067005496314, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0008857404763954463 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.002216585419720102, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0003047584752236144 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0035060227812251585, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0004475043348074056 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.0023512305693387013, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0002931024418143357 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.00679769925058833, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.000718459101307871 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.010367182111106583, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0010118399277303067 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.007007677157867705, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0006708462523675694 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.0082009891177148, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0008441371445186326 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.012916195361676434, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0012503160451136067 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.008671543071872297, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0008282868408056053 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.0033020706801269e-06, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 1.9077024795694913e-06 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..64aae186798b5d819b40243d33e28b3e4a44f901 --- /dev/null +++ b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 0.5451634050753023, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.03476122561593657 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.046590041682019065, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0018113877733116784 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.05509845941495712, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0017154015403554534 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.04310290552413315, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0013354170612674893 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.0061050451406738674, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0004913565515769247 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.009015504658999519, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0006494884515366231 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.0063826724183375155, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0004428905323251431 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.04442283664669869, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0016857127584432392 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.053364978373077035, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0016220526842467798 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.041469788976882346, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0012468639228984321 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.04212798093112617, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.00163957458202675 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.04928251726552432, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0014753429112820023 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.03867299387707548, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0011657405780310776 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e4cca8a9ccfed473ba141c4fbf3491516685f377 --- /dev/null +++ b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 11.396112958955289, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.12774001717020914 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5339579313432855, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0031227264254700595 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.42523020862127436, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002981115684302531 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4479642771715969, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0023079960050490524 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.248732624279182, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002509203740513705 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.1952491238459977, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0020864674211335237 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2056915755809246, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0019163170514045066 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.3850344789575973, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00278033592883375 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3033612056952134, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002388739694886294 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3204968358680575, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001985520070286022 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4342921156865786, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003003575710929277 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.34421205733760346, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0026942002219059206 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.36313254630951347, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0022360796971689707 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..1d2dbf5feed3f2850efad9495bf74ba58d9662bb --- /dev/null +++ b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 12.976405649449786, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.20113725542092992 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.558994227405181, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032295866736836163 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.44495262240037453, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029490508361979437 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.47007019927530463, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022920087841303556 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.27217728286113113, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0026898706377774815 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.21411207509510502, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002230097528851014 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.22591032128288588, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0020560357969047136 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.40975809640656674, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.00294221380121017 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3236212819558878, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024673142471829217 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3425232001087342, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0020806737697165846 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.46033224287141866, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031762512099836643 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3647714098177991, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027210820647532467 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.3859835303051329, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023029475516390383 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..7bf020ecbd9aa73a8dae4d882725a3ee2804fc34 --- /dev/null +++ b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 13.841780368990428, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.16004525418920557 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5672114023663154, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0031836037058964135 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4506315681499688, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0028976257859202057 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4775062117887383, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022520658159749783 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2831489435622744, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0026849363746315716 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2224359793863677, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022585552678834257 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.23547797340215765, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.002080191282609909 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.42052513955243626, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0029794181672521524 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3316609702212396, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002482260811969057 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3522099816274127, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021360587201332483 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.4731322251491655, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031830112351273762 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3751305124701987, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027537781991579034 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.39783266320253785, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023487903214587246 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..89b10522c06aca6e8f25cd4f8d404c6762d42367 --- /dev/null +++ b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.347043577937871, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.13183200117809515 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5657141271378652, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0032494806378829547 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4543139798474618, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002865992018425755 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4791803139648653, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0022359999842246512 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.2840374640470508, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0028091760925762137 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.22548495306647312, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002307333846171272 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.23765394178309218, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0021422406093072697 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4208287732411973, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.003031588432467615 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.33645734099493246, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0025176571940632108 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3551586826634256, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0021591742079548407 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.47480321305856177, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003220379872286072 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.38184109614706496, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002799589796872638 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.40243539061119715, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.002367520270615011 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f8f697084677643206c810d2b694926a8c380090 --- /dev/null +++ b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 14.305213942543, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1483607819215848 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.5597772799865962, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0031779357829373007 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.45931387100423793, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002804556836326018 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4804680058354203, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0021741397703150594 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.27941286210601324, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.002685923767684149 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2261024234770814, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002220825905817483 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.2366049201616526, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0020482405764989274 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.4153388685887487, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002922026669760074 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.33991925444128, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024671212347911007 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3555529772285316, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0020868456031009723 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.470187398074112, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0031325933820836164 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3864435015330604, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027449279832689228 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.403868227449958, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0023025847428200684 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_gem_xsum_article_DOC_summary_0.json b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..91c8cd4d2b0bb3a1a4a14b74a748e2220c557ece --- /dev/null +++ b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.1691487054500622, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0024250397934530475 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.3329424577074328, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004657379753879945 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.2135984788737725, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0026714992621488937 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.03660197341061794, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0013876615412254558 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.07882441796533844, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0029121244310315966 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04790575968435739, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0017255007150536288 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.12480506419752084, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0018165017981600132 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.24849031836840368, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0036491823702079194 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.15810158263668517, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0020195259778767218 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.12993544368406393, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0019145615600976785 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.26077607018509813, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004054503415661984 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.16520124922802173, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0022185800515451995 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.103288067343989, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.12415880355418442 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_gem_xsum_article_DOC_summary_1.json b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..fe6b0b037333d31d7e0f97a3d9fb176661da7b41 --- /dev/null +++ b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.1369839651831878, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0019614322408207006 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.3369949215642884, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004522579510481593 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.19244278875333556, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002639391462026543 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.031094803643512716, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011365997631264868 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.07935713496492708, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002884276945599807 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04413377405232099, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015985481401232032 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.10657563716596334, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014875062360428326 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.26467978376956175, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0036163241442920165 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.1500620628581487, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0020207447957827454 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.1078060619083588, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0016045831813046104 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.2680535986457037, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0038953175735036904 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.15187454343375792, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0021906040942089064 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.7714825425476433, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.10905675609895228 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_gem_xsum_article_DOC_summary_2.json b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..3a770523adb0b1ea1ddc8fc64454faabc86c74f8 --- /dev/null +++ b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.1416725332039807, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0019003755973271465 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.3459660596871756, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004306340084752496 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.19861514735566918, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002536869900552261 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.032589396300244954, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.001135411316234186 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.08273970858446308, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0029435229941216076 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.046170973346933354, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0016039666517335285 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.11131065526290164, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014510877883445626 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.2741645486545069, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003497544935564585 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.1563808799657367, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0019620345863827896 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.11082674371408649, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015845483453646742 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.2737340240682109, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003854876130312793 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.15581654576519338, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0021586761800846295 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.8719539285791582, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.12363679141275902 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_gem_xsum_article_DOC_summary_3.json b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8c91589f95058b43b41c617b0a12473e8c105b7e --- /dev/null +++ b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.13956778448576387, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0021737270808076243 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.329102550985314, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004775853265191935 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.19191359742761793, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00277051626651925 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.034143138087541, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0012197975952686348 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.08270285997486158, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0029002084266991856 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04715930396420784, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.001624147693037141 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.11089339313344634, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0016751091885532099 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.26375167528125915, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0038713870137101424 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.1528761624914166, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002169630539318251 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.11161318605392968, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0018444411300991288 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.26547328847718554, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004239107141632789 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.15380687866772882, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002388364659924707 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.0369871247788307, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.11889285675918113 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_gem_xsum_article_DOC_summary_4.json b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..3e6a9fc67bd00757d79cdfeffe2efca8b379e77d --- /dev/null +++ b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.045437643598581885, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002731741164998445 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.0834633850546414, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004709515538170699 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.053658040802239085, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0029774150017601803 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.009714659363066126, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0009061043015811388 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.020204993259341004, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0017173474943290197 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.012335864733508397, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0010450675751901856 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.03525079234225019, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0021438053902596605 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.06521180915166398, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003703751644624955 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.041486821630655535, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0022764116642440576 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.03647650806323336, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.002248078408967042 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.06711329846733026, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003863144394106958 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.04288192765446807, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0023977202282267356 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 0.9532704572175751, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.1334974714932564 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_gem_xsum_article_DOC_summary_5.json b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..16f79b893023ff49ff4956eba7b8f290f036ed86 --- /dev/null +++ b/4b284b42bc4/evaluation/generation/slim.4b284b42bc4_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.003263719197769084, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0008887617448902996 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.0028345456514645845, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0008116977743173961 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.002997767517780879, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0008334506629000642 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0004590858641913026, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00021377485443811557 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.00041953329689178745, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00021411305439143847 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0004337191943913522, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00021214389566854186 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0023145025817495005, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0006312757980824667 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.0019717134578344517, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0005600175515270516 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.002106529594577242, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0005845294157065285 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.0026347083023874887, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0007273791404483311 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.00225801889401816, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.000653191190142516 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0024033045589119204, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0006766571604709825 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 3.120169018915429e-39, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 2.777103521919484e-32 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b42bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..29ec64ff87c434ac0e67fe8d4599ec364f2c63f7 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "bleu": 0.7296458163665333, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.03550662426728089}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_precision": 0.073601220000616, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001145132160087058}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_recall": 0.4252463565222065, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005191805176994548}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge1_fmeasure": 0.11795106654953416, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0015900535376520654}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_precision": 0.03391186812656995, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0007058000824068418}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_recall": 0.2132946176974872, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.003991916242021659}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rouge2_fmeasure": 0.05471954734528363, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00103145574907618}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_precision": 0.06738065913816185, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001053247887367618}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_recall": 0.3825929171137152, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004448563155136455}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeL_fmeasure": 0.10768674313552495, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014533174914988178}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_precision": 0.06988985169584011, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001094990559251859}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_recall": 0.40083936944093385, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0047880465656518125}, {"task_name": "GEM/web_nlg_en", "prompt_name": "PALM_prompt", "rougeLsum_fmeasure": 0.11184463338518444, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "3e41305c-5461-4cf3-853d-8a6fb5747623", "prompt_jinja": "I will verbalize an abstract representation of a sentence in natural language. To do so, I will first show the representation and then the natural language. The text needs to include all of the information in the representation.\n\n{{input | join(\", \")}} {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0015162341039388436}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9a6a76b4f7f5a206783c16dfbff7da949d6675b7 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_precision": 0.5250547971158916, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.006526333519960469}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_recall": 0.5260187209493986, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004988544987665089}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge1_fmeasure": 0.4647781103480216, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.004790884102018922}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_precision": 0.30884166571168326, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005379997789933143}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_recall": 0.3023342622914897, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004492206468375162}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rouge2_fmeasure": 0.2679305909417252, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.004202728742347106}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_precision": 0.4329316988391663, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.005911708225945943}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_recall": 0.4338587742209103, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0046217454632362116}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeL_fmeasure": 0.3810932594090092, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.004376763124965794}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_precision": 0.46578441722776126, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.006050052816419848}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_recall": 0.4662603781191038, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004711548879198356}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "rougeLsum_fmeasure": 0.4108525733234152, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004440842788632338}, {"task_name": "GEM/web_nlg_en", "prompt_name": "explicit-graph-description2", "bleu": 6.569436323858852, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "afeec167-f75f-4687-a775-1efde7d04780", "prompt_jinja": "{{input | join(\", \")}}. \n\nThe above is a set of subject | predicate | object expressions separated by commas: \nWrite all the information in proper sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.19648337614350567}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9005aa120e322447b2fa674350944705bd786c55 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "bleu": 1.5908083756467843, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.040459392268950733}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_precision": 0.16505778962329926, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0049314256147464765}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_recall": 0.5270727656034352, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004910714577670071}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge1_fmeasure": 0.2011175379818372, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0037849101593001905}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_precision": 0.09349989978109244, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0035183082522432544}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_recall": 0.29813686484545676, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004411286169203871}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rouge2_fmeasure": 0.11030533651474397, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0028381873312750285}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_precision": 0.1375823452242129, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.004312477970104036}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_recall": 0.4482402722752172, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004698309401256882}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeL_fmeasure": 0.16712441262992817, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0033343938914818156}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_precision": 0.1484650669263718, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.004512916698123371}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_recall": 0.4754461608779832, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004716833510812915}, {"task_name": "GEM/web_nlg_en", "prompt_name": "implicit-graph-description", "rougeLsum_fmeasure": 0.180675646399236, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "38342608-5cd7-4ce7-b2e1-905ecd7f4c80", "prompt_jinja": "{{input | join(\"; \")}}\nThe above is a collection of relations. Write descriptive English that contains this information.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0035123928511381465}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..61a4c65d279486872ca7048bcf0df12b07bc3255 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_precision": 0.14987304889742048, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00211774521942385}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_recall": 0.6456959062505755, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004344583145036674}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge1_fmeasure": 0.22905593642235608, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0026110238218651403}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_precision": 0.07238529962362035, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013168820567141873}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_recall": 0.3424215487119374, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.004425019552858426}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rouge2_fmeasure": 0.11213685323825111, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0017667303350931108}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_precision": 0.11528840797824373, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0015945340200480952}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_recall": 0.5205446307169246, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004283561049117661}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeL_fmeasure": 0.17763538913703175, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.001990586966290515}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_precision": 0.13167907204486873, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018683868054250095}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_recall": 0.5741275519955095, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004139851361359045}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "rougeLsum_fmeasure": 0.20141307816937243, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0022952264672338745}, {"task_name": "GEM/web_nlg_en", "prompt_name": "non-explicit-description", "bleu": 2.3322798661113775, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "9415bd8a-685f-4fa4-803a-f09bd47d4603", "prompt_jinja": "I am taking this tabular data, where each row is separated by a \"|\" and expresses a relation between an object and a predicate : {{input | join(\", \")}}. \n\nNow, I will produce a description of the tabular data using English sentences. {% for i in references %}\n ||| {{ i }} \n{% endfor %}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.043126880519805975}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..07bb63989c8fe89e5774a02e94285fd56cc48342 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_precision": 0.2876720140551327, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00688536330385178}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_recall": 0.6509523532377642, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.004426354405109346}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge1_fmeasure": 0.31019840668068493, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.005210891426322614}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_precision": 0.16449661779441496, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.005024159389839176}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_recall": 0.35535074696278424, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0044950408874512215}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rouge2_fmeasure": 0.16968874155890248, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.003989086414493616}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_precision": 0.23331075033626017, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.006039029406866991}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_recall": 0.527937321532115, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.004327075413054229}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeL_fmeasure": 0.24834177180496544, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.004589491920598505}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_precision": 0.2546911797589938, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0061880736829282735}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_recall": 0.5882477978181261, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004386879252622582}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "rougeLsum_fmeasure": 0.27536819164917997, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.004661628766530461}, {"task_name": "GEM/web_nlg_en", "prompt_name": "very-explicit-description", "bleu": 3.005668095092833, "fixed_answer_choice_list": null, "dataset_path": "GEM/web_nlg", "dataset_name": "en", "subset": null, "prompt_id": "426b682e-e801-4e8d-9ac3-5b676c9d3da2", "prompt_jinja": "A semantic triple is the atomic data entity in the Resource Description Framework (RDF) data model. As its name indicates, a triple is a set of three entities that codifies a statement about semantic data in the form of subject\u2013predicate\u2013object expressions. (e.g., \"Bob | is | 35\", or \"Bob | knows | John\"). \n\nA graph can be formed from a set of these triples. An example is {{input | join(\", \")}}. \n\nWrite grammatical text expressing all the relations succinctly and fluently.\n{% for i in references %}\n ||| {{ i }} \n{% endfor %}\n\n", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.05197715337478125}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9c69e9ef54757b22b96ec8bea00aa2f3929c411f --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_precision": 0.008789451068318839, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0010004545383001592}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_recall": 0.013328625710624398, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0013275176482542086}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge1_fmeasure": 0.008658735255805204, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.000839492718669307}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_precision": 0.002082867589754122, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0002851502666692692}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_recall": 0.003609805272035285, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0005284577552828623}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rouge2_fmeasure": 0.002204071789139233, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0002691212763650804}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_precision": 0.006775055782148343, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0008305942690146914}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_recall": 0.010253688567910776, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0010420446053404594}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeL_fmeasure": 0.00646696721189212, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0006157556419200244}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_precision": 0.008165101675646422, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0009438283367636781}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_recall": 0.01246657920584301, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.001255613374811832}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "rougeLsum_fmeasure": 0.008010216061680219, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007753360463925913}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "article_summary_en", "bleu": 3.6190370766184897e-07, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "2038df7b-5420-4a33-87ec-09715419deef", "prompt_jinja": "Article in English: {{source}}\n\nSummary in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 9.277437199555188e-07}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..67b21dda6377f07992213a4f9b34d3ab9f0e8ff7 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_precision": 0.0071721084411027645, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0008062431739861646}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_recall": 0.011000858713299045, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0011849996643443859}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge1_fmeasure": 0.007360373049855638, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0007652704034617401}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_precision": 0.0018535699123667382, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00029377234273353614}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_recall": 0.003118744309300992, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0005028087678605357}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rouge2_fmeasure": 0.0018881229545083928, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00025433510052724046}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_precision": 0.00576293174363509, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0006366250522678441}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_recall": 0.009081542967345346, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.000988689403008336}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeL_fmeasure": 0.005940183094440743, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0006047123448652783}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_precision": 0.006752496827809557, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0007738840304642114}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_recall": 0.010255856976059748, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0011143260905679664}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "rougeLsum_fmeasure": 0.006849433472012855, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007165856486605063}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "rephrase_en", "bleu": 4.4756569462780825e-08, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "753f0a46-aeff-4cd2-932c-8548897cebe5", "prompt_jinja": "{{source}}\n\nHow would you rephrase that briefly in English? ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 9.984205007691265e-08}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9a0cd4e73131aeaec34f7d51286ac87d02e3e29b --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_precision": 0.00687892058663268, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0008286135496574055}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_recall": 0.01056621052806095, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0011884533368688181}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge1_fmeasure": 0.007071687067932176, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0007575078749040233}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_precision": 0.0017548186212423718, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0002898932260307108}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_recall": 0.002967002452893765, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0005390509164924998}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rouge2_fmeasure": 0.0018174950889073276, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00026787948234223484}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_precision": 0.005630373669208392, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0006824790651957477}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_recall": 0.008937401502586562, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.001025114612153835}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeL_fmeasure": 0.005805102612750907, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0006082156330417145}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_precision": 0.006418174596014823, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0007845442619851386}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_recall": 0.009839905297862224, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0011267718061560392}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "rougeLsum_fmeasure": 0.006558475135159909, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007058336657415859}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "summarize_above_en", "bleu": 5.064264014418453e-09, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "088288f3-7516-4cf7-9406-0e082053bf54", "prompt_jinja": "{{source}}\n\n===\n\nWrite a summary of the text above in English : ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 9.113572869828186e-09}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f474890625e6d8e4f8f6db76ade28d4b319cd149 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_precision": 0.009018320417866373, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0009515455079029407}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_recall": 0.013477068819464475, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0013177953296362858}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge1_fmeasure": 0.009010391314895649, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0008608910415638986}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_precision": 0.002361799926713296, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0003370630501668582}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_recall": 0.0034894806830281814, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0004812458467814835}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rouge2_fmeasure": 0.002248573836077081, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00028843241279034187}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_precision": 0.007020816303734581, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0007477069794008652}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_recall": 0.010607117596650792, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0010619737593296397}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeL_fmeasure": 0.006956210385184894, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0006626528079735124}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_precision": 0.008615795202441968, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0009202005223985237}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_recall": 0.012858462170759524, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0012682727385585405}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "rougeLsum_fmeasure": 0.008558581769365812, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0008212318653585135}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "tldr_en", "bleu": 1.0169232766561087e-06, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "d3c5baa3-5e37-46f8-b1b2-5b834181c9da", "prompt_jinja": "{{source}}\n\nTL;DR in English: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 2.0148976704692234e-06}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9f7b41303c7e4320fb2d3c7226b6979e9040375e --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_precision": 0.0023454669535560293, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.00041680548587382345}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_recall": 0.004318536854475181, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0007123123450106542}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge1_fmeasure": 0.0027634263052122906, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00045380600644524696}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_precision": 0.00046366175256993565, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0001167507097118896}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_recall": 0.0009755153463963649, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00027677412176346257}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rouge2_fmeasure": 0.0005725032484038239, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00014153750802328502}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_precision": 0.0017074942427303989, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.000279882532535111}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_recall": 0.0033900936511171978, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0005689158044979831}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeL_fmeasure": 0.0020630101397975317, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00032374717159122884}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_precision": 0.0021646208543707438, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0003758540347826323}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_recall": 0.004084932915136834, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.000675458669366651}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "rougeLsum_fmeasure": 0.002577495777518565, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0004184913516849742}, {"task_name": "GEM/wiki_lingua_en", "prompt_name": "write_abstract_en", "bleu": 3.5681280810377884e-14, "fixed_answer_choice_list": null, "dataset_path": "GEM/wiki_lingua", "dataset_name": "en", "subset": null, "prompt_id": "dff7b314-7385-4855-bb90-253073a34fde", "prompt_jinja": "First, read the English article below.\n\n{{source}} \n\nNow, please write a short abstract for it in English. ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.3433256826316355e-13}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_heres_a_problem_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..609a0ac6df21bd48255ca4f5b541a56ab92a1c2b --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_heres_a_problem_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc": 0.24873737373737373, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008870224411653797}, {"task_name": "arc_easy", "prompt_name": "heres_a_problem", "acc_norm": 0.24873737373737373, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "d90da519-0e2c-4f9b-a546-7cba82824eb2", "prompt_jinja": "Here's a problem to solve: {{question}}\n\nAmong the 4 following options, which is the correct answer?\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n {% endfor %}|||{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008870224411653797}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_i_am_hesitating_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c3eb8201454f508b141dd627943d7116213d0c6e --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_i_am_hesitating_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc": 0.31902356902356904, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009564133249441088}, {"task_name": "arc_easy", "prompt_name": "i_am_hesitating", "acc_norm": 0.3042929292929293, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "4fb13ac1-f770-45ea-b5d5-91ac50b0d609", "prompt_jinja": "I am hesitating between 4 options to answer the following question, which option should I choose?\nQuestion: {{question}}\nPossibilities:\n- {{answer_choices | join(\"\\n- \")}}|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009441202922359183}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_multiple_choice_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a80d7056c4117d7b09b03c2cce18cb212ed1d88b --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_multiple_choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc": 0.3312289562289562, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009657641311350914}, {"task_name": "arc_easy", "prompt_name": "multiple_choice", "acc_norm": 0.3312289562289562, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "8c689423-880d-402b-8c7d-a1a98c7589e8", "prompt_jinja": "I gave my students this multiple choice question: {{question}}\n\nOnly one answer is correct among these 4 choices:\n- {{answer_choices | join(\"\\n- \")}}\n\nCould you tell me which one is correct?|||\n{{answer_choices[choices[\"label\"].index(answerKey)]}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009657641311350914}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_pick_the_most_correct_option_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..08ba35de4d495dbed890f2f6d2fe315fb6fcf7e6 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_arc_easy_pick_the_most_correct_option_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc": 0.23779461279461278, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008735850753507992}, {"task_name": "arc_easy", "prompt_name": "pick_the_most_correct_option", "acc_norm": 0.23779461279461278, "fixed_answer_choice_list": null, "dataset_path": "ai2_arc", "dataset_name": "ARC-Easy", "subset": null, "prompt_id": "033498ca-3d9a-47e3-b631-d881ab53b5ad", "prompt_jinja": "Pick the most correct option to answer the following question.\n\n{{question}}\n\nOptions:\n{% for letter, t in zip(answer_choices, choices.text) %}\n- {{letter}}: {{t}}\n{% endfor %} |||\n{{answerKey}}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008735850753507992}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_boolq_GPT-3-Style_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_GPT-3-Style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..73f8f9ccf6c1f13021db15e64e842989f154c5d4 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_GPT-3-Style_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc": 0.5743333333333334, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009028770205053254}, {"task_name": "boolq", "prompt_name": "GPT-3 Style", "acc_norm": 0.6243333333333333, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "492f0f88-4370-46cd-839b-1de37a55aeda", "prompt_jinja": "{{ passage }} \nQuestion: {{ question }}\nAnswer: ||| \n{% if label != -1 %}\n{{ answer_choices[label] }}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008843442555522137}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_boolq_after_reading_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_after_reading_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ed38f2568cc4ed26ef74f82c71aebf660ebc96b1 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_after_reading_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "after_reading", "acc": 0.618, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008872329987840877}, {"task_name": "boolq", "prompt_name": "after_reading", "acc_norm": 0.6156666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "3e386463-1715-4578-9cba-07d11a0d3b61", "prompt_jinja": "Passage: {{passage}}\n\nAfter reading this passage, I have a question: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008882569490543052}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_boolq_exercise_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_exercise_5.json new file mode 100644 index 0000000000000000000000000000000000000000..307767585ff048c11638258732266a81b749fd4b --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_exercise_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "exercise", "acc": 0.5746666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.009027853030468718}, {"task_name": "boolq", "prompt_name": "exercise", "acc_norm": 0.5613333333333334, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "9f4c6b0a-437b-40c0-b467-db4b7218d38d", "prompt_jinja": "Exercise: read the text and answer the question by True or False.\n\nText: {{passage}}\nQuestion: {{question}}? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009061278956794627}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_boolq_valid_binary_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_valid_binary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a5863aab5ec7a44f4703cf1881b85e7bb6c44a9a --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_valid_binary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "valid_binary", "acc": 0.5936666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008968593186211788}, {"task_name": "boolq", "prompt_name": "valid_binary", "acc_norm": 0.5826666666666667, "fixed_answer_choice_list": ["False", "True"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "eb78772c-e81e-4b8a-a77b-b75efd1c212a", "prompt_jinja": "{{passage}}\n\nQ: {{question}}? True or False? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.009004578551254038}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_boolq_yes_no_question_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_yes_no_question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2de012729f57c8b2615c5e7768e9cf37e47b7801 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_boolq_yes_no_question_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "boolq", "prompt_name": "yes_no_question", "acc": 0.592, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_stderr": 0.008974343780026196}, {"task_name": "boolq", "prompt_name": "yes_no_question", "acc_norm": 0.615, "fixed_answer_choice_list": ["No", "Yes"], "dataset_path": "super_glue", "dataset_name": "boolq", "subset": null, "prompt_id": "7cf7acdf-e3a2-459f-a3e8-2e2d27dd6aa5", "prompt_jinja": "Text: {{passage}}\n\nAnswer the following yes/no question: {{question}}? Yes or no? |||\n{% if label != -1 %}\n{{answer_choices[label]}}\n{% endif %}", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.008885455368505631}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_coherent_text_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_coherent_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a56930d534e8206c6a25e9b03fb6252689fd391e --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_coherent_text_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "bleu": 0.19184679423806675, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.022037762977008487}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_precision": 0.07085819095804324, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0008739378619952876}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_recall": 0.12081072716164527, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.001498467805704836}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge1_fmeasure": 0.08571141422567272, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0010070427476495602}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_precision": 0.01036855856104598, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0003271837961214723}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_recall": 0.019056061372484043, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0006137038599382484}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rouge2_fmeasure": 0.012899374564684497, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003990669897618305}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_precision": 0.06151421292491904, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0007556149506381246}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_recall": 0.103936628362695, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0012690278156809636}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeL_fmeasure": 0.07406114989234146, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0008598918508550648}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_precision": 0.06380081678271861, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0007875802399240677}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_recall": 0.10747600591297043, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0013011638723965141}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "coherent_text", "rougeLsum_fmeasure": 0.07676935478960281, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "bdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Please generate a restaurant description from the information given below:\n\n{{meaning_representation}} ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0008934049270425214}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d4de4e006697f0826a20670ad0970f5a871fa4c8 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "bleu": 6.587306536268369, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.10692008856583401}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_precision": 0.29207881892182114, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0016742571355490715}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_recall": 0.5488220992288727, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026711227542546754}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge1_fmeasure": 0.37337778012529416, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0017669311485306457}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_precision": 0.1312507338877233, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0011753490960191555}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_recall": 0.2551517817425887, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002309521424602609}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rouge2_fmeasure": 0.16931797936430898, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0014242506039166992}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_precision": 0.20319280490536853, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0012359159771650986}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_recall": 0.38732665353245815, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.002389449596791391}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeL_fmeasure": 0.26091928752281107, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0014097244239106089}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_precision": 0.2447499452943357, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0015559671878429087}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_recall": 0.45974120843364774, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.002585367746227488}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "create_text_for_me", "rougeLsum_fmeasure": 0.31278487914620323, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "711bcf63-be82-4937-bdef-0c379d20bb74", "prompt_jinja": "How would we create an appropriate text out of the following data?\n{{meaning_representation}}\n\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0017048680784067954}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c5fb01bb5cb60be8cf75c7a710047b30b6ffb388 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "bleu": 8.860001080746043, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08032441766214476}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_precision": 0.2282401668267717, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0038058594996834105}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_recall": 0.31244080722334644, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.005069779908018149}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge1_fmeasure": 0.2511509133353019, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.003958510148281916}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_precision": 0.0975927722890742, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0018020325335006712}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_recall": 0.1424180953282649, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002666484802316808}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rouge2_fmeasure": 0.11228734342841896, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0020300113350908327}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_precision": 0.15912467721798776, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002881273067329202}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_recall": 0.21648776523904875, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.003670985113284094}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeL_fmeasure": 0.17297752185577242, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.002800984657281145}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_precision": 0.19047477970761165, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.003328420385981871}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_recall": 0.2591907447398561, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.004321161736207405}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_gramatically_correct_text", "rougeLsum_fmeasure": 0.2082452063786695, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "0f54b6e2-42c0-45ec-8ea2-2e6204388f76", "prompt_jinja": "Combine all of the following data into a concise and grammatically correct text:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\n||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.003367977990331253}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3cce5d3eb0d4efcdeb9dd6daaabc7dd968e9a94e --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "bleu": 11.256627548432805, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.14543377736086152}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_precision": 0.4215376032960452, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0024025181710894523}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_recall": 0.48727427299265463, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.002867231297396204}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge1_fmeasure": 0.43467772454619635, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0020177075108414894}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_precision": 0.19324475626688103, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0018664258727883866}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_recall": 0.22689708066119227, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0022677822733911217}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rouge2_fmeasure": 0.19984409683367668, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0017840474360257142}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_precision": 0.3043439759839776, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.002001573450675394}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_recall": 0.3536697494352659, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0024814074442155673}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeL_fmeasure": 0.31426345805670414, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.0017762603380325856}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_precision": 0.35399958676397025, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0022946023988637647}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_recall": 0.40981680597146897, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0027603392115545873}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "generate_text_restaurant", "rougeLsum_fmeasure": 0.36514922201465566, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "1acabbc3-c9b9-4624-a684-29faeccff46f", "prompt_jinja": "Given the following data about a restaurant:\n{% for feature in meaning_representation.split(\"]\") %} {% set key = feature.split(\"[\")[0].replace(\",\",\"\") %} {% set value = feature.replace(\",\",\"\").replace(key+\"[\", '''') %}\n{% if value != \"\" %} {{key}} : {{value}} {% endif %}\n{%- endfor %}\nGenerate some text about this restaurant. ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0020264847795391197}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_text_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..39216332c2e7bd130496fb0f6c6b3e7b6e7008d3 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_e2e_nlg_cleaned_text_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "bleu": 6.533815973282043, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.08061056917302369}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_precision": 0.34611363968855946, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0019385334843998432}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_recall": 0.5539875876841414, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0026385933391894155}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge1_fmeasure": 0.4167090003485881, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0019025852377845915}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_precision": 0.15562713297860187, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0013970662715098196}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_recall": 0.2563150719822735, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.002317419034144347}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rouge2_fmeasure": 0.18892502275614784, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0015944430544224384}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_precision": 0.2477131986505616, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0014320235517345649}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_recall": 0.40260417204474347, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0023919020673671696}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeL_fmeasure": 0.29982493987141584, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00153168169621043}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_precision": 0.28865692255613695, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0018086881267386193}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_recall": 0.4624261352543478, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0025817101491570557}, {"task_name": "e2e_nlg_cleaned", "prompt_name": "text", "rougeLsum_fmeasure": 0.34759708152384694, "fixed_answer_choice_list": null, "dataset_path": "e2e_nlg_cleaned", "dataset_name": null, "subset": null, "prompt_id": "cdecbb5a-d3e8-46f3-9ea8-22025bc59e3b", "prompt_jinja": "Information: {{meaning_representation}}\\nDescription: ||| {{human_reference}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0018602576393587102}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..422e2c6bf57415f819d26e4d53bcc5407240628c --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_precision": 0.003430531732418525, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0017130559457731909}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_recall": 0.0001419934412938108, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 7.313843738100783e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge1_fmeasure": 0.00027204189940038993, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00013974420457799956}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rouge2_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_precision": 0.003430531732418525, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0017130559457731909}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_recall": 0.0001419934412938108, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 7.313843738100783e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeL_fmeasure": 0.00027204189940038993, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00013974420457799956}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_precision": 0.003430531732418525, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0017130559457731909}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_recall": 0.0001419934412938108, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 7.313843738100783e-05}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "rougeLsum_fmeasure": 0.00027204189940038993, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00013974420457799956}, {"task_name": "gem_xsum", "prompt_name": "DOC_boils_down_to_simple_idea_that", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "2b9c75ca-2848-4a63-b3ce-b86ea2e2d7e8", "prompt_jinja": "{{document}}\nThis boils down to the simple idea that ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_DOC_tldr_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_DOC_tldr_5.json new file mode 100644 index 0000000000000000000000000000000000000000..47cc4eb1639e5a306341529190de55c955225e53 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_DOC_tldr_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_precision": 0.0030992000829807302, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0008839473593834359}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_recall": 0.0020573512018858386, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0005662216505729439}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge1_fmeasure": 0.002434311069482336, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0006788393641812058}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_precision": 0.0004947882306372872, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00024179256073647307}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_recall": 0.00031093671858319327, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00015037977152186958}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rouge2_fmeasure": 0.0003814915603218597, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00018509478605359118}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_precision": 0.0021878495947711967, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.000617526220895422}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_recall": 0.0014675923007687014, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0003957011729235515}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeL_fmeasure": 0.0017256830545079964, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00047284780236469653}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_precision": 0.002542279201420559, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0007188937748958946}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_recall": 0.0017189600670503991, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0004715986190800611}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "rougeLsum_fmeasure": 0.002018185168567718, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005599180971250053}, {"task_name": "gem_xsum", "prompt_name": "DOC_tldr", "bleu": 6.210516834436456e-43, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "db54e9b5-8ca9-4266-a773-695a3dc5bbf4", "prompt_jinja": "{{document}}\n\nTL;DR: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 9.857697644049837e-36}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_article_DOC_summary_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..84f853c268d29ccd92cfcee3057218d605642a11 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_precision": 0.00321897943071785, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0009979849053940774}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_recall": 0.002638556229987107, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0008069095304213252}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge1_fmeasure": 0.002850511566655562, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0008725813491317338}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_precision": 0.0008682722133420189, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00042068273727211144}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_recall": 0.0006912233160845813, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00036262248705394215}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rouge2_fmeasure": 0.0007583670569643345, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0003812254884717486}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_precision": 0.0024287372106315966, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.0007289443610313556}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_recall": 0.002057235735729571, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.000619733025851515}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeL_fmeasure": 0.002188808353154973, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.000656190328926511}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_precision": 0.002734584249275528, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0008545960784962966}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_recall": 0.002247308506447503, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0006766768468342992}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "rougeLsum_fmeasure": 0.0024206446753818178, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0007355689947184377}, {"task_name": "gem_xsum", "prompt_name": "article_DOC_summary", "bleu": 1.531796041787971e-36, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "a8d4ecfa-c944-44d5-878c-04fd5db59e64", "prompt_jinja": "Article: {{document}}\n\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 1.3347176862997463e-30}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_summarize_DOC_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_summarize_DOC_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ddbf7d364b584f9234c47d496fbcaab791f2871a --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_summarize_DOC_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_precision": 0.0031151406877938798, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.0009113722637675773}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_recall": 0.0025701774504185604, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.0007379642066523732}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge1_fmeasure": 0.0027618043374061404, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.0007989528623168987}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_precision": 0.0007549331911152111, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.00035399594955849874}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_recall": 0.0006187329134054994, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.00028426686679997927}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rouge2_fmeasure": 0.0006679731904905657, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.00030651440643441897}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_precision": 0.002450781251382922, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.000706141664094451}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_recall": 0.002052334856141865, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.0005894719375726362}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeL_fmeasure": 0.0021823660638719926, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.000624429684753633}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_precision": 0.002441251996570648, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.0006979496508196264}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_recall": 0.002037674464122983, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.0005805762941706561}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "rougeLsum_fmeasure": 0.002169371625491619, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.0006153356367542635}, {"task_name": "gem_xsum", "prompt_name": "summarize_DOC", "bleu": 3.921409214778822e-38, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "019726f2-7140-4ab6-a18d-a5f9cc709a47", "prompt_jinja": "Summarize: {{document}} Summary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 4.8221500497593816e-32}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..961bc73c06906a9112da9c2af4b06e70c498abef --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_precision": 0.0038593481989708405, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_precision_stderr": 0.001344313928187632}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_recall": 0.0004983668868682761, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_recall_stderr": 0.00017207573376097035}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge1_fmeasure": 0.0008760153533730196, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge1_fmeasure_stderr": 0.00030137776057345624}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_precision": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_precision_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_recall": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_recall_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rouge2_fmeasure": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rouge2_fmeasure_stderr": 0.0}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_precision": 0.0038593481989708405, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_precision_stderr": 0.001344313928187632}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_recall": 0.0004983668868682761, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_recall_stderr": 0.00017207573376097035}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeL_fmeasure": 0.0008760153533730196, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeL_fmeasure_stderr": 0.00030137776057345624}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_precision": 0.0038593481989708405, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_precision_stderr": 0.001344313928187632}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_recall": 0.0004983668868682761, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_recall_stderr": 0.00017207573376097035}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "rougeLsum_fmeasure": 0.0008760153533730196, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "rougeLsum_fmeasure_stderr": 0.00030137776057345624}, {"task_name": "gem_xsum", "prompt_name": "summarize_this_DOC_summary", "bleu": 0.0, "fixed_answer_choice_list": null, "dataset_path": "GEM/xsum", "dataset_name": null, "subset": "", "prompt_id": "7d3584c5-8864-4d11-bce9-65499cdef4cb", "prompt_jinja": "Summarize this document: {{document}}\nSummary: ||| {{target}}", "prompt_original_task": true, "comment": "", "bleu_stderr": 0.0}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_piqa_Correct-the-solution_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_Correct-the-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8eda02311d44d00e7ec1316bdcc8277eaf4df76d --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_Correct-the-solution_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "Correct the solution", "bleu": 19.222358703162943, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.8602131044597715}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_precision": 0.42346003220015105, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.00747019129990177}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_recall": 0.7252777374870985, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0062431493911134965}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge1_fmeasure": 0.4681341663071791, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.006889399966892214}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_precision": 0.33777759103417093, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.007306236635276081}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_recall": 0.5822503421119269, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.007747562507008679}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rouge2_fmeasure": 0.37510898143391946, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0070493276944622475}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_precision": 0.4152406589922532, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.007494053756311643}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_recall": 0.7105950458633749, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.0064789696983791785}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeL_fmeasure": 0.4588592688788156, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0069545443057692}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_precision": 0.41801170629839823, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.007471160098907973}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_recall": 0.715045728574649, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.006401661079614166}, {"task_name": "piqa", "prompt_name": "Correct the solution", "rougeLsum_fmeasure": 0.4619665339149606, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "5f4b4645-9438-4375-9062-083130e6d04e", "prompt_jinja": "Given a goal and a wrong solution, rewrite it to give a correct solution.\nGoal: {{goal}} \nSolution: {{[sol1, sol2][1 - label]}}\nCorrected solution:\n|||\n{{[sol1, sol2][label]}}\n", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.006920692820691468}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_piqa_no-prompt-needed_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_no-prompt-needed_5.json new file mode 100644 index 0000000000000000000000000000000000000000..46ffdb5bceee48bb49abf7d1b02f7a2cde57f03f --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_piqa_no-prompt-needed_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "piqa", "prompt_name": "no prompt needed", "bleu": 0.11091303002767766, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "bleu_stderr": 0.006617399875391838}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_precision": 0.018909796704510137, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_precision_stderr": 0.0008256104259589896}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_recall": 0.2049696074838063, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_recall_stderr": 0.0038672492223946787}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge1_fmeasure": 0.030905539837009225, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge1_fmeasure_stderr": 0.0007455042183236098}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_precision": 0.0021211671047493192, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_precision_stderr": 0.00013098096332574767}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_recall": 0.02943306803313726, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_recall_stderr": 0.0018642034227759057}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rouge2_fmeasure": 0.003705973655257212, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rouge2_fmeasure_stderr": 0.0002060428783959972}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_precision": 0.017242460409055926, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_precision_stderr": 0.000753738906468401}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_recall": 0.19016057709929388, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_recall_stderr": 0.003594429536095365}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeL_fmeasure": 0.02825351806389626, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeL_fmeasure_stderr": 0.0006509776459012739}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_precision": 0.015471422362721683, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_precision_stderr": 0.0007269472856465039}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_recall": 0.17615413408762887, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_recall_stderr": 0.003490489946012578}, {"task_name": "piqa", "prompt_name": "no prompt needed", "rougeLsum_fmeasure": 0.025264397575809235, "fixed_answer_choice_list": null, "dataset_path": "piqa", "dataset_name": null, "subset": null, "prompt_id": "c8c45ef1-2ffc-43d7-8710-b98c2fc4f699", "prompt_jinja": "{{goal}}\n|||\n{{[sol1[0].lower() + sol1[1:], sol2[0].lower() + sol2[1:]][label]}}", "prompt_original_task": false, "comment": "", "rougeLsum_fmeasure_stderr": 0.0005776280374098223}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice-Question-First_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice-Question-First_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9088907275a193749eb7ee8cf49690e6a33068b5 --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice-Question-First_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc": 0.483, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.015810153729833434}, {"task_name": "sciq", "prompt_name": "Multiple Choice Question First", "acc_norm": 0.468, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "368e29fb-506d-4a4e-ac33-0af8d6e1729b", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nQ: {{question}}\n\n\nRead this paragraph and choose the correct option from the provided answers:\n\n{{support}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\n\nA:|||{{answer_choices[3]}}\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.015786868759359005}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice_5.json b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..a68cde217233b6f8a413a76120ed85268a980c9a --- /dev/null +++ b/4b284b84bc4/eval/agg.4b284b84bc4_sciq_Multiple-Choice_5.json @@ -0,0 +1 @@ +{"results": [{"task_name": "sciq", "prompt_name": "Multiple Choice", "acc": 0.508, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_stderr": 0.01581727492920901}, {"task_name": "sciq", "prompt_name": "Multiple Choice", "acc_norm": 0.516, "fixed_answer_choice_list": null, "dataset_path": "sciq", "dataset_name": null, "subset": null, "prompt_id": "63c22e8a-7029-4ce3-bd26-6ca6a1541563", "prompt_jinja": "{% set order = [[0, 1, 2, 3], [0, 1, 3, 2], [0, 2, 1, 3], [0, 2, 3, 1], [0, 3, 1, 2], [0, 3, 2, 1],\n [1, 0, 2, 3], [1, 0, 3, 2], [1, 2, 0, 3], [1, 2, 3, 0], [1, 3, 0, 2], [1, 3, 2, 0],\n [2, 1, 0, 3], [2, 1, 0, 2], [2, 0, 1, 3], [2, 0, 3, 1], [2, 3, 1, 0], [2, 3, 0, 1],\n [3, 1, 2, 0], [3, 1, 0, 2], [3, 2, 1, 0], [3, 2, 0, 1], [3, 0, 1, 2], [3, 0, 2, 1]] | choice %}\nAnswer the following question given this paragraph: \n\n{{support}}\n\n\nQ: {{question}}\n\n Choices:\n\n- {{ answer_choices[order[0]] }}\n\n- {{ answer_choices[order[1]] }}\n\n- {{ answer_choices[order[2]] }}\n\n- {{ answer_choices[order[3]] }}\n\nA:|||{{answer_choices[3]}}\n\n", "prompt_original_task": true, "comment": "", "acc_norm_stderr": 0.01581119837311488}], "config": {"model": "hf-causal", "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", "task_args": "", "num_fewshot": 5, "batch_size": 16, "device": "cuda", "use_cache": false, "limit": 3000, "bootstrap_iters": 10, "seed": 1234}} \ No newline at end of file diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..a615daf0064e532b6bcea91ec43ade402d44edff 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:960e829a78abfb11cabcd41d79f1415dc3b30711876f8168bf923b6540cbef73 +size 8815693 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..641955a3414b44440e3a450081f4fe78e1535776 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68191e5679a555dfac9ba89f2693da63ccacddb6e41cd6f0623f8c49fcc8e6d3 +size 6460954 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..7c98065e809d3cf65c841e14bbdb6e424fb57e0e 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d182df011f663602de6706b720d93a3ca17db3659939c0356c3c6474a467e923 +size 6952689 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..3e5a7081ecdb4ec8bb1b0c627fa3f026170dd3ad 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52d07aa801d422f2fb0b5bd3b0a6fd1a826a339ec8b8a1b3f9e976ea3c638b58 +size 8234864 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_4.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_4.jsonl index 23f5266e88103e849eea34014a40e2993c4cd0d1..81674f8bc78d70a213549e5c16d2eeefa888b8b0 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_4.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_4.jsonl @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f5b2f52c8fefdd22617864561163534ecea1afb5c7761eae9bda0d17cedd4e2 -size 10341611 +oid sha256:57784c2bc56b696871c4b91c30b4ed2e148c0ef98515868ee6ad1ffad5bb9b71 +size 20683222 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..cf5310315134a2f677cbd36df93cb1cf30acaaa3 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af172cf2f3ee073f6210b550dcf2ecad661d28873e23fc73ab2e616486a62266 +size 11476171 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..d0ed88ca44c19f2a2f3845edb610ebcdd1295eec 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b6ae232b6c41b7aabb7122322c18c0a068eaad4c5b7e0ded659a17deaf7df09 +size 35293465 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..89bf177bf46a5b03b6f18dc36a689e00461da702 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b22e5e1bfd42f1dcf4aab91b2488f0fe695e044ebfbafb0b8b9853416662408d +size 35439481 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..8cc78e0b4cb03e32db7a53ccccf379837b729c92 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ce5e3b002a59f674217a178637eca6dbfe013e0ddfbb915d659bb0cd18fdeb2 +size 35581651 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..69304b3e9fd2954d90254ff08a34dfed0406c653 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:746e6ac85d9378ec60e893f1c87453d064dccef38014ca2b04ef0933b74f8976 +size 34798722 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..8e5dc9f1e47b58bcbbf042561c9c5fa63d325897 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b17dd15eaeed9c95f7df0b260fed120ccbb6c9486b7df382072f5ba276b513a +size 36473227 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_heres_a_problem_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_heres_a_problem_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..f98a5ffcac12acd9a9d8f3950afbcbee0adf3925 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_heres_a_problem_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_heres_a_problem_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1385efb1ea402e5cfb112dc95636e3ecfbe32088c26c3e28302a2b16cc1e17e4 +size 6494729 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_i_am_hesitating_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_i_am_hesitating_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..dfb37c1b6079b8563f64327457d09383aa9b8801 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_i_am_hesitating_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_i_am_hesitating_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0df917744003d0d8aa97a2283338dd112a3454fa2d88d769b4aac2cd59bdcdb +size 7295312 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_multiple_choice_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_multiple_choice_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..8686d3c6606af775d7316c36d17335f59fa54fcf 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_multiple_choice_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_multiple_choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0de238f4d4d0f955ec92cf05cde2a3b2f06a97694ccdc632d9cdd54ba1cc7bc5 +size 7640167 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_pick_the_most_correct_option_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_pick_the_most_correct_option_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..e2a38c9a6f07ce2e87b186ef5283f20b3a0dfa78 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_pick_the_most_correct_option_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_arc_easy_pick_the_most_correct_option_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:350fe7cfa9e4860cbe4ced17d05aceaa2dd2a841fce472ed17224ec87a787903 +size 6288023 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_boolq_GPT-3-Style_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_GPT-3-Style_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..c59f28acb20a8008111c9567c32b770bb706ff96 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_boolq_GPT-3-Style_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_GPT-3-Style_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47af8e581da1f9faa5022f464e8fa14855f6a806c32e4c8f06fa2d17e23cd3b4 +size 13790191 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_boolq_after_reading_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_after_reading_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..b8d461d1d3609eb9114a266db6ef91853e6b9fc6 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_boolq_after_reading_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_after_reading_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5f7ebbb95ef872070cb686ca9c617aeb6d50a78d24a014b96246132ad323f57 +size 14969392 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_boolq_exercise_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_exercise_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..588647f1de0aaee05131ebdf09cc0cad15b7396a 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_boolq_exercise_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_exercise_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ebeb60fd876f2273b53ccd5419727a2869e0e71b46fd8d286b21eea57027b21 +size 15206980 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_boolq_valid_binary_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_valid_binary_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..d703a5448150c01c50a6315c1503f88089eab7f8 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_boolq_valid_binary_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_valid_binary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab92a71bed0b47dc2e5e1d588f8c22395546ff3cdf1af2f0534f03fd9f9155fb +size 13853551 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_boolq_yes_no_question_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_yes_no_question_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..d3adc46b8e7a11e1e05295591064030409a7ffc5 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_boolq_yes_no_question_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_boolq_yes_no_question_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f5ca123ca78ace033af116cdc421afad397282cd037e3f3e778b743a11162e6 +size 14580394 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_coherent_text_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_coherent_text_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..2947841982dfcb716344b9a7daba955e617ac2f1 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_coherent_text_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_coherent_text_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19e745bb7a850b40f4b326c8a92cbffabf06437e58a12a503cb2ed5225a144c8 +size 8084338 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..91c30ba95b3731feb259e9a46b3d8545c4b4f0ae 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30e04a4a3130a899932c4fee791d2fdd892c01aead41391f48231a811800ce43 +size 8026863 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..6e744bbee38ebcb2eaedffd05bcfecbbbcb7aeca 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba1ebdd9396e452317c11be1b90d0ce642205c4ff540648276f032ee8edff63e +size 9193511 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..ca3a06c58e5e9feed2ac9e6d5af09eebb619606d 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67488cc94e4f1d85b57de82158f0244ac627731649b660f5aeec70e8edc599c8 +size 9514768 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_text_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_text_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..c0b5aef83af9335a7c8e6f4d5691dcd58dceead9 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_text_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_e2e_nlg_cleaned_text_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b0c3a9b429d100d1efb4b8cf8bad642b72c8d347a4b7f507e266c3d45964bfb +size 7077015 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..8db4c9ed4a301887964ba15bec364810906f5ab2 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0c642713a2b01d40dd489f630f41f7d392d75717bfc14eeedec585bd22badf0 +size 14076798 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_tldr_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_tldr_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..29e87747047ffad1a7fbf85c5d1af86489eefafe 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_tldr_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_DOC_tldr_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15b79312f5831a9d0c4e7552de6587f6a1395e23ca79e851c8068ac602c40a14 +size 13794753 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_article_DOC_summary_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_article_DOC_summary_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..ad6d63b14429074180c7c89004aafff626224267 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_article_DOC_summary_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_article_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:151638505e0b697ea6bf8715d46fa8bd0f580e5ceb6383c744cce01b54b0c3e9 +size 13897558 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_DOC_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_DOC_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..d3c4ed5e62c923ec8b1a4232547a9ee7b66cd5bb 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_DOC_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_DOC_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0764ddaf9b8154e73167bb63a59dc87e25ece38b60c309d9ae1398575dde7900 +size 13882407 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..c7632a27243ad62fe37959290b448d0aeb1a3e6a 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d7b6284c48507eff4d189c2bba5d9692730f0848b6b7732c7c89159fb11081a +size 14018651 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_piqa_Correct-the-solution_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_Correct-the-solution_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..3ea03d3aad602407ea59ae3b6dc005e129996a11 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_piqa_Correct-the-solution_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_Correct-the-solution_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e356c6e044fd845f2b12e9eca1885bdd083b374b7a225ed346809a22ad2cffd9 +size 5746477 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_piqa_no-prompt-needed_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_no-prompt-needed_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..626efdd982e48adc4c076d3377cb102b3d513f37 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_piqa_no-prompt-needed_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_piqa_no-prompt-needed_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a33ae7a8632ef0d4e30b9bac8dbf982ff5313b3d6a82088318134984660f3d7 +size 4387027 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-Question-First_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-Question-First_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..bb04f2a8ed0fc5f86c6ee90b951cd6125b4eecac 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-Question-First_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice-Question-First_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7077bf81cb206fa9c8bb10882c196271c1b883ad9fa6d34a2211ac33f2035b33 +size 5415019 diff --git a/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice_5.jsonl b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice_5.jsonl index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..c231ac3ef99016f5c90dcbbbf5ebd3e8607612b1 100644 --- a/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice_5.jsonl +++ b/4b284b84bc4/eval/examples.4b284b84bc4_sciq_Multiple-Choice_5.jsonl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1a5db242755e5cc5d31c767d5d5223f60d4c0b824580f26d69e46480b7b58e9 +size 5220564 diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..49a1dd146daefde5987477fc9521a4bf48091091 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.7296458163665333, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03550662426728089 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.073601220000616, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001145132160087058 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.4252463565222065, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005191805176994548 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11795106654953416, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0015900535376520654 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03391186812656995, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007058000824068418 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.2132946176974872, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003991916242021659 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05471954734528363, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00103145574907618 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06738065913816185, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001053247887367618 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3825929171137152, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004448563155136455 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10768674313552495, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014533174914988178 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.06988985169584011, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001094990559251859 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.40083936944093385, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0047880465656518125 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.11184463338518444, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0015162341039388436 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f43dc63d8c93a86ece5665b4fcc769bc94b25d63 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_explicit-graph-description2_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_precision": 0.5250547971158916, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.006526333519960469 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_recall": 0.5260187209493986, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004988544987665089 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge1_fmeasure": 0.4647781103480216, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.004790884102018922 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_precision": 0.30884166571168326, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005379997789933143 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_recall": 0.3023342622914897, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004492206468375162 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rouge2_fmeasure": 0.2679305909417252, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.004202728742347106 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_precision": 0.4329316988391663, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.005911708225945943 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_recall": 0.4338587742209103, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0046217454632362116 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeL_fmeasure": 0.3810932594090092, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.004376763124965794 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_precision": 0.46578441722776126, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.006050052816419848 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_recall": 0.4662603781191038, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004711548879198356 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "rougeLsum_fmeasure": 0.4108525733234152, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004440842788632338 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "explicit-graph-description2", + "bleu": 6.569436323858852, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.19648337614350567 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..e2b4af1b2115a0caf2383e23793c1db34c0e13c8 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_implicit-graph-description_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "bleu": 1.5908083756467843, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.040459392268950733 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_precision": 0.16505778962329926, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0049314256147464765 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_recall": 0.5270727656034352, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004910714577670071 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge1_fmeasure": 0.2011175379818372, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0037849101593001905 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_precision": 0.09349989978109244, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0035183082522432544 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_recall": 0.29813686484545676, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004411286169203871 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rouge2_fmeasure": 0.11030533651474397, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0028381873312750285 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_precision": 0.1375823452242129, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.004312477970104036 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_recall": 0.4482402722752172, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004698309401256882 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeL_fmeasure": 0.16712441262992817, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0033343938914818156 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_precision": 0.1484650669263718, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.004512916698123371 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_recall": 0.4754461608779832, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004716833510812915 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "implicit-graph-description", + "rougeLsum_fmeasure": 0.180675646399236, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0035123928511381465 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..fee6adf43109e4663f035aa769fafe031950b8b8 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_non-explicit-description_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_precision": 0.14987304889742048, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00211774521942385 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_recall": 0.6456959062505755, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004344583145036674 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge1_fmeasure": 0.22905593642235608, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0026110238218651403 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_precision": 0.07238529962362035, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0013168820567141873 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_recall": 0.3424215487119374, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.004425019552858426 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rouge2_fmeasure": 0.11213685323825111, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0017667303350931108 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_precision": 0.11528840797824373, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0015945340200480952 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_recall": 0.5205446307169246, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004283561049117661 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeL_fmeasure": 0.17763538913703175, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001990586966290515 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_precision": 0.13167907204486873, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0018683868054250095 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_recall": 0.5741275519955095, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004139851361359045 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "rougeLsum_fmeasure": 0.20141307816937243, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0022952264672338745 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "non-explicit-description", + "bleu": 2.3322798661113775, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.043126880519805975 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_5.json new file mode 100644 index 0000000000000000000000000000000000000000..c3aed4e60668b340a84b15b048ad08add8150983 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-web_nlg_en_very-explicit-description_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_precision": 0.2876720140551327, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00688536330385178 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_recall": 0.6509523532377642, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004426354405109346 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge1_fmeasure": 0.31019840668068493, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.005210891426322614 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_precision": 0.16449661779441496, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.005024159389839176 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_recall": 0.35535074696278424, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0044950408874512215 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rouge2_fmeasure": 0.16968874155890248, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.003989086414493616 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_precision": 0.23331075033626017, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.006039029406866991 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_recall": 0.527937321532115, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004327075413054229 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeL_fmeasure": 0.24834177180496544, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.004589491920598505 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_precision": 0.2546911797589938, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0061880736829282735 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_recall": 0.5882477978181261, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004386879252622582 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "rougeLsum_fmeasure": 0.27536819164917997, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.004661628766530461 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "very-explicit-description", + "bleu": 3.005668095092833, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.05197715337478125 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..34a3d9e7825ffbaed115b7a21acd1b64bf59adb1 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_article_summary_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_precision": 0.008789451068318839, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0010004545383001592 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_recall": 0.013328625710624398, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0013275176482542086 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge1_fmeasure": 0.008658735255805204, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.000839492718669307 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_precision": 0.002082867589754122, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0002851502666692692 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_recall": 0.003609805272035285, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0005284577552828623 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rouge2_fmeasure": 0.002204071789139233, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0002691212763650804 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_precision": 0.006775055782148343, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0008305942690146914 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_recall": 0.010253688567910776, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0010420446053404594 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeL_fmeasure": 0.00646696721189212, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0006157556419200244 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_precision": 0.008165101675646422, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0009438283367636781 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_recall": 0.01246657920584301, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.001255613374811832 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "rougeLsum_fmeasure": 0.008010216061680219, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007753360463925913 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "article_summary_en", + "bleu": 3.6190370766184897e-07, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 9.277437199555188e-07 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b620cd88c22e8149c76d25ef858b1b188e3e324b --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_rephrase_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_precision": 0.0071721084411027645, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0008062431739861646 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_recall": 0.011000858713299045, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0011849996643443859 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge1_fmeasure": 0.007360373049855638, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0007652704034617401 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_precision": 0.0018535699123667382, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.00029377234273353614 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_recall": 0.003118744309300992, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0005028087678605357 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rouge2_fmeasure": 0.0018881229545083928, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00025433510052724046 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_precision": 0.00576293174363509, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0006366250522678441 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_recall": 0.009081542967345346, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.000988689403008336 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeL_fmeasure": 0.005940183094440743, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0006047123448652783 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_precision": 0.006752496827809557, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0007738840304642114 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_recall": 0.010255856976059748, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0011143260905679664 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "rougeLsum_fmeasure": 0.006849433472012855, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007165856486605063 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "rephrase_en", + "bleu": 4.4756569462780825e-08, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 9.984205007691265e-08 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5558b0fdd534370f20ffd04a7a0ff8c4761c8387 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_summarize_above_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_precision": 0.00687892058663268, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0008286135496574055 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_recall": 0.01056621052806095, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0011884533368688181 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge1_fmeasure": 0.007071687067932176, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0007575078749040233 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_precision": 0.0017548186212423718, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0002898932260307108 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_recall": 0.002967002452893765, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0005390509164924998 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rouge2_fmeasure": 0.0018174950889073276, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00026787948234223484 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_precision": 0.005630373669208392, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0006824790651957477 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_recall": 0.008937401502586562, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.001025114612153835 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeL_fmeasure": 0.005805102612750907, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0006082156330417145 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_precision": 0.006418174596014823, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0007845442619851386 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_recall": 0.009839905297862224, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0011267718061560392 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "rougeLsum_fmeasure": 0.006558475135159909, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0007058336657415859 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "summarize_above_en", + "bleu": 5.064264014418453e-09, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 9.113572869828186e-09 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6eb03e7cdec54e00388c44e391a830838dd4cd89 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.009018320417866373, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0009515455079029407 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.013477068819464475, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0013177953296362858 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.009010391314895649, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0008608910415638986 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.002361799926713296, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0003370630501668582 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0034894806830281814, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0004812458467814835 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.002248573836077081, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00028843241279034187 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.007020816303734581, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0007477069794008652 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.010607117596650792, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0010619737593296397 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.006956210385184894, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0006626528079735124 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.008615795202441968, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0009202005223985237 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.012858462170759524, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0012682727385585405 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.008558581769365812, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0008212318653585135 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.0169232766561087e-06, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 2.0148976704692234e-06 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3bc22599036044d79d8409405c1061c0d6cdc1fb --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_GEM-wiki_lingua_en_write_abstract_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_precision": 0.0023454669535560293, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.00041680548587382345 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_recall": 0.004318536854475181, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0007123123450106542 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge1_fmeasure": 0.0027634263052122906, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.00045380600644524696 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_precision": 0.00046366175256993565, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0001167507097118896 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_recall": 0.0009755153463963649, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.00027677412176346257 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rouge2_fmeasure": 0.0005725032484038239, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00014153750802328502 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_precision": 0.0017074942427303989, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.000279882532535111 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_recall": 0.0033900936511171978, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0005689158044979831 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeL_fmeasure": 0.0020630101397975317, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.00032374717159122884 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_precision": 0.0021646208543707438, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0003758540347826323 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_recall": 0.004084932915136834, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.000675458669366651 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "rougeLsum_fmeasure": 0.002577495777518565, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0004184913516849742 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "write_abstract_en", + "bleu": 3.5681280810377884e-14, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 1.3433256826316355e-13 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_heres_a_problem_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_heres_a_problem_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8497e72bc794445a5faf09f7b55f443da050becf --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_heres_a_problem_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc": 0.24873737373737373, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008870224411653797 + }, + { + "task_name": "arc_easy", + "prompt_name": "heres_a_problem", + "acc_norm": 0.24873737373737373, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008870224411653797 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_i_am_hesitating_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_i_am_hesitating_5.json new file mode 100644 index 0000000000000000000000000000000000000000..209aeebfa81dfa472e6d0151dd1589fd07266c9f --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_i_am_hesitating_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc": 0.31902356902356904, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009564133249441088 + }, + { + "task_name": "arc_easy", + "prompt_name": "i_am_hesitating", + "acc_norm": 0.3042929292929293, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009441202922359183 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_multiple_choice_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_multiple_choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..11da1f0ea39f9cc11a775804ec1e129522ac56b0 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_multiple_choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc": 0.3312289562289562, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.009657641311350914 + }, + { + "task_name": "arc_easy", + "prompt_name": "multiple_choice", + "acc_norm": 0.3312289562289562, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.009657641311350914 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_pick_the_most_correct_option_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_pick_the_most_correct_option_5.json new file mode 100644 index 0000000000000000000000000000000000000000..93371995dc4b68e5d51fd9a4d14e559597aa0618 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_arc_easy_pick_the_most_correct_option_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc": 0.23779461279461278, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_stderr": 0.008735850753507992 + }, + { + "task_name": "arc_easy", + "prompt_name": "pick_the_most_correct_option", + "acc_norm": 0.23779461279461278, + "dataset_path": "ai2_arc", + "dataset_name": "ARC-Easy", + "subset": null, + "acc_norm_stderr": 0.008735850753507992 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_boolq_GPT-3-Style_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_GPT-3-Style_5.json new file mode 100644 index 0000000000000000000000000000000000000000..2d389a3c19c26a8040642ac0be5fb9cdc7d016a3 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_GPT-3-Style_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc": 0.5743333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009028770205053254 + }, + { + "task_name": "boolq", + "prompt_name": "GPT-3 Style", + "acc_norm": 0.6243333333333333, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008843442555522137 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_boolq_after_reading_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_after_reading_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8fef0ca1bf4a097c5985ac951a2042bd774deed1 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_after_reading_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc": 0.618, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008872329987840877 + }, + { + "task_name": "boolq", + "prompt_name": "after_reading", + "acc_norm": 0.6156666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008882569490543052 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_boolq_exercise_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_exercise_5.json new file mode 100644 index 0000000000000000000000000000000000000000..b2fb6585f6621761e90799e5820b600d3139da0a --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_exercise_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc": 0.5746666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.009027853030468718 + }, + { + "task_name": "boolq", + "prompt_name": "exercise", + "acc_norm": 0.5613333333333334, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009061278956794627 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_boolq_valid_binary_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_valid_binary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1d622b7b1d07de3bf16784c864727348e67753d6 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_valid_binary_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc": 0.5936666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008968593186211788 + }, + { + "task_name": "boolq", + "prompt_name": "valid_binary", + "acc_norm": 0.5826666666666667, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.009004578551254038 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_boolq_yes_no_question_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_yes_no_question_5.json new file mode 100644 index 0000000000000000000000000000000000000000..ea9d9990b4a6854c181cb769c00f6b772ce85ef7 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_boolq_yes_no_question_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc": 0.592, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_stderr": 0.008974343780026196 + }, + { + "task_name": "boolq", + "prompt_name": "yes_no_question", + "acc_norm": 0.615, + "dataset_path": "super_glue", + "dataset_name": "boolq", + "subset": null, + "acc_norm_stderr": 0.008885455368505631 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_coherent_text_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_coherent_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..98a4c8a4ceb9d178e1634d12a0d62d87f47334b4 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_coherent_text_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "bleu": 0.19184679423806675, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.022037762977008487 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_precision": 0.07085819095804324, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0008739378619952876 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_recall": 0.12081072716164527, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.001498467805704836 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge1_fmeasure": 0.08571141422567272, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0010070427476495602 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_precision": 0.01036855856104598, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0003271837961214723 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_recall": 0.019056061372484043, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0006137038599382484 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rouge2_fmeasure": 0.012899374564684497, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0003990669897618305 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_precision": 0.06151421292491904, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0007556149506381246 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_recall": 0.103936628362695, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0012690278156809636 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeL_fmeasure": 0.07406114989234146, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0008598918508550648 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_precision": 0.06380081678271861, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0007875802399240677 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_recall": 0.10747600591297043, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0013011638723965141 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "coherent_text", + "rougeLsum_fmeasure": 0.07676935478960281, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0008934049270425214 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_5.json new file mode 100644 index 0000000000000000000000000000000000000000..f948f6eed2c4ea594366b6e59bd5528a530dcbf7 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_create_text_for_me_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "bleu": 6.587306536268369, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.10692008856583401 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_precision": 0.29207881892182114, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0016742571355490715 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_recall": 0.5488220992288727, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0026711227542546754 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge1_fmeasure": 0.37337778012529416, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0017669311485306457 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_precision": 0.1312507338877233, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0011753490960191555 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_recall": 0.2551517817425887, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002309521424602609 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rouge2_fmeasure": 0.16931797936430898, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0014242506039166992 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_precision": 0.20319280490536853, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0012359159771650986 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_recall": 0.38732665353245815, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.002389449596791391 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeL_fmeasure": 0.26091928752281107, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0014097244239106089 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_precision": 0.2447499452943357, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0015559671878429087 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_recall": 0.45974120843364774, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002585367746227488 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "create_text_for_me", + "rougeLsum_fmeasure": 0.31278487914620323, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017048680784067954 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..54f8b5510b0345c5f880879a9917a113bf20b7ea --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_generate_gramatically_correct_text_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "bleu": 8.860001080746043, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.08032441766214476 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_precision": 0.2282401668267717, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0038058594996834105 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_recall": 0.31244080722334644, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.005069779908018149 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge1_fmeasure": 0.2511509133353019, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.003958510148281916 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_precision": 0.0975927722890742, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0018020325335006712 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_recall": 0.1424180953282649, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002666484802316808 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rouge2_fmeasure": 0.11228734342841896, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0020300113350908327 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_precision": 0.15912467721798776, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002881273067329202 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_recall": 0.21648776523904875, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003670985113284094 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeL_fmeasure": 0.17297752185577242, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.002800984657281145 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_precision": 0.19047477970761165, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.003328420385981871 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_recall": 0.2591907447398561, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.004321161736207405 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_gramatically_correct_text", + "rougeLsum_fmeasure": 0.2082452063786695, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.003367977990331253 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5977eb6846791476ee3c98acc5ecfae4eaac5b22 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 11.256627548432805, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.14543377736086152 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.4215376032960452, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0024025181710894523 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.48727427299265463, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002867231297396204 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.43467772454619635, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0020177075108414894 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.19324475626688103, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0018664258727883866 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.22689708066119227, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022677822733911217 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.19984409683367668, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0017840474360257142 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.3043439759839776, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002001573450675394 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3536697494352659, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024814074442155673 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.31426345805670414, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0017762603380325856 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.35399958676397025, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0022946023988637647 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.40981680597146897, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027603392115545873 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.36514922201465566, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020264847795391197 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_text_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_text_5.json new file mode 100644 index 0000000000000000000000000000000000000000..8fd9f0ec4b0582f2f93c91abeebc2f38b7d96235 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_e2e_nlg_cleaned_text_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "bleu": 6.533815973282043, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.08061056917302369 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_precision": 0.34611363968855946, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0019385334843998432 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_recall": 0.5539875876841414, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0026385933391894155 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge1_fmeasure": 0.4167090003485881, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019025852377845915 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_precision": 0.15562713297860187, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0013970662715098196 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_recall": 0.2563150719822735, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002317419034144347 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rouge2_fmeasure": 0.18892502275614784, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0015944430544224384 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_precision": 0.2477131986505616, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0014320235517345649 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_recall": 0.40260417204474347, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023919020673671696 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeL_fmeasure": 0.29982493987141584, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.00153168169621043 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_precision": 0.28865692255613695, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0018086881267386193 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_recall": 0.4624261352543478, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0025817101491570557 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "text", + "rougeLsum_fmeasure": 0.34759708152384694, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018602576393587102 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json new file mode 100644 index 0000000000000000000000000000000000000000..9b0f0a33a76160d960d2f56f24874120868c8f08 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_DOC_boils_down_to_simple_idea_that_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_precision": 0.003430531732418525, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0017130559457731909 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_recall": 0.0001419934412938108, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 7.313843738100783e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge1_fmeasure": 0.00027204189940038993, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00013974420457799956 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rouge2_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_precision": 0.003430531732418525, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0017130559457731909 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_recall": 0.0001419934412938108, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 7.313843738100783e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeL_fmeasure": 0.00027204189940038993, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.00013974420457799956 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_precision": 0.003430531732418525, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0017130559457731909 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_recall": 0.0001419934412938108, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 7.313843738100783e-05 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "rougeLsum_fmeasure": 0.00027204189940038993, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00013974420457799956 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_boils_down_to_simple_idea_that", + "bleu": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_DOC_tldr_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_DOC_tldr_5.json new file mode 100644 index 0000000000000000000000000000000000000000..99b680e245fb34e2ceea3d1e3fb2df1006192dc5 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_DOC_tldr_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_precision": 0.0030992000829807302, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0008839473593834359 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_recall": 0.0020573512018858386, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0005662216505729439 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge1_fmeasure": 0.002434311069482336, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0006788393641812058 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_precision": 0.0004947882306372872, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00024179256073647307 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_recall": 0.00031093671858319327, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00015037977152186958 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rouge2_fmeasure": 0.0003814915603218597, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00018509478605359118 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_precision": 0.0021878495947711967, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.000617526220895422 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_recall": 0.0014675923007687014, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0003957011729235515 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeL_fmeasure": 0.0017256830545079964, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.00047284780236469653 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_precision": 0.002542279201420559, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0007188937748958946 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_recall": 0.0017189600670503991, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0004715986190800611 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "rougeLsum_fmeasure": 0.002018185168567718, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0005599180971250053 + }, + { + "task_name": "gem_xsum", + "prompt_name": "DOC_tldr", + "bleu": 6.210516834436456e-43, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 9.857697644049837e-36 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_article_DOC_summary_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..228cbb4b58edba11902b2b79d8b4585b334bc8b4 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.00321897943071785, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0009979849053940774 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.002638556229987107, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0008069095304213252 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.002850511566655562, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0008725813491317338 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0008682722133420189, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00042068273727211144 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0006912233160845813, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00036262248705394215 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0007583670569643345, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0003812254884717486 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0024287372106315966, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0007289443610313556 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.002057235735729571, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.000619733025851515 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.002188808353154973, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.000656190328926511 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.002734584249275528, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0008545960784962966 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.002247308506447503, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0006766768468342992 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0024206446753818178, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0007355689947184377 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.531796041787971e-36, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 1.3347176862997463e-30 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_summarize_DOC_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_summarize_DOC_5.json new file mode 100644 index 0000000000000000000000000000000000000000..73e53726af9f7c00c4930e00e39249d58a020fad --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_summarize_DOC_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_precision": 0.0031151406877938798, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0009113722637675773 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_recall": 0.0025701774504185604, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0007379642066523732 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge1_fmeasure": 0.0027618043374061404, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0007989528623168987 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_precision": 0.0007549331911152111, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00035399594955849874 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_recall": 0.0006187329134054994, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00028426686679997927 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rouge2_fmeasure": 0.0006679731904905657, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.00030651440643441897 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_precision": 0.002450781251382922, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.000706141664094451 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_recall": 0.002052334856141865, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0005894719375726362 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeL_fmeasure": 0.0021823660638719926, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.000624429684753633 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_precision": 0.002441251996570648, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0006979496508196264 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_recall": 0.002037674464122983, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0005805762941706561 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "rougeLsum_fmeasure": 0.002169371625491619, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0006153356367542635 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_DOC", + "bleu": 3.921409214778822e-38, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 4.8221500497593816e-32 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..1ef5763cbc924e7e6e0cb177ceb07afb0356cdb1 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_gem_xsum_summarize_this_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_precision": 0.0038593481989708405, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.001344313928187632 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_recall": 0.0004983668868682761, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00017207573376097035 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge1_fmeasure": 0.0008760153533730196, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.00030137776057345624 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_precision": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_recall": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rouge2_fmeasure": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_precision": 0.0038593481989708405, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001344313928187632 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_recall": 0.0004983668868682761, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.00017207573376097035 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeL_fmeasure": 0.0008760153533730196, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.00030137776057345624 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_precision": 0.0038593481989708405, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.001344313928187632 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_recall": 0.0004983668868682761, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.00017207573376097035 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "rougeLsum_fmeasure": 0.0008760153533730196, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.00030137776057345624 + }, + { + "task_name": "gem_xsum", + "prompt_name": "summarize_this_DOC_summary", + "bleu": 0.0, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.0 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_piqa_Correct-the-solution_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_Correct-the-solution_5.json new file mode 100644 index 0000000000000000000000000000000000000000..d56a2a5c4097db83e07b806027bc4f3cba7cd170 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_Correct-the-solution_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "bleu": 19.222358703162943, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.8602131044597715 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_precision": 0.42346003220015105, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.00747019129990177 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_recall": 0.7252777374870985, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0062431493911134965 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge1_fmeasure": 0.4681341663071791, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.006889399966892214 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_precision": 0.33777759103417093, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.007306236635276081 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_recall": 0.5822503421119269, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.007747562507008679 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rouge2_fmeasure": 0.37510898143391946, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0070493276944622475 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_precision": 0.4152406589922532, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.007494053756311643 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_recall": 0.7105950458633749, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0064789696983791785 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeL_fmeasure": 0.4588592688788156, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0069545443057692 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_precision": 0.41801170629839823, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.007471160098907973 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_recall": 0.715045728574649, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.006401661079614166 + }, + { + "task_name": "piqa", + "prompt_name": "Correct the solution", + "rougeLsum_fmeasure": 0.4619665339149606, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.006920692820691468 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_piqa_no-prompt-needed_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_no-prompt-needed_5.json new file mode 100644 index 0000000000000000000000000000000000000000..482f5f86f651b0f4131c4e150e19d6ad0e7e6862 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_piqa_no-prompt-needed_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "bleu": 0.11091303002767766, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.006617399875391838 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_precision": 0.018909796704510137, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0008256104259589896 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_recall": 0.2049696074838063, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0038672492223946787 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge1_fmeasure": 0.030905539837009225, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0007455042183236098 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_precision": 0.0021211671047493192, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.00013098096332574767 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_recall": 0.02943306803313726, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0018642034227759057 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rouge2_fmeasure": 0.003705973655257212, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0002060428783959972 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_precision": 0.017242460409055926, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.000753738906468401 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_recall": 0.19016057709929388, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.003594429536095365 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeL_fmeasure": 0.02825351806389626, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0006509776459012739 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_precision": 0.015471422362721683, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0007269472856465039 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_recall": 0.17615413408762887, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.003490489946012578 + }, + { + "task_name": "piqa", + "prompt_name": "no prompt needed", + "rougeLsum_fmeasure": 0.025264397575809235, + "dataset_path": "piqa", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0005776280374098223 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice-Question-First_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice-Question-First_5.json new file mode 100644 index 0000000000000000000000000000000000000000..055fd0f62edcefea69f059f2b7900461857bb086 --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice-Question-First_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc": 0.483, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.015810153729833434 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice Question First", + "acc_norm": 0.468, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.015786868759359005 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice_5.json b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice_5.json new file mode 100644 index 0000000000000000000000000000000000000000..3d474618479147c4c627ac47427e760f62ec8caf --- /dev/null +++ b/4b284b84bc4/eval/slim.4b284b84bc4_sciq_Multiple-Choice_5.json @@ -0,0 +1,34 @@ +{ + "results": [ + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc": 0.508, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_stderr": 0.01581727492920901 + }, + { + "task_name": "sciq", + "prompt_name": "Multiple Choice", + "acc_norm": 0.516, + "dataset_path": "sciq", + "dataset_name": null, + "subset": null, + "acc_norm_stderr": 0.01581119837311488 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/4b284b84bc4_0.json b/4b284b84bc4/evaluation/4b284b84bc4_0.json new file mode 100644 index 0000000000000000000000000000000000000000..60887b36bbef7b9e621d614c73001ba74c4ac203 --- /dev/null +++ b/4b284b84bc4/evaluation/4b284b84bc4_0.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.328, + "acc_stderr": 0.014853842487270334 + }, + "anli_r2": { + "acc": 0.337, + "acc_stderr": 0.014955087918653605 + }, + "anli_r3": { + "acc": 0.32416666666666666, + "acc_stderr": 0.013517438120881624 + }, + "cb": { + "acc": 0.26785714285714285, + "acc_stderr": 0.05971290310957636, + "f1": 0.18656056587091072 + }, + "copa": { + "acc": 0.76, + "acc_stderr": 0.04292346959909283 + }, + "hellaswag": { + "acc": 0.4547898824935272, + "acc_stderr": 0.004969341773423513, + "acc_norm": 0.5937064329814777, + "acc_norm_stderr": 0.004901368629533419 + }, + "rte": { + "acc": 0.5595667870036101, + "acc_stderr": 0.029882123363118726 + }, + "winogrande": { + "acc": 0.5769534333070244, + "acc_stderr": 0.01388505535905647 + }, + "storycloze_2016": { + "acc": 0.694815606627472, + "acc_stderr": 0.010648664383985661 + }, + "boolq": { + "acc": 0.6256880733944954, + "acc_stderr": 0.00846424665644323 + }, + "arc_easy": { + "acc": 0.40614478114478114, + "acc_stderr": 0.010077409815364048, + "acc_norm": 0.3766835016835017, + "acc_norm_stderr": 0.009942848077476172 + }, + "arc_challenge": { + "acc": 0.20648464163822525, + "acc_stderr": 0.011828865619002316, + "acc_norm": 0.2551194539249147, + "acc_norm_stderr": 0.012739038695202109 + }, + "sciq": { + "acc": 0.775, + "acc_stderr": 0.013211720158614756, + "acc_norm": 0.709, + "acc_norm_stderr": 0.014370995982377933 + }, + "piqa": { + "acc": 0.6561479869423286, + "acc_stderr": 0.011082356277961393, + "acc_norm": 0.6528835690968444, + "acc_norm_stderr": 0.011107104993128086 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/4b284b84bc4_0_lm-eval_global_step80108_2023-01-30-11-26-40_0shots_backup.json b/4b284b84bc4/evaluation/4b284b84bc4_0_lm-eval_global_step80108_2023-01-30-11-26-40_0shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..60887b36bbef7b9e621d614c73001ba74c4ac203 --- /dev/null +++ b/4b284b84bc4/evaluation/4b284b84bc4_0_lm-eval_global_step80108_2023-01-30-11-26-40_0shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.328, + "acc_stderr": 0.014853842487270334 + }, + "anli_r2": { + "acc": 0.337, + "acc_stderr": 0.014955087918653605 + }, + "anli_r3": { + "acc": 0.32416666666666666, + "acc_stderr": 0.013517438120881624 + }, + "cb": { + "acc": 0.26785714285714285, + "acc_stderr": 0.05971290310957636, + "f1": 0.18656056587091072 + }, + "copa": { + "acc": 0.76, + "acc_stderr": 0.04292346959909283 + }, + "hellaswag": { + "acc": 0.4547898824935272, + "acc_stderr": 0.004969341773423513, + "acc_norm": 0.5937064329814777, + "acc_norm_stderr": 0.004901368629533419 + }, + "rte": { + "acc": 0.5595667870036101, + "acc_stderr": 0.029882123363118726 + }, + "winogrande": { + "acc": 0.5769534333070244, + "acc_stderr": 0.01388505535905647 + }, + "storycloze_2016": { + "acc": 0.694815606627472, + "acc_stderr": 0.010648664383985661 + }, + "boolq": { + "acc": 0.6256880733944954, + "acc_stderr": 0.00846424665644323 + }, + "arc_easy": { + "acc": 0.40614478114478114, + "acc_stderr": 0.010077409815364048, + "acc_norm": 0.3766835016835017, + "acc_norm_stderr": 0.009942848077476172 + }, + "arc_challenge": { + "acc": 0.20648464163822525, + "acc_stderr": 0.011828865619002316, + "acc_norm": 0.2551194539249147, + "acc_norm_stderr": 0.012739038695202109 + }, + "sciq": { + "acc": 0.775, + "acc_stderr": 0.013211720158614756, + "acc_norm": 0.709, + "acc_norm_stderr": 0.014370995982377933 + }, + "piqa": { + "acc": 0.6561479869423286, + "acc_stderr": 0.011082356277961393, + "acc_norm": 0.6528835690968444, + "acc_norm_stderr": 0.011107104993128086 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/4b284b84bc4_1.json b/4b284b84bc4/evaluation/4b284b84bc4_1.json new file mode 100644 index 0000000000000000000000000000000000000000..e9793bbf60e94c24a7522330ca0203189bfa0f7f --- /dev/null +++ b/4b284b84bc4/evaluation/4b284b84bc4_1.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.303, + "acc_stderr": 0.014539683710535264 + }, + "anli_r2": { + "acc": 0.312, + "acc_stderr": 0.01465847437050901 + }, + "anli_r3": { + "acc": 0.3491666666666667, + "acc_stderr": 0.013767075395077247 + }, + "cb": { + "acc": 0.30357142857142855, + "acc_stderr": 0.06199938655510753, + "f1": 0.2927120669056153 + }, + "copa": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256 + }, + "hellaswag": { + "acc": 0.4506074487153953, + "acc_stderr": 0.0049653753416431376, + "acc_norm": 0.5834495120493925, + "acc_norm_stderr": 0.004919794704673269 + }, + "rte": { + "acc": 0.5342960288808665, + "acc_stderr": 0.030025579819366422 + }, + "winogrande": { + "acc": 0.6077348066298343, + "acc_stderr": 0.013722400462000883 + }, + "storycloze_2016": { + "acc": 0.6996258685195083, + "acc_stderr": 0.010600915927985033 + }, + "boolq": { + "acc": 0.6134556574923548, + "acc_stderr": 0.008516943934341973 + }, + "arc_easy": { + "acc": 0.5231481481481481, + "acc_stderr": 0.010248782484554473, + "acc_norm": 0.4819023569023569, + "acc_norm_stderr": 0.010253060653479177 + }, + "arc_challenge": { + "acc": 0.23122866894197952, + "acc_stderr": 0.012320858834772273, + "acc_norm": 0.2619453924914676, + "acc_norm_stderr": 0.012849054826858115 + }, + "sciq": { + "acc": 0.88, + "acc_stderr": 0.010281328012747391, + "acc_norm": 0.863, + "acc_norm_stderr": 0.010878848714333327 + }, + "piqa": { + "acc": 0.6887921653971708, + "acc_stderr": 0.010802263878045844, + "acc_norm": 0.6866158868335147, + "acc_norm_stderr": 0.010822829929195489 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/4b284b84bc4_1_lm-eval_global_step80108_2023-01-30-11-26-40_1shots_backup.json b/4b284b84bc4/evaluation/4b284b84bc4_1_lm-eval_global_step80108_2023-01-30-11-26-40_1shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..e9793bbf60e94c24a7522330ca0203189bfa0f7f --- /dev/null +++ b/4b284b84bc4/evaluation/4b284b84bc4_1_lm-eval_global_step80108_2023-01-30-11-26-40_1shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.303, + "acc_stderr": 0.014539683710535264 + }, + "anli_r2": { + "acc": 0.312, + "acc_stderr": 0.01465847437050901 + }, + "anli_r3": { + "acc": 0.3491666666666667, + "acc_stderr": 0.013767075395077247 + }, + "cb": { + "acc": 0.30357142857142855, + "acc_stderr": 0.06199938655510753, + "f1": 0.2927120669056153 + }, + "copa": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256 + }, + "hellaswag": { + "acc": 0.4506074487153953, + "acc_stderr": 0.0049653753416431376, + "acc_norm": 0.5834495120493925, + "acc_norm_stderr": 0.004919794704673269 + }, + "rte": { + "acc": 0.5342960288808665, + "acc_stderr": 0.030025579819366422 + }, + "winogrande": { + "acc": 0.6077348066298343, + "acc_stderr": 0.013722400462000883 + }, + "storycloze_2016": { + "acc": 0.6996258685195083, + "acc_stderr": 0.010600915927985033 + }, + "boolq": { + "acc": 0.6134556574923548, + "acc_stderr": 0.008516943934341973 + }, + "arc_easy": { + "acc": 0.5231481481481481, + "acc_stderr": 0.010248782484554473, + "acc_norm": 0.4819023569023569, + "acc_norm_stderr": 0.010253060653479177 + }, + "arc_challenge": { + "acc": 0.23122866894197952, + "acc_stderr": 0.012320858834772273, + "acc_norm": 0.2619453924914676, + "acc_norm_stderr": 0.012849054826858115 + }, + "sciq": { + "acc": 0.88, + "acc_stderr": 0.010281328012747391, + "acc_norm": 0.863, + "acc_norm_stderr": 0.010878848714333327 + }, + "piqa": { + "acc": 0.6887921653971708, + "acc_stderr": 0.010802263878045844, + "acc_norm": 0.6866158868335147, + "acc_norm_stderr": 0.010822829929195489 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/4b284b84bc4_2.json b/4b284b84bc4/evaluation/4b284b84bc4_2.json new file mode 100644 index 0000000000000000000000000000000000000000..de027da955b41777a653f058d2a2f0aa5b6d47b6 --- /dev/null +++ b/4b284b84bc4/evaluation/4b284b84bc4_2.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.315, + "acc_stderr": 0.014696631960792498 + }, + "anli_r2": { + "acc": 0.341, + "acc_stderr": 0.014998131348402702 + }, + "anli_r3": { + "acc": 0.34, + "acc_stderr": 0.0136804957257678 + }, + "cb": { + "acc": 0.14285714285714285, + "acc_stderr": 0.04718416136255829, + "f1": 0.143010752688172 + }, + "copa": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256 + }, + "hellaswag": { + "acc": 0.4473212507468632, + "acc_stderr": 0.004962010338226347, + "acc_norm": 0.5848436566421031, + "acc_norm_stderr": 0.0049174193677660296 + }, + "rte": { + "acc": 0.4729241877256318, + "acc_stderr": 0.0300523034631437 + }, + "winogrande": { + "acc": 0.601420678768745, + "acc_stderr": 0.013760357176873838 + }, + "storycloze_2016": { + "acc": 0.7156600748262961, + "acc_stderr": 0.01043161412866526 + }, + "boolq": { + "acc": 0.6119266055045871, + "acc_stderr": 0.008523130584760851 + }, + "arc_easy": { + "acc": 0.5593434343434344, + "acc_stderr": 0.010187264635711983, + "acc_norm": 0.5298821548821548, + "acc_norm_stderr": 0.010241444322886432 + }, + "arc_challenge": { + "acc": 0.2636518771331058, + "acc_stderr": 0.01287592915129705, + "acc_norm": 0.2858361774744027, + "acc_norm_stderr": 0.013203196088537369 + }, + "sciq": { + "acc": 0.906, + "acc_stderr": 0.009233052000787738, + "acc_norm": 0.902, + "acc_norm_stderr": 0.009406619184621226 + }, + "piqa": { + "acc": 0.7089227421109902, + "acc_stderr": 0.010598612490942586, + "acc_norm": 0.7143634385201306, + "acc_norm_stderr": 0.010539303948661916 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/4b284b84bc4_2_lm-eval_global_step80108_2023-01-30-11-26-40_2shots_backup.json b/4b284b84bc4/evaluation/4b284b84bc4_2_lm-eval_global_step80108_2023-01-30-11-26-40_2shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..de027da955b41777a653f058d2a2f0aa5b6d47b6 --- /dev/null +++ b/4b284b84bc4/evaluation/4b284b84bc4_2_lm-eval_global_step80108_2023-01-30-11-26-40_2shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.315, + "acc_stderr": 0.014696631960792498 + }, + "anli_r2": { + "acc": 0.341, + "acc_stderr": 0.014998131348402702 + }, + "anli_r3": { + "acc": 0.34, + "acc_stderr": 0.0136804957257678 + }, + "cb": { + "acc": 0.14285714285714285, + "acc_stderr": 0.04718416136255829, + "f1": 0.143010752688172 + }, + "copa": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256 + }, + "hellaswag": { + "acc": 0.4473212507468632, + "acc_stderr": 0.004962010338226347, + "acc_norm": 0.5848436566421031, + "acc_norm_stderr": 0.0049174193677660296 + }, + "rte": { + "acc": 0.4729241877256318, + "acc_stderr": 0.0300523034631437 + }, + "winogrande": { + "acc": 0.601420678768745, + "acc_stderr": 0.013760357176873838 + }, + "storycloze_2016": { + "acc": 0.7156600748262961, + "acc_stderr": 0.01043161412866526 + }, + "boolq": { + "acc": 0.6119266055045871, + "acc_stderr": 0.008523130584760851 + }, + "arc_easy": { + "acc": 0.5593434343434344, + "acc_stderr": 0.010187264635711983, + "acc_norm": 0.5298821548821548, + "acc_norm_stderr": 0.010241444322886432 + }, + "arc_challenge": { + "acc": 0.2636518771331058, + "acc_stderr": 0.01287592915129705, + "acc_norm": 0.2858361774744027, + "acc_norm_stderr": 0.013203196088537369 + }, + "sciq": { + "acc": 0.906, + "acc_stderr": 0.009233052000787738, + "acc_norm": 0.902, + "acc_norm_stderr": 0.009406619184621226 + }, + "piqa": { + "acc": 0.7089227421109902, + "acc_stderr": 0.010598612490942586, + "acc_norm": 0.7143634385201306, + "acc_norm_stderr": 0.010539303948661916 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/4b284b84bc4_3.json b/4b284b84bc4/evaluation/4b284b84bc4_3.json new file mode 100644 index 0000000000000000000000000000000000000000..2854b842569f3bee6bebfafae191805274a1c422 --- /dev/null +++ b/4b284b84bc4/evaluation/4b284b84bc4_3.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.338, + "acc_stderr": 0.014965960710224489 + }, + "anli_r2": { + "acc": 0.332, + "acc_stderr": 0.014899597242811476 + }, + "anli_r3": { + "acc": 0.3325, + "acc_stderr": 0.013605417345710526 + }, + "cb": { + "acc": 0.08928571428571429, + "acc_stderr": 0.038450387280282494, + "f1": 0.0871517027863777 + }, + "copa": { + "acc": 0.83, + "acc_stderr": 0.03775251680686371 + }, + "hellaswag": { + "acc": 0.4525990838478391, + "acc_stderr": 0.0049673082544257514, + "acc_norm": 0.5948018323043218, + "acc_norm_stderr": 0.004899270310557971 + }, + "rte": { + "acc": 0.49458483754512633, + "acc_stderr": 0.03009469812323996 + }, + "winogrande": { + "acc": 0.585635359116022, + "acc_stderr": 0.01384484623226856 + }, + "storycloze_2016": { + "acc": 0.7135221806520577, + "acc_stderr": 0.01045510591863303 + }, + "boolq": { + "acc": 0.6079510703363914, + "acc_stderr": 0.008538802914911992 + }, + "arc_easy": { + "acc": 0.5765993265993266, + "acc_stderr": 0.010138671005289047, + "acc_norm": 0.5585016835016835, + "acc_norm_stderr": 0.010189314382749929 + }, + "arc_challenge": { + "acc": 0.27986348122866894, + "acc_stderr": 0.01311904089772592, + "acc_norm": 0.29180887372013653, + "acc_norm_stderr": 0.013284525292403506 + }, + "sciq": { + "acc": 0.908, + "acc_stderr": 0.009144376393151086, + "acc_norm": 0.906, + "acc_norm_stderr": 0.009233052000787738 + }, + "piqa": { + "acc": 0.7257889009793254, + "acc_stderr": 0.010408618664933382, + "acc_norm": 0.7334058759521219, + "acc_norm_stderr": 0.010316749863541365 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/4b284b84bc4_3_lm-eval_global_step80108_2023-01-30-11-26-40_3shots_backup.json b/4b284b84bc4/evaluation/4b284b84bc4_3_lm-eval_global_step80108_2023-01-30-11-26-40_3shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..2854b842569f3bee6bebfafae191805274a1c422 --- /dev/null +++ b/4b284b84bc4/evaluation/4b284b84bc4_3_lm-eval_global_step80108_2023-01-30-11-26-40_3shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.338, + "acc_stderr": 0.014965960710224489 + }, + "anli_r2": { + "acc": 0.332, + "acc_stderr": 0.014899597242811476 + }, + "anli_r3": { + "acc": 0.3325, + "acc_stderr": 0.013605417345710526 + }, + "cb": { + "acc": 0.08928571428571429, + "acc_stderr": 0.038450387280282494, + "f1": 0.0871517027863777 + }, + "copa": { + "acc": 0.83, + "acc_stderr": 0.03775251680686371 + }, + "hellaswag": { + "acc": 0.4525990838478391, + "acc_stderr": 0.0049673082544257514, + "acc_norm": 0.5948018323043218, + "acc_norm_stderr": 0.004899270310557971 + }, + "rte": { + "acc": 0.49458483754512633, + "acc_stderr": 0.03009469812323996 + }, + "winogrande": { + "acc": 0.585635359116022, + "acc_stderr": 0.01384484623226856 + }, + "storycloze_2016": { + "acc": 0.7135221806520577, + "acc_stderr": 0.01045510591863303 + }, + "boolq": { + "acc": 0.6079510703363914, + "acc_stderr": 0.008538802914911992 + }, + "arc_easy": { + "acc": 0.5765993265993266, + "acc_stderr": 0.010138671005289047, + "acc_norm": 0.5585016835016835, + "acc_norm_stderr": 0.010189314382749929 + }, + "arc_challenge": { + "acc": 0.27986348122866894, + "acc_stderr": 0.01311904089772592, + "acc_norm": 0.29180887372013653, + "acc_norm_stderr": 0.013284525292403506 + }, + "sciq": { + "acc": 0.908, + "acc_stderr": 0.009144376393151086, + "acc_norm": 0.906, + "acc_norm_stderr": 0.009233052000787738 + }, + "piqa": { + "acc": 0.7257889009793254, + "acc_stderr": 0.010408618664933382, + "acc_norm": 0.7334058759521219, + "acc_norm_stderr": 0.010316749863541365 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/4b284b84bc4_4.json b/4b284b84bc4/evaluation/4b284b84bc4_4.json new file mode 100644 index 0000000000000000000000000000000000000000..a9489a5d1c00fe40dd87e54b26b141ed7f66ec28 --- /dev/null +++ b/4b284b84bc4/evaluation/4b284b84bc4_4.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.334, + "acc_stderr": 0.014922019523732961 + }, + "anli_r2": { + "acc": 0.34, + "acc_stderr": 0.014987482264363935 + }, + "anli_r3": { + "acc": 0.35083333333333333, + "acc_stderr": 0.013782212417178197 + }, + "cb": { + "acc": 0.17857142857142858, + "acc_stderr": 0.051642771820087224, + "f1": 0.18279613107199313 + }, + "copa": { + "acc": 0.82, + "acc_stderr": 0.03861229196653697 + }, + "hellaswag": { + "acc": 0.454690300736905, + "acc_stderr": 0.004969251445596333, + "acc_norm": 0.5943039235212109, + "acc_norm_stderr": 0.004900227226433378 + }, + "rte": { + "acc": 0.4548736462093863, + "acc_stderr": 0.029973636495415255 + }, + "winogrande": { + "acc": 0.606156274664562, + "acc_stderr": 0.013732114472668741 + }, + "storycloze_2016": { + "acc": 0.7247461250668092, + "acc_stderr": 0.010328538400500567 + }, + "boolq": { + "acc": 0.6116207951070336, + "acc_stderr": 0.008524357307908792 + }, + "arc_easy": { + "acc": 0.5808080808080808, + "acc_stderr": 0.010124905282491183, + "acc_norm": 0.5711279461279462, + "acc_norm_stderr": 0.010155440652900152 + }, + "arc_challenge": { + "acc": 0.27559726962457337, + "acc_stderr": 0.01305716965576184, + "acc_norm": 0.30802047781569963, + "acc_norm_stderr": 0.01349142951729204 + }, + "sciq": { + "acc": 0.915, + "acc_stderr": 0.008823426366942317, + "acc_norm": 0.919, + "acc_norm_stderr": 0.008632121032139993 + }, + "piqa": { + "acc": 0.7285092491838956, + "acc_stderr": 0.010376251176596137, + "acc_norm": 0.7388465723612623, + "acc_norm_stderr": 0.010248738649935587 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/4b284b84bc4_4_lm-eval_global_step80108_2023-01-30-11-26-40_4shots_backup.json b/4b284b84bc4/evaluation/4b284b84bc4_4_lm-eval_global_step80108_2023-01-30-11-26-40_4shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..a9489a5d1c00fe40dd87e54b26b141ed7f66ec28 --- /dev/null +++ b/4b284b84bc4/evaluation/4b284b84bc4_4_lm-eval_global_step80108_2023-01-30-11-26-40_4shots_backup.json @@ -0,0 +1,87 @@ +{ + "results": { + "anli_r1": { + "acc": 0.334, + "acc_stderr": 0.014922019523732961 + }, + "anli_r2": { + "acc": 0.34, + "acc_stderr": 0.014987482264363935 + }, + "anli_r3": { + "acc": 0.35083333333333333, + "acc_stderr": 0.013782212417178197 + }, + "cb": { + "acc": 0.17857142857142858, + "acc_stderr": 0.051642771820087224, + "f1": 0.18279613107199313 + }, + "copa": { + "acc": 0.82, + "acc_stderr": 0.03861229196653697 + }, + "hellaswag": { + "acc": 0.454690300736905, + "acc_stderr": 0.004969251445596333, + "acc_norm": 0.5943039235212109, + "acc_norm_stderr": 0.004900227226433378 + }, + "rte": { + "acc": 0.4548736462093863, + "acc_stderr": 0.029973636495415255 + }, + "winogrande": { + "acc": 0.606156274664562, + "acc_stderr": 0.013732114472668741 + }, + "storycloze_2016": { + "acc": 0.7247461250668092, + "acc_stderr": 0.010328538400500567 + }, + "boolq": { + "acc": 0.6116207951070336, + "acc_stderr": 0.008524357307908792 + }, + "arc_easy": { + "acc": 0.5808080808080808, + "acc_stderr": 0.010124905282491183, + "acc_norm": 0.5711279461279462, + "acc_norm_stderr": 0.010155440652900152 + }, + "arc_challenge": { + "acc": 0.27559726962457337, + "acc_stderr": 0.01305716965576184, + "acc_norm": 0.30802047781569963, + "acc_norm_stderr": 0.01349142951729204 + }, + "sciq": { + "acc": 0.915, + "acc_stderr": 0.008823426366942317, + "acc_norm": 0.919, + "acc_norm_stderr": 0.008632121032139993 + }, + "piqa": { + "acc": 0.7285092491838956, + "acc_stderr": 0.010376251176596137, + "acc_norm": 0.7388465723612623, + "acc_norm_stderr": 0.010248738649935587 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0, + "arc_challenge": 0, + "sciq": 0, + "piqa": 0 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/4b284b84bc4_5.json b/4b284b84bc4/evaluation/4b284b84bc4_5.json new file mode 100644 index 0000000000000000000000000000000000000000..44bf5e7100931dff1543cecb01710e6eaf24ce07 --- /dev/null +++ b/4b284b84bc4/evaluation/4b284b84bc4_5.json @@ -0,0 +1,66 @@ +{ + "results": { + "anli_r1": { + "acc": 0.348, + "acc_stderr": 0.01507060460376841 + }, + "anli_r2": { + "acc": 0.342, + "acc_stderr": 0.015008706182121738 + }, + "anli_r3": { + "acc": 0.33, + "acc_stderr": 0.013579531277800918 + }, + "cb": { + "acc": 0.26785714285714285, + "acc_stderr": 0.05971290310957636, + "f1": 0.2511904761904762 + }, + "copa": { + "acc": 0.81, + "acc_stderr": 0.03942772444036623 + }, + "hellaswag": { + "acc": 0.4565823541127266, + "acc_stderr": 0.0049709334202319285, + "acc_norm": 0.6061541525592511, + "acc_norm_stderr": 0.0048760280379419405 + }, + "rte": { + "acc": 0.5379061371841155, + "acc_stderr": 0.030009848912529117 + }, + "winogrande": { + "acc": 0.6037884767166535, + "acc_stderr": 0.013746404157154946 + }, + "storycloze_2016": { + "acc": 0.7354355959380011, + "acc_stderr": 0.01020040054171416 + }, + "boolq": { + "acc": 0.6201834862385321, + "acc_stderr": 0.008488668235778613 + }, + "arc_easy": { + "acc": 0.5900673400673401, + "acc_stderr": 0.010091953527506246, + "acc_norm": 0.5791245791245792, + "acc_norm_stderr": 0.01013050216406634 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/4b284b84bc4_5_lm-eval_global_step80108_2023-01-30-11-26-40_5shots_backup.json b/4b284b84bc4/evaluation/4b284b84bc4_5_lm-eval_global_step80108_2023-01-30-11-26-40_5shots_backup.json new file mode 100644 index 0000000000000000000000000000000000000000..44bf5e7100931dff1543cecb01710e6eaf24ce07 --- /dev/null +++ b/4b284b84bc4/evaluation/4b284b84bc4_5_lm-eval_global_step80108_2023-01-30-11-26-40_5shots_backup.json @@ -0,0 +1,66 @@ +{ + "results": { + "anli_r1": { + "acc": 0.348, + "acc_stderr": 0.01507060460376841 + }, + "anli_r2": { + "acc": 0.342, + "acc_stderr": 0.015008706182121738 + }, + "anli_r3": { + "acc": 0.33, + "acc_stderr": 0.013579531277800918 + }, + "cb": { + "acc": 0.26785714285714285, + "acc_stderr": 0.05971290310957636, + "f1": 0.2511904761904762 + }, + "copa": { + "acc": 0.81, + "acc_stderr": 0.03942772444036623 + }, + "hellaswag": { + "acc": 0.4565823541127266, + "acc_stderr": 0.0049709334202319285, + "acc_norm": 0.6061541525592511, + "acc_norm_stderr": 0.0048760280379419405 + }, + "rte": { + "acc": 0.5379061371841155, + "acc_stderr": 0.030009848912529117 + }, + "winogrande": { + "acc": 0.6037884767166535, + "acc_stderr": 0.013746404157154946 + }, + "storycloze_2016": { + "acc": 0.7354355959380011, + "acc_stderr": 0.01020040054171416 + }, + "boolq": { + "acc": 0.6201834862385321, + "acc_stderr": 0.008488668235778613 + }, + "arc_easy": { + "acc": 0.5900673400673401, + "acc_stderr": 0.010091953527506246, + "acc_norm": 0.5791245791245792, + "acc_norm_stderr": 0.01013050216406634 + } + }, + "versions": { + "anli_r1": 0, + "anli_r2": 0, + "anli_r3": 0, + "cb": 1, + "copa": 0, + "hellaswag": 0, + "rte": 0, + "winogrande": 0, + "storycloze_2016": 0, + "boolq": 1, + "arc_easy": 0 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_0.json b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_0.json new file mode 100644 index 0000000000000000000000000000000000000000..82a3a3c7992573872fe834ed1fb61c7acff79490 --- /dev/null +++ b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.37466495114095594, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03116802815957843 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07404480158534202, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0014902788141002475 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.31212436257013715, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004568108823452381 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.1135227535532633, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0020226161176734882 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.035086188446589825, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0009237445527173 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1533304291888016, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0032417614710948096 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05397463032306079, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0012920845251919216 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.07143892188082063, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013733993783630446 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3047679674724977, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0044697317892439005 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10988632514530636, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0018918967086146744 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07084668944691519, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001397360756298291 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.2997116780220477, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004343713168500645 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10869588574047251, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019005624996464461 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_1.json b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_1.json new file mode 100644 index 0000000000000000000000000000000000000000..00295ac96f29c646e919e62c585beae92f63edc8 --- /dev/null +++ b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.45724995573151395, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03077826387670849 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07380410032029427, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0013950698189371862 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.36979685211837077, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005157808096954796 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11499758191359032, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018677715006027427 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03456277372759004, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008655744665680597 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1825581650807761, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0037692839645507878 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.053994419166879803, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001195483166076756 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06966555184597864, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0012665956123840754 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.34681007385001145, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004679678346399935 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10864238895715833, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017176631324308034 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.07031937468305927, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0013198255014853241 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.35067051099532537, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004753075684894469 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10948950405713026, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017632128912827953 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_2.json b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_2.json new file mode 100644 index 0000000000000000000000000000000000000000..dbacafed4ad2cc8caf5389075788e99833a0385f --- /dev/null +++ b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.5219512537266193, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.0212930030956169 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07218926971887347, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0012167661516642302 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.38930719643519357, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.004954916384960566 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11405231602258174, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.001654644088764185 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03343887788145288, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007607255949351679 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.1941492016104801, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003831607721687585 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.0530914234584236, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0010851048561266318 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06799635945918656, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0011456716958398708 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.36060457316843075, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004393491915214021 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10723485397749022, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015594878594255188 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.06917630307565754, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0011778495782492331 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.3693625063067186, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004599410126151537 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10913009935368997, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0015983050972786382 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_3.json b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_3.json new file mode 100644 index 0000000000000000000000000000000000000000..8d4052ac9ca0d87d4c88db309faa8a7a0ff8a735 --- /dev/null +++ b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.6352070705519831, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.032408525336239634 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07232274024412362, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0011648296294780197 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.40875337262130357, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005170431645641866 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11545860904836557, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0016293321313512947 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03357621675403705, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007258241400547502 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.20421979666303372, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0038705321992327218 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05387918484618448, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001059642873696044 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.0670219793734058, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0010711080482606746 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3719335535759742, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004473658738704575 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10673572587703606, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001490340938407792 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.06880374952093221, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0011085999525674158 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.38592664938862503, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.004766870365189962 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.10969517770602641, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0015436923533265093 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_4.json b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_4.json new file mode 100644 index 0000000000000000000000000000000000000000..9bd3af4b96d17e369c4384d8756358a93f5d6ce3 --- /dev/null +++ b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.6721480622931579, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.04364421620033658 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.07312830538850522, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0012172573610058964 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.41786672242074735, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.00514089320113892 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11682333840421252, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0016784638187503072 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03391327016863078, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007434663573137818 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.21097279270777233, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003962368460085698 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05451282884342794, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001075049065599178 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06674973450321794, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001076625707211798 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3763932336189791, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004418572944831253 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10645004232970325, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014835449120631779 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.06947768209278911, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0011601752390396401 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.39433935812714066, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.00474918433328278 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.1108401266606722, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0015949626703580934 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_5.json b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_5.json new file mode 100644 index 0000000000000000000000000000000000000000..49a1dd146daefde5987477fc9521a4bf48091091 --- /dev/null +++ b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-web_nlg_en_PALM_prompt_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "bleu": 0.7296458163665333, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.03550662426728089 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_precision": 0.073601220000616, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.001145132160087058 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_recall": 0.4252463565222065, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.005191805176994548 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge1_fmeasure": 0.11795106654953416, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0015900535376520654 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_precision": 0.03391186812656995, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007058000824068418 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_recall": 0.2132946176974872, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.003991916242021659 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rouge2_fmeasure": 0.05471954734528363, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00103145574907618 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_precision": 0.06738065913816185, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.001053247887367618 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_recall": 0.3825929171137152, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.004448563155136455 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeL_fmeasure": 0.10768674313552495, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0014533174914988178 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_precision": 0.06988985169584011, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.001094990559251859 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_recall": 0.40083936944093385, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0047880465656518125 + }, + { + "task_name": "GEM/web_nlg_en", + "prompt_name": "PALM_prompt", + "rougeLsum_fmeasure": 0.11184463338518444, + "dataset_path": "GEM/web_nlg", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0015162341039388436 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_0.json b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_0.json new file mode 100644 index 0000000000000000000000000000000000000000..772ac4147a5c05b0e5055ae612e8e3f8411d07a2 --- /dev/null +++ b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.14903430496900869, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0019044306456448882 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.25460834562037304, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0027808818609942765 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.17464450784994165, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0018866549663747307 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.02932852184049837, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0007444651216824196 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.05378048278152194, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0015115343557708684 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.03487366784140555, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0008538791169940822 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.11560989242228453, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0013437666268655546 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.20506193764704436, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002299748596638709 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.13717038261116743, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.001369756093492625 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.13582138493978418, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0017239221521953576 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.23341669122784228, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0025774317116309797 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.1593997663047623, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0017134556998076298 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.5192817590705794, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06494006665273461 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_1.json b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_1.json new file mode 100644 index 0000000000000000000000000000000000000000..879247551bb552afbbd0edd76a5877ac02cc0f85 --- /dev/null +++ b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.19528801849094218, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002201427724682199 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.3257340657226586, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.002895580827853013 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.22509002102944192, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019838100403287333 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.04999822434377242, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0010227131559085638 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.08773922376052366, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0018594738182377334 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.05802448056432808, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001078013458633595 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.13922763000575006, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0014964090541992776 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.24015503219395376, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0023374560954638485 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.16185295875984437, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013532637497389792 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.18280815280692314, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0020740368495163132 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.3054903541601766, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0027607171534094984 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.21073250157763135, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018643526489330914 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.059381627710027, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06573083880760217 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_2.json b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_2.json new file mode 100644 index 0000000000000000000000000000000000000000..5cfe0539bae40e31c04a4044c872a833b36d7eee --- /dev/null +++ b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.2022231395663083, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.002327440444373304 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.3212542850069266, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0028325100234559927 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.22687895073683426, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0019722087479110005 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.05298207477606098, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0011059450545409227 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0867754880829245, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0018124553730314602 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.05947240390341843, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.0011148780082595852 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.14766643493041728, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0016788338597698632 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.24166027084574745, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0023102550180774937 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.1667829458147027, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0013918547568942457 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.18955567145755783, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0021818874452280475 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.30206706797305843, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0026937255424983246 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.21293287161862917, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018545411601376014 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.26946391692664, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.06690865495365421 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_3.json b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_3.json new file mode 100644 index 0000000000000000000000000000000000000000..092781788d3bf615f5bd84727e5fce692c4d31e1 --- /dev/null +++ b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.17603411917572567, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0027088089976067017 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.2612353818883475, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0033992744560135033 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.18692561650624942, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0023335888686542303 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.04559336904702604, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0012112817237957673 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.06990530615420809, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0017473693679177455 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.04827221140963156, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.001080517273254867 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.13188323013007386, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0020767846091407608 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.19847294800292903, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002681619286646281 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.13971894217937625, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0017066104221784629 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.1648347813541367, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0025407224451518312 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.24568842200265278, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0032363366658664243 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.17522886133321208, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0021928506565670742 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 3.0982581904465443, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.09308367989527225 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_4.json b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_4.json new file mode 100644 index 0000000000000000000000000000000000000000..4db5ebbc47b2ca13750641dcc212f005638de47d --- /dev/null +++ b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.05811148139738531, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0022224537018629854 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.08558209366311444, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.003003751048628007 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.058933509019544375, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.002002584327257243 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.014672079613835081, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0008619146516289935 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.02381134201045262, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0012808609018768735 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.015382205296321172, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.000747616978372682 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.04421230588130963, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0017253556363268328 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.06610553705292643, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.002375418784846169 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.04464141924307046, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0015078323046653491 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.05425853906264316, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.002080511020141031 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.07994658087674156, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0028206819757526707 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.05503716623523508, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0018776883247322259 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 0.6565411757725536, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 0.052324362211905354 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_5.json b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_5.json new file mode 100644 index 0000000000000000000000000000000000000000..6eb03e7cdec54e00388c44e391a830838dd4cd89 --- /dev/null +++ b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_GEM-wiki_lingua_en_tldr_en_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_precision": 0.009018320417866373, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_precision_stderr": 0.0009515455079029407 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_recall": 0.013477068819464475, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_recall_stderr": 0.0013177953296362858 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge1_fmeasure": 0.009010391314895649, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge1_fmeasure_stderr": 0.0008608910415638986 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_precision": 0.002361799926713296, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_precision_stderr": 0.0003370630501668582 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_recall": 0.0034894806830281814, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_recall_stderr": 0.0004812458467814835 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rouge2_fmeasure": 0.002248573836077081, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rouge2_fmeasure_stderr": 0.00028843241279034187 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_precision": 0.007020816303734581, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_precision_stderr": 0.0007477069794008652 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_recall": 0.010607117596650792, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_recall_stderr": 0.0010619737593296397 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeL_fmeasure": 0.006956210385184894, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeL_fmeasure_stderr": 0.0006626528079735124 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_precision": 0.008615795202441968, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_precision_stderr": 0.0009202005223985237 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_recall": 0.012858462170759524, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_recall_stderr": 0.0012682727385585405 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "rougeLsum_fmeasure": 0.008558581769365812, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0008212318653585135 + }, + { + "task_name": "GEM/wiki_lingua_en", + "prompt_name": "tldr_en", + "bleu": 1.0169232766561087e-06, + "dataset_path": "GEM/wiki_lingua", + "dataset_name": "en", + "subset": null, + "bleu_stderr": 2.0148976704692234e-06 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json new file mode 100644 index 0000000000000000000000000000000000000000..7c531389f205c1f6641bc71b614a558115ef68aa --- /dev/null +++ b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 0.14846670480161736, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.013127750204704683 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.26299103271373675, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.006382723076774683 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.05264625320052111, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0014565379578075452 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.07826511485489751, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0019164435017392188 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.1576462193644736, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.005855563265723578 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.01645135749156905, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0007166454954847944 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.026122584651124606, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.00101915256853543 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.2524687494414541, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.006192701638587216 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.04953446876968493, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0012966364411702697 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.0738166454356822, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0017248337500189589 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.25330917167188566, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.006207826569406349 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.04925315584474901, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0012892531673512504 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.07392255592729936, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.00174327269105063 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json new file mode 100644 index 0000000000000000000000000000000000000000..ca906267e4fa2d607b9e0015263cc5fa7b645208 --- /dev/null +++ b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 9.167405083071978, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.09131785485896106 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.4027750555180633, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0022654704081963494 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.46782170590516187, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.0029091530201079247 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4160674099572731, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0020114833441626043 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.16815397557349618, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0016540332686745738 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.19988678530914838, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0021644647347172754 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.17514483800816646, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0016790153644324725 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.27883038558148265, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.001734453659792866 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3267350269933985, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0023849450074570734 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.28884448468062934, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0016305979819716226 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.33427994949554696, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.00212000114598495 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3882701548838279, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0026955155305927177 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.3452818942622058, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019526544280227683 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json new file mode 100644 index 0000000000000000000000000000000000000000..6fc8de743ca42830f3bfbf3879d931cfdca3d193 --- /dev/null +++ b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 10.796301153911008, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.146708341661776 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.41448086950190893, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002308739240960833 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.47403110687392724, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002885238583477618 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4253927318807347, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0020174640710145207 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.18273899527170673, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.001736753975334171 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.21447823413610112, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022428818903169902 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.18933491861931817, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001739001650440241 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.2947058978443582, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0018691331443875447 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3390095486229893, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024297003361930447 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3029238715575528, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.001712217734742568 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.34519699878769394, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0021888771482807224 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.3948779703096386, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002705400716089457 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.35424136247162186, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0019838710065518987 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json new file mode 100644 index 0000000000000000000000000000000000000000..bcb1318add183a4d1df4b2693616fc67f6fa87ba --- /dev/null +++ b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 11.701646150139581, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.14121347748513918 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.4240814949754891, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002335304307800915 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.49091681381630753, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.00293584370599599 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.4379907319797712, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.002033620820037117 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.19473673244504042, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0018208182531307374 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.2308195484401856, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.002332056213772715 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.20260082190070094, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.001797391946128592 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.3052254250489387, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.0019294151867184208 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.35489792991067104, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024799761788283346 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3154563500828602, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0017391940408011135 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.3549420293695019, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.002227993320953577 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.4111023178790994, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027708358218005813 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.36657732824376943, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020166468270683123 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json new file mode 100644 index 0000000000000000000000000000000000000000..988ad5476f956c621e11a8fe7104049321663017 --- /dev/null +++ b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 11.678623201115563, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.1438023448589331 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.42802077277555306, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.002424017667020404 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.4941727757999444, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002864707318340049 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.44123731345241246, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0020394573256137002 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.19838194504781104, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0019042215339010943 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.23317874468894614, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0023244742635299934 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.20525503989164437, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0018231190752503231 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.3069996273775671, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002010456416143571 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3562550112067379, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024675677606706826 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.3168165933964199, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0017773944657815321 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.3572288837187459, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0023220174574098856 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.4131891637059535, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.002781997432767609 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.3684518522765776, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020683920310729333 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json new file mode 100644 index 0000000000000000000000000000000000000000..5977eb6846791476ee3c98acc5ecfae4eaac5b22 --- /dev/null +++ b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_e2e_nlg_cleaned_generate_text_restaurant_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "bleu": 11.256627548432805, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "bleu_stderr": 0.14543377736086152 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_precision": 0.4215376032960452, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_precision_stderr": 0.0024025181710894523 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_recall": 0.48727427299265463, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_recall_stderr": 0.002867231297396204 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge1_fmeasure": 0.43467772454619635, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge1_fmeasure_stderr": 0.0020177075108414894 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_precision": 0.19324475626688103, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_precision_stderr": 0.0018664258727883866 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_recall": 0.22689708066119227, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_recall_stderr": 0.0022677822733911217 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rouge2_fmeasure": 0.19984409683367668, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rouge2_fmeasure_stderr": 0.0017840474360257142 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_precision": 0.3043439759839776, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_precision_stderr": 0.002001573450675394 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_recall": 0.3536697494352659, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_recall_stderr": 0.0024814074442155673 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeL_fmeasure": 0.31426345805670414, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeL_fmeasure_stderr": 0.0017762603380325856 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_precision": 0.35399958676397025, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_precision_stderr": 0.0022946023988637647 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_recall": 0.40981680597146897, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_recall_stderr": 0.0027603392115545873 + }, + { + "task_name": "e2e_nlg_cleaned", + "prompt_name": "generate_text_restaurant", + "rougeLsum_fmeasure": 0.36514922201465566, + "dataset_path": "e2e_nlg_cleaned", + "dataset_name": null, + "subset": null, + "rougeLsum_fmeasure_stderr": 0.0020264847795391197 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_gem_xsum_article_DOC_summary_0.json b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_gem_xsum_article_DOC_summary_0.json new file mode 100644 index 0000000000000000000000000000000000000000..6ea8df207b0e9061d1f1cf2a8ca1d0097d934036 --- /dev/null +++ b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_gem_xsum_article_DOC_summary_0.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.15936520803829934, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.002502914065388867 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.36698072640704477, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004683622882613499 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.2162684407733824, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0028487305378065864 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.044026641927011036, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0016092489576812487 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.10240085620355213, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0032524404949168598 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.059415288606804666, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0019237063507349787 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.1250497631451513, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.002063727102434744 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.2883013244617606, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.003758676199438092 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.16944484885441244, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0022672242410205668 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.12672592447230924, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0021949367251175547 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.2924636147885988, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004172229113513385 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.17181025266219813, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0024925143856936887 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 2.4667492251754903, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.10381088859974377 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 0, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_gem_xsum_article_DOC_summary_1.json b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_gem_xsum_article_DOC_summary_1.json new file mode 100644 index 0000000000000000000000000000000000000000..0297916fc656899dbbd8a1603ca66c00ca041f32 --- /dev/null +++ b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_gem_xsum_article_DOC_summary_1.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.13872518272148576, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0019196010106841707 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.34230252867072486, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004465857111154106 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.19511361094013366, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025883144750141985 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.03163153944854582, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0010782973297168107 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.08173836317054556, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0028348136277117873 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04506092989376365, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015273324009655352 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.10692232891845452, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.001417359741210213 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.26668023859416595, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035247154238071715 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.15075736335997478, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.0019327841457787809 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.1091361614782161, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015647258593366256 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.27137465028647, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003816794352052926 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.15379184945424418, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0021360349555465917 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.758172461446717, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.07910003790624576 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 1, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_gem_xsum_article_DOC_summary_2.json b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_gem_xsum_article_DOC_summary_2.json new file mode 100644 index 0000000000000000000000000000000000000000..67cedb3e95938de64510b22d1af64ab6ac49cfb9 --- /dev/null +++ b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_gem_xsum_article_DOC_summary_2.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.13980890149209954, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0018977428372584247 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.3402939621206596, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.00439029308436291 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.19587821805854708, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0025521664288773132 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.03376773335297692, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0011277676246399322 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0855882084980882, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.002933007569998225 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04784619134007039, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0015903598069214155 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.1126666102037684, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0014774766678267424 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.2759357644277514, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0035546429323666893 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.15806290558333977, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.001998316404733819 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.10915831064990977, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0015695216746497264 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.2680641977382496, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003807444122985714 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.15327775753143216, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002141837328174842 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.929152474522292, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.06152527877922885 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 2, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_gem_xsum_article_DOC_summary_3.json b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_gem_xsum_article_DOC_summary_3.json new file mode 100644 index 0000000000000000000000000000000000000000..473d3e7fd21755ce15a432747ee2bd06873b8927 --- /dev/null +++ b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_gem_xsum_article_DOC_summary_3.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.13763916790173225, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0021817751235199668 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.3193392811416741, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.004739221494631209 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.18802043752912012, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.002755991807062541 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.03388199846714144, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0012286320326522608 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.08180524247777735, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0028960546548511567 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.04680703390738515, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0016176840589681419 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.11334029248894266, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0017604231336699231 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.26418540865577744, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0039191139616055375 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.1549296797013188, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002213023071302344 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.10737702854211702, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0018328777923363876 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.25043089015992415, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.004037910961250234 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.14663858566665416, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.002296870650164813 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.9627474167404957, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.05305571895946147 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 3, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_gem_xsum_article_DOC_summary_4.json b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_gem_xsum_article_DOC_summary_4.json new file mode 100644 index 0000000000000000000000000000000000000000..bce214ef8fad3b50af029f2f0249894063ab1e22 --- /dev/null +++ b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_gem_xsum_article_DOC_summary_4.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.04603105454485719, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.00291203860838327 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.08129731716534012, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0045890356974722666 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.05322501092072186, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0029794854167583293 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.011578470150574793, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.0013065637311316816 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.020777972525703254, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.0017120733784709268 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.013388421641777913, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0011079313127855075 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.03796068206126734, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0024963984987536425 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.06631805498752238, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.0037355969047499307 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.04330307819667933, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.002401031515475891 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.03683944866828292, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.00248168608412277 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.06385955481258902, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.003696411627899061 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.04176500700971651, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0023719884034967345 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.0275910587689852, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 0.12038281046093742 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 4, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file diff --git a/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_gem_xsum_article_DOC_summary_5.json b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_gem_xsum_article_DOC_summary_5.json new file mode 100644 index 0000000000000000000000000000000000000000..228cbb4b58edba11902b2b79d8b4585b334bc8b4 --- /dev/null +++ b/4b284b84bc4/evaluation/generation/slim.4b284b84bc4_gem_xsum_article_DOC_summary_5.json @@ -0,0 +1,133 @@ +{ + "results": [ + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_precision": 0.00321897943071785, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_precision_stderr": 0.0009979849053940774 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_recall": 0.002638556229987107, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_recall_stderr": 0.0008069095304213252 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge1_fmeasure": 0.002850511566655562, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge1_fmeasure_stderr": 0.0008725813491317338 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_precision": 0.0008682722133420189, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_precision_stderr": 0.00042068273727211144 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_recall": 0.0006912233160845813, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_recall_stderr": 0.00036262248705394215 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rouge2_fmeasure": 0.0007583670569643345, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rouge2_fmeasure_stderr": 0.0003812254884717486 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_precision": 0.0024287372106315966, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_precision_stderr": 0.0007289443610313556 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_recall": 0.002057235735729571, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_recall_stderr": 0.000619733025851515 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeL_fmeasure": 0.002188808353154973, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeL_fmeasure_stderr": 0.000656190328926511 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_precision": 0.002734584249275528, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_precision_stderr": 0.0008545960784962966 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_recall": 0.002247308506447503, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_recall_stderr": 0.0006766768468342992 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "rougeLsum_fmeasure": 0.0024206446753818178, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "rougeLsum_fmeasure_stderr": 0.0007355689947184377 + }, + { + "task_name": "gem_xsum", + "prompt_name": "article_DOC_summary", + "bleu": 1.531796041787971e-36, + "dataset_path": "GEM/xsum", + "dataset_name": null, + "subset": "", + "bleu_stderr": 1.3347176862997463e-30 + } + ], + "config": { + "model": "hf-causal", + "model_args": "pretrained=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/lm1-4b2-84b-c4-repetitions/4b284b84bc4/transformers,use_accelerate=True,tokenizer=/pfs/lustrep4/scratch/project_462000119/muennighoff/nov-2022-bettercom/gpt2,dtype=bfloat16", + "task_args": "", + "num_fewshot": 5, + "batch_size": 16, + "device": "cuda", + "use_cache": false, + "limit": 3000, + "bootstrap_iters": 10, + "seed": 1234 + } +} \ No newline at end of file