diff --git "a/checkpoint-1000/trainer_state.json" "b/checkpoint-1000/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-1000/trainer_state.json" @@ -0,0 +1,6099 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.7393032067276591, + "eval_steps": 100, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 7.3170731707317065e-06, + "loss": 0.7162, + "step": 1 + }, + { + "epoch": 0.0, + "learning_rate": 1.4634146341463413e-05, + "loss": 0.8676, + "step": 2 + }, + { + "epoch": 0.0, + "learning_rate": 2.1951219512195117e-05, + "loss": 0.8797, + "step": 3 + }, + { + "epoch": 0.0, + "learning_rate": 2.9268292682926826e-05, + "loss": 0.6815, + "step": 4 + }, + { + "epoch": 0.0, + "learning_rate": 3.6585365853658535e-05, + "loss": 0.6264, + "step": 5 + }, + { + "epoch": 0.0, + "learning_rate": 4.3902439024390234e-05, + "loss": 0.711, + "step": 6 + }, + { + "epoch": 0.01, + "learning_rate": 5.121951219512195e-05, + "loss": 0.8029, + "step": 7 + }, + { + "epoch": 0.01, + "learning_rate": 5.853658536585365e-05, + "loss": 0.5576, + "step": 8 + }, + { + "epoch": 0.01, + "learning_rate": 6.585365853658536e-05, + "loss": 0.7127, + "step": 9 + }, + { + "epoch": 0.01, + "learning_rate": 7.317073170731707e-05, + "loss": 0.5231, + "step": 10 + }, + { + "epoch": 0.01, + "learning_rate": 8.048780487804878e-05, + "loss": 1.1346, + "step": 11 + }, + { + "epoch": 0.01, + "learning_rate": 8.780487804878047e-05, + "loss": 0.6838, + "step": 12 + }, + { + "epoch": 0.01, + "learning_rate": 9.512195121951219e-05, + "loss": 0.8151, + "step": 13 + }, + { + "epoch": 0.01, + "learning_rate": 0.0001024390243902439, + "loss": 0.9672, + "step": 14 + }, + { + "epoch": 0.01, + "learning_rate": 0.0001097560975609756, + "loss": 0.7119, + "step": 15 + }, + { + "epoch": 0.01, + "learning_rate": 0.0001170731707317073, + "loss": 0.701, + "step": 16 + }, + { + "epoch": 0.01, + "learning_rate": 0.000124390243902439, + "loss": 0.7036, + "step": 17 + }, + { + "epoch": 0.01, + "learning_rate": 0.00013170731707317073, + "loss": 0.4933, + "step": 18 + }, + { + "epoch": 0.01, + "learning_rate": 0.00013902439024390242, + "loss": 0.9684, + "step": 19 + }, + { + "epoch": 0.01, + "learning_rate": 0.00014634146341463414, + "loss": 1.195, + "step": 20 + }, + { + "epoch": 0.02, + "learning_rate": 0.00015365853658536583, + "loss": 0.7636, + "step": 21 + }, + { + "epoch": 0.02, + "learning_rate": 0.00016097560975609755, + "loss": 0.5474, + "step": 22 + }, + { + "epoch": 0.02, + "learning_rate": 0.00016829268292682927, + "loss": 1.0586, + "step": 23 + }, + { + "epoch": 0.02, + "learning_rate": 0.00017560975609756094, + "loss": 0.4522, + "step": 24 + }, + { + "epoch": 0.02, + "learning_rate": 0.00018292682926829266, + "loss": 0.5121, + "step": 25 + }, + { + "epoch": 0.02, + "learning_rate": 0.00019024390243902437, + "loss": 0.9219, + "step": 26 + }, + { + "epoch": 0.02, + "learning_rate": 0.0001975609756097561, + "loss": 0.6772, + "step": 27 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002048780487804878, + "loss": 0.7117, + "step": 28 + }, + { + "epoch": 0.02, + "learning_rate": 0.00021219512195121948, + "loss": 0.8291, + "step": 29 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002195121951219512, + "loss": 0.4971, + "step": 30 + }, + { + "epoch": 0.02, + "learning_rate": 0.00022682926829268292, + "loss": 0.6877, + "step": 31 + }, + { + "epoch": 0.02, + "learning_rate": 0.0002341463414634146, + "loss": 0.4998, + "step": 32 + }, + { + "epoch": 0.02, + "learning_rate": 0.00024146341463414633, + "loss": 0.5154, + "step": 33 + }, + { + "epoch": 0.03, + "learning_rate": 0.000248780487804878, + "loss": 0.7322, + "step": 34 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002560975609756097, + "loss": 0.7293, + "step": 35 + }, + { + "epoch": 0.03, + "learning_rate": 0.00026341463414634146, + "loss": 0.7618, + "step": 36 + }, + { + "epoch": 0.03, + "learning_rate": 0.00027073170731707315, + "loss": 1.0949, + "step": 37 + }, + { + "epoch": 0.03, + "learning_rate": 0.00027804878048780484, + "loss": 0.5851, + "step": 38 + }, + { + "epoch": 0.03, + "learning_rate": 0.00028536585365853654, + "loss": 0.559, + "step": 39 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002926829268292683, + "loss": 0.6606, + "step": 40 + }, + { + "epoch": 0.03, + "learning_rate": 0.0003, + "loss": 0.4873, + "step": 41 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029999956931929273, + "loss": 0.3105, + "step": 42 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029999827727964407, + "loss": 0.383, + "step": 43 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002999961238884735, + "loss": 0.7269, + "step": 44 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029999310915814663, + "loss": 0.5993, + "step": 45 + }, + { + "epoch": 0.03, + "learning_rate": 0.00029998923310597524, + "loss": 0.5674, + "step": 46 + }, + { + "epoch": 0.03, + "learning_rate": 0.0002999844957542173, + "loss": 0.4887, + "step": 47 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002999788971300765, + "loss": 1.0146, + "step": 48 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029997243726570256, + "loss": 0.8546, + "step": 49 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002999651161981906, + "loss": 1.1779, + "step": 50 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002999569339695812, + "loss": 0.6413, + "step": 51 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029994789062686003, + "loss": 0.5926, + "step": 52 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002999379862219577, + "loss": 0.828, + "step": 53 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002999272208117494, + "loss": 0.5488, + "step": 54 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002999155944580544, + "loss": 0.645, + "step": 55 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029990310722763613, + "loss": 0.784, + "step": 56 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002998897591922013, + "loss": 0.4035, + "step": 57 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029987555042839976, + "loss": 1.065, + "step": 58 + }, + { + "epoch": 0.04, + "learning_rate": 0.00029986048101782414, + "loss": 0.6875, + "step": 59 + }, + { + "epoch": 0.04, + "learning_rate": 0.0002998445510470091, + "loss": 0.5348, + "step": 60 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029982776060743106, + "loss": 0.7269, + "step": 61 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029981010979550766, + "loss": 0.4813, + "step": 62 + }, + { + "epoch": 0.05, + "learning_rate": 0.000299791598712597, + "loss": 0.4941, + "step": 63 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002997722274649974, + "loss": 0.4421, + "step": 64 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002997519961639464, + "loss": 0.3293, + "step": 65 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029973090492562045, + "loss": 0.4955, + "step": 66 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002997089538711341, + "loss": 0.8179, + "step": 67 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029968614312653923, + "loss": 0.9704, + "step": 68 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029966247282282453, + "loss": 0.6526, + "step": 69 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002996379430959145, + "loss": 0.3936, + "step": 70 + }, + { + "epoch": 0.05, + "learning_rate": 0.000299612554086669, + "loss": 0.5325, + "step": 71 + }, + { + "epoch": 0.05, + "learning_rate": 0.000299586305940882, + "loss": 0.4114, + "step": 72 + }, + { + "epoch": 0.05, + "learning_rate": 0.00029955919880928124, + "loss": 0.4236, + "step": 73 + }, + { + "epoch": 0.05, + "learning_rate": 0.0002995312328475268, + "loss": 0.4274, + "step": 74 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002995024082162107, + "loss": 0.4362, + "step": 75 + }, + { + "epoch": 0.06, + "learning_rate": 0.000299472725080856, + "loss": 0.3605, + "step": 76 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029944218361191514, + "loss": 0.5172, + "step": 77 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029941078398476985, + "loss": 0.5467, + "step": 78 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029937852637972967, + "loss": 0.3366, + "step": 79 + }, + { + "epoch": 0.06, + "learning_rate": 0.000299345410982031, + "loss": 0.4271, + "step": 80 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002993114379818359, + "loss": 0.9314, + "step": 81 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029927660757423123, + "loss": 0.6369, + "step": 82 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002992409199592276, + "loss": 0.7792, + "step": 83 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029920437534175785, + "loss": 0.3398, + "step": 84 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029916697393167615, + "loss": 0.6307, + "step": 85 + }, + { + "epoch": 0.06, + "learning_rate": 0.00029912871594375666, + "loss": 0.5283, + "step": 86 + }, + { + "epoch": 0.06, + "learning_rate": 0.0002990896015976924, + "loss": 0.5731, + "step": 87 + }, + { + "epoch": 0.07, + "learning_rate": 0.000299049631118094, + "loss": 0.5389, + "step": 88 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002990088047344883, + "loss": 0.6683, + "step": 89 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029896712268131717, + "loss": 1.0819, + "step": 90 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029892458519793595, + "loss": 0.2681, + "step": 91 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002988811925286124, + "loss": 0.7518, + "step": 92 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029883694492252474, + "loss": 0.5616, + "step": 93 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002987918426337611, + "loss": 0.687, + "step": 94 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002987458859213172, + "loss": 0.561, + "step": 95 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002986990750490953, + "loss": 0.3038, + "step": 96 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029865141028590257, + "loss": 0.4021, + "step": 97 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002986028919054496, + "loss": 0.4647, + "step": 98 + }, + { + "epoch": 0.07, + "learning_rate": 0.0002985535201863488, + "loss": 0.3641, + "step": 99 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029850329541211283, + "loss": 0.5877, + "step": 100 + }, + { + "epoch": 0.07, + "eval_loss": 0.7984210848808289, + "eval_runtime": 12.9946, + "eval_samples_per_second": 0.847, + "eval_steps_per_second": 0.847, + "step": 100 + }, + { + "epoch": 0.07, + "learning_rate": 0.00029845221787115286, + "loss": 0.4791, + "step": 101 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002984002878567771, + "loss": 0.5435, + "step": 102 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002983475056671889, + "loss": 0.346, + "step": 103 + }, + { + "epoch": 0.08, + "learning_rate": 0.00029829387160548525, + "loss": 0.3577, + "step": 104 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002982393859796548, + "loss": 0.3761, + "step": 105 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002982393859796548, + "loss": 0.7425, + "step": 106 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002981840491025764, + "loss": 0.3844, + "step": 107 + }, + { + "epoch": 0.08, + "learning_rate": 0.00029812786129201706, + "loss": 0.3529, + "step": 108 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002980708228706302, + "loss": 0.4415, + "step": 109 + }, + { + "epoch": 0.08, + "learning_rate": 0.00029801293416595374, + "loss": 0.4336, + "step": 110 + }, + { + "epoch": 0.08, + "learning_rate": 0.00029795419551040833, + "loss": 0.5404, + "step": 111 + }, + { + "epoch": 0.08, + "learning_rate": 0.00029789460724129543, + "loss": 0.7089, + "step": 112 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002978341697007952, + "loss": 0.5796, + "step": 113 + }, + { + "epoch": 0.08, + "learning_rate": 0.0002977728832359648, + "loss": 0.6561, + "step": 114 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029771074819873617, + "loss": 0.7428, + "step": 115 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029764776494591423, + "loss": 0.6707, + "step": 116 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029758393383917447, + "loss": 0.68, + "step": 117 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002975192552450613, + "loss": 0.4643, + "step": 118 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002974537295349857, + "loss": 0.6456, + "step": 119 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002973873570852231, + "loss": 0.5805, + "step": 120 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029732013827691125, + "loss": 0.3574, + "step": 121 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002972520734960482, + "loss": 0.5281, + "step": 122 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029718316313348964, + "loss": 0.3682, + "step": 123 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002971134075849472, + "loss": 0.3776, + "step": 124 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002970428072509857, + "loss": 0.8614, + "step": 125 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002969713625370212, + "loss": 0.308, + "step": 126 + }, + { + "epoch": 0.09, + "learning_rate": 0.0002968990738533186, + "loss": 0.5767, + "step": 127 + }, + { + "epoch": 0.09, + "learning_rate": 0.00029682594161498896, + "loss": 0.4703, + "step": 128 + }, + { + "epoch": 0.1, + "learning_rate": 0.00029675196624198756, + "loss": 0.4235, + "step": 129 + }, + { + "epoch": 0.1, + "learning_rate": 0.00029667714815911136, + "loss": 0.3727, + "step": 130 + }, + { + "epoch": 0.1, + "learning_rate": 0.00029660148779599643, + "loss": 0.3763, + "step": 131 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002965249855871155, + "loss": 0.5048, + "step": 132 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002964476419717756, + "loss": 0.4571, + "step": 133 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002963694573941153, + "loss": 0.1698, + "step": 134 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002962904323031027, + "loss": 0.3583, + "step": 135 + }, + { + "epoch": 0.1, + "learning_rate": 0.00029621056715253206, + "loss": 0.3942, + "step": 136 + }, + { + "epoch": 0.1, + "learning_rate": 0.00029612986240102186, + "loss": 0.5731, + "step": 137 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002960483185120117, + "loss": 0.5174, + "step": 138 + }, + { + "epoch": 0.1, + "learning_rate": 0.00029596593595376016, + "loss": 0.2378, + "step": 139 + }, + { + "epoch": 0.1, + "learning_rate": 0.00029588271519934145, + "loss": 0.6014, + "step": 140 + }, + { + "epoch": 0.1, + "learning_rate": 0.00029579865672664334, + "loss": 0.4233, + "step": 141 + }, + { + "epoch": 0.1, + "learning_rate": 0.0002957137610183639, + "loss": 0.8406, + "step": 142 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029562802856200913, + "loss": 0.433, + "step": 143 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029554145984988985, + "loss": 0.3892, + "step": 144 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029545405537911905, + "loss": 0.3632, + "step": 145 + }, + { + "epoch": 0.11, + "learning_rate": 0.000295365815651609, + "loss": 0.6153, + "step": 146 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002952767411740683, + "loss": 0.3268, + "step": 147 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029518683245799914, + "loss": 0.7161, + "step": 148 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029509609001969413, + "loss": 0.4311, + "step": 149 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002950045143802336, + "loss": 0.7437, + "step": 150 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002949121060654821, + "loss": 0.3991, + "step": 151 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029481886560608634, + "loss": 0.4817, + "step": 152 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029472479353747104, + "loss": 0.2903, + "step": 153 + }, + { + "epoch": 0.11, + "learning_rate": 0.0002946298903998365, + "loss": 0.4583, + "step": 154 + }, + { + "epoch": 0.11, + "learning_rate": 0.00029453415673815555, + "loss": 0.4707, + "step": 155 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002944375931021699, + "loss": 0.6428, + "step": 156 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002943402000463875, + "loss": 0.3799, + "step": 157 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002942419781300793, + "loss": 0.207, + "step": 158 + }, + { + "epoch": 0.12, + "learning_rate": 0.00029414292791727563, + "loss": 0.1921, + "step": 159 + }, + { + "epoch": 0.12, + "learning_rate": 0.00029404304997676347, + "loss": 0.5306, + "step": 160 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002939423448820827, + "loss": 0.2036, + "step": 161 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002938408132115233, + "loss": 0.5733, + "step": 162 + }, + { + "epoch": 0.12, + "learning_rate": 0.00029373845554812167, + "loss": 0.3922, + "step": 163 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002936352724796575, + "loss": 0.3732, + "step": 164 + }, + { + "epoch": 0.12, + "learning_rate": 0.00029353126459865004, + "loss": 0.3472, + "step": 165 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002934264325023553, + "loss": 0.2677, + "step": 166 + }, + { + "epoch": 0.12, + "learning_rate": 0.000293320776792762, + "loss": 0.5054, + "step": 167 + }, + { + "epoch": 0.12, + "learning_rate": 0.00029321429807658856, + "loss": 0.3407, + "step": 168 + }, + { + "epoch": 0.12, + "learning_rate": 0.0002931069969652793, + "loss": 0.3308, + "step": 169 + }, + { + "epoch": 0.13, + "learning_rate": 0.0002929988740750011, + "loss": 0.4544, + "step": 170 + }, + { + "epoch": 0.13, + "learning_rate": 0.00029288993002663996, + "loss": 0.7072, + "step": 171 + }, + { + "epoch": 0.13, + "learning_rate": 0.00029278016544579717, + "loss": 0.5157, + "step": 172 + }, + { + "epoch": 0.13, + "learning_rate": 0.00029266958096278587, + "loss": 0.1607, + "step": 173 + }, + { + "epoch": 0.13, + "learning_rate": 0.00029255817721262744, + "loss": 0.3236, + "step": 174 + }, + { + "epoch": 0.13, + "learning_rate": 0.00029244595483504784, + "loss": 0.4895, + "step": 175 + }, + { + "epoch": 0.13, + "learning_rate": 0.000292332914474474, + "loss": 0.3522, + "step": 176 + }, + { + "epoch": 0.13, + "learning_rate": 0.00029221905678002976, + "loss": 0.7603, + "step": 177 + }, + { + "epoch": 0.13, + "learning_rate": 0.00029210438240553276, + "loss": 0.502, + "step": 178 + }, + { + "epoch": 0.13, + "learning_rate": 0.00029198889200949017, + "loss": 0.7086, + "step": 179 + }, + { + "epoch": 0.13, + "learning_rate": 0.00029187258625509513, + "loss": 0.4529, + "step": 180 + }, + { + "epoch": 0.13, + "learning_rate": 0.00029175546581022295, + "loss": 0.5601, + "step": 181 + }, + { + "epoch": 0.13, + "learning_rate": 0.00029163753134742713, + "loss": 0.3547, + "step": 182 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002915187835439356, + "loss": 0.3628, + "step": 183 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002913992230816469, + "loss": 0.4702, + "step": 184 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002912788506471262, + "loss": 0.6325, + "step": 185 + }, + { + "epoch": 0.14, + "learning_rate": 0.00029115766693160126, + "loss": 0.8866, + "step": 186 + }, + { + "epoch": 0.14, + "learning_rate": 0.00029103567263095856, + "loss": 0.5105, + "step": 187 + }, + { + "epoch": 0.14, + "learning_rate": 0.00029091286844573936, + "loss": 0.4985, + "step": 188 + }, + { + "epoch": 0.14, + "learning_rate": 0.00029078925508113564, + "loss": 0.3461, + "step": 189 + }, + { + "epoch": 0.14, + "learning_rate": 0.00029066483324698584, + "loss": 0.4485, + "step": 190 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002905396036577711, + "loss": 0.6662, + "step": 191 + }, + { + "epoch": 0.14, + "learning_rate": 0.000290413567032611, + "loss": 0.2159, + "step": 192 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002902867240952595, + "loss": 0.2389, + "step": 193 + }, + { + "epoch": 0.14, + "learning_rate": 0.00029015907557410065, + "loss": 0.226, + "step": 194 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002900306222021445, + "loss": 0.4957, + "step": 195 + }, + { + "epoch": 0.14, + "learning_rate": 0.0002899013647170229, + "loss": 0.3227, + "step": 196 + }, + { + "epoch": 0.15, + "learning_rate": 0.00028977130386098523, + "loss": 0.5794, + "step": 197 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002896404403808942, + "loss": 0.5515, + "step": 198 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002895087750282215, + "loss": 0.2853, + "step": 199 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002893763085590436, + "loss": 0.3192, + "step": 200 + }, + { + "epoch": 0.15, + "eval_loss": 0.7430515885353088, + "eval_runtime": 13.0035, + "eval_samples_per_second": 0.846, + "eval_steps_per_second": 0.846, + "step": 200 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002892430417340369, + "loss": 0.4321, + "step": 201 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002891089753184744, + "loss": 0.2985, + "step": 202 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002889741100822202, + "loss": 0.2427, + "step": 203 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002888384467997257, + "loss": 0.5945, + "step": 204 + }, + { + "epoch": 0.15, + "learning_rate": 0.00028870198625002516, + "loss": 0.6865, + "step": 205 + }, + { + "epoch": 0.15, + "learning_rate": 0.00028856472921673067, + "loss": 0.3938, + "step": 206 + }, + { + "epoch": 0.15, + "learning_rate": 0.00028842667648802846, + "loss": 0.5073, + "step": 207 + }, + { + "epoch": 0.15, + "learning_rate": 0.0002882878288566738, + "loss": 0.4465, + "step": 208 + }, + { + "epoch": 0.15, + "learning_rate": 0.00028814818711998667, + "loss": 0.3318, + "step": 209 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002880077520798471, + "loss": 0.3051, + "step": 210 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002878665245426905, + "loss": 0.5964, + "step": 211 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028772450531950325, + "loss": 0.6703, + "step": 212 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002875816952258179, + "loss": 0.5141, + "step": 213 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002874380950817085, + "loss": 0.3797, + "step": 214 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028729370571178587, + "loss": 0.4755, + "step": 215 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028714852794519286, + "loss": 0.2632, + "step": 216 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002870025626155996, + "loss": 0.5007, + "step": 217 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028685581056119887, + "loss": 0.5119, + "step": 218 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028670827262470097, + "loss": 0.3364, + "step": 219 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002865599496533292, + "loss": 0.3543, + "step": 220 + }, + { + "epoch": 0.16, + "learning_rate": 0.00028641084249881476, + "loss": 0.6461, + "step": 221 + }, + { + "epoch": 0.16, + "learning_rate": 0.000286260952017392, + "loss": 0.3578, + "step": 222 + }, + { + "epoch": 0.16, + "learning_rate": 0.0002861102790697934, + "loss": 0.5015, + "step": 223 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028595882452124474, + "loss": 0.2236, + "step": 224 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028580658924146004, + "loss": 0.6078, + "step": 225 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028565357410463663, + "loss": 0.745, + "step": 226 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028549977998945, + "loss": 0.4428, + "step": 227 + }, + { + "epoch": 0.17, + "learning_rate": 0.000285345207779049, + "loss": 0.4141, + "step": 228 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002851898583610505, + "loss": 0.2973, + "step": 229 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028503373262753456, + "loss": 0.3884, + "step": 230 + }, + { + "epoch": 0.17, + "learning_rate": 0.000284876831475039, + "loss": 0.4595, + "step": 231 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002847191558045544, + "loss": 0.7463, + "step": 232 + }, + { + "epoch": 0.17, + "learning_rate": 0.0002845607065215191, + "loss": 0.5798, + "step": 233 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028440148453581375, + "loss": 0.4011, + "step": 234 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028424149076175623, + "loss": 0.4742, + "step": 235 + }, + { + "epoch": 0.17, + "learning_rate": 0.00028408072611809624, + "loss": 0.3255, + "step": 236 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002839191915280102, + "loss": 0.4717, + "step": 237 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002837568879190958, + "loss": 0.4492, + "step": 238 + }, + { + "epoch": 0.18, + "learning_rate": 0.00028359381622336684, + "loss": 0.4957, + "step": 239 + }, + { + "epoch": 0.18, + "learning_rate": 0.00028342997737724785, + "loss": 0.5463, + "step": 240 + }, + { + "epoch": 0.18, + "learning_rate": 0.00028326537232156853, + "loss": 0.3134, + "step": 241 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002831000020015585, + "loss": 0.4342, + "step": 242 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002829338673668418, + "loss": 0.2409, + "step": 243 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002827669693714315, + "loss": 0.4877, + "step": 244 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002825993089737243, + "loss": 0.6513, + "step": 245 + }, + { + "epoch": 0.18, + "learning_rate": 0.0002824308871364948, + "loss": 0.5354, + "step": 246 + }, + { + "epoch": 0.18, + "learning_rate": 0.00028226170482689017, + "loss": 0.7068, + "step": 247 + }, + { + "epoch": 0.18, + "learning_rate": 0.00028209176301642456, + "loss": 0.4029, + "step": 248 + }, + { + "epoch": 0.18, + "learning_rate": 0.00028192106268097334, + "loss": 0.5173, + "step": 249 + }, + { + "epoch": 0.18, + "learning_rate": 0.00028174960480076776, + "loss": 0.3557, + "step": 250 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002815773903603891, + "loss": 0.2404, + "step": 251 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002814044203487633, + "loss": 0.3994, + "step": 252 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002812306957591549, + "loss": 0.4771, + "step": 253 + }, + { + "epoch": 0.19, + "learning_rate": 0.00028105621758916165, + "loss": 0.6123, + "step": 254 + }, + { + "epoch": 0.19, + "learning_rate": 0.00028088098684070866, + "loss": 0.6972, + "step": 255 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002807050045200426, + "loss": 0.6911, + "step": 256 + }, + { + "epoch": 0.19, + "learning_rate": 0.00028052827163772603, + "loss": 0.3621, + "step": 257 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002803507892086315, + "loss": 0.3574, + "step": 258 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002801725582519358, + "loss": 0.6216, + "step": 259 + }, + { + "epoch": 0.19, + "learning_rate": 0.0002799935797911141, + "loss": 0.4743, + "step": 260 + }, + { + "epoch": 0.19, + "learning_rate": 0.00027981385485393386, + "loss": 0.473, + "step": 261 + }, + { + "epoch": 0.19, + "learning_rate": 0.00027963338447244937, + "loss": 0.5412, + "step": 262 + }, + { + "epoch": 0.19, + "learning_rate": 0.00027945216968299543, + "loss": 0.2539, + "step": 263 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002792702115261816, + "loss": 0.1642, + "step": 264 + }, + { + "epoch": 0.2, + "learning_rate": 0.00027908751104688604, + "loss": 0.4496, + "step": 265 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002789040692942497, + "loss": 0.7403, + "step": 266 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002787198873216703, + "loss": 0.5017, + "step": 267 + }, + { + "epoch": 0.2, + "learning_rate": 0.00027853496618679614, + "loss": 0.7619, + "step": 268 + }, + { + "epoch": 0.2, + "learning_rate": 0.00027834930695152, + "loss": 0.7091, + "step": 269 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002781629106819733, + "loss": 0.3183, + "step": 270 + }, + { + "epoch": 0.2, + "learning_rate": 0.00027797577844851964, + "loss": 0.3979, + "step": 271 + }, + { + "epoch": 0.2, + "learning_rate": 0.000277787911325749, + "loss": 0.6251, + "step": 272 + }, + { + "epoch": 0.2, + "learning_rate": 0.00027759931039247134, + "loss": 0.5845, + "step": 273 + }, + { + "epoch": 0.2, + "learning_rate": 0.00027740997673171043, + "loss": 0.7068, + "step": 274 + }, + { + "epoch": 0.2, + "learning_rate": 0.00027721991143069767, + "loss": 0.372, + "step": 275 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002770291155808657, + "loss": 0.8528, + "step": 276 + }, + { + "epoch": 0.2, + "learning_rate": 0.0002768375902778426, + "loss": 0.3525, + "step": 277 + }, + { + "epoch": 0.21, + "learning_rate": 0.00027664533662144497, + "loss": 0.2443, + "step": 278 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002764523557156721, + "loss": 0.6492, + "step": 279 + }, + { + "epoch": 0.21, + "learning_rate": 0.00027625864866869925, + "loss": 0.568, + "step": 280 + }, + { + "epoch": 0.21, + "learning_rate": 0.00027606421659287164, + "loss": 0.5191, + "step": 281 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002758690606046979, + "loss": 0.4665, + "step": 282 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002756731818248436, + "loss": 0.9246, + "step": 283 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002754765813781248, + "loss": 0.2443, + "step": 284 + }, + { + "epoch": 0.21, + "learning_rate": 0.00027527926039350186, + "loss": 0.5107, + "step": 285 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002750812200040727, + "loss": 0.2779, + "step": 286 + }, + { + "epoch": 0.21, + "learning_rate": 0.0002748824613470662, + "loss": 0.3306, + "step": 287 + }, + { + "epoch": 0.21, + "learning_rate": 0.000274682985563836, + "loss": 0.4641, + "step": 288 + }, + { + "epoch": 0.21, + "learning_rate": 0.00027448279379985375, + "loss": 0.3174, + "step": 289 + }, + { + "epoch": 0.21, + "learning_rate": 0.00027428188720470247, + "loss": 0.3527, + "step": 290 + }, + { + "epoch": 0.22, + "learning_rate": 0.00027408026693207015, + "loss": 0.3878, + "step": 291 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002738779341397429, + "loss": 0.3586, + "step": 292 + }, + { + "epoch": 0.22, + "learning_rate": 0.00027367488998959847, + "loss": 0.5559, + "step": 293 + }, + { + "epoch": 0.22, + "learning_rate": 0.00027347113564759945, + "loss": 0.739, + "step": 294 + }, + { + "epoch": 0.22, + "learning_rate": 0.00027326667228378673, + "loss": 0.2517, + "step": 295 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002730615010722727, + "loss": 0.3284, + "step": 296 + }, + { + "epoch": 0.22, + "learning_rate": 0.00027285562319123447, + "loss": 0.7683, + "step": 297 + }, + { + "epoch": 0.22, + "learning_rate": 0.00027264903982290704, + "loss": 0.777, + "step": 298 + }, + { + "epoch": 0.22, + "learning_rate": 0.00027244175215357673, + "loss": 0.5519, + "step": 299 + }, + { + "epoch": 0.22, + "learning_rate": 0.00027223376137357425, + "loss": 0.6937, + "step": 300 + }, + { + "epoch": 0.22, + "eval_loss": 0.7280155420303345, + "eval_runtime": 13.0039, + "eval_samples_per_second": 0.846, + "eval_steps_per_second": 0.846, + "step": 300 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002720250686772677, + "loss": 0.5711, + "step": 301 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002718156752630561, + "loss": 0.2999, + "step": 302 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002716055823333622, + "loss": 0.551, + "step": 303 + }, + { + "epoch": 0.22, + "learning_rate": 0.0002713947910946255, + "loss": 0.5503, + "step": 304 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027118330275729567, + "loss": 0.3301, + "step": 305 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027097111853582526, + "loss": 0.4078, + "step": 306 + }, + { + "epoch": 0.23, + "learning_rate": 0.00027075823964866307, + "loss": 0.2179, + "step": 307 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002705446673182467, + "loss": 0.2818, + "step": 308 + }, + { + "epoch": 0.23, + "learning_rate": 0.000270330402770996, + "loss": 0.2603, + "step": 309 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002701154472373057, + "loss": 0.3626, + "step": 310 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002698998019515385, + "loss": 0.3824, + "step": 311 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002696834681520178, + "loss": 0.5088, + "step": 312 + }, + { + "epoch": 0.23, + "learning_rate": 0.0002694664470810211, + "loss": 0.5815, + "step": 313 + }, + { + "epoch": 0.23, + "learning_rate": 0.000269248739984772, + "loss": 0.3339, + "step": 314 + }, + { + "epoch": 0.23, + "learning_rate": 0.00026903034811343393, + "loss": 0.4999, + "step": 315 + }, + { + "epoch": 0.23, + "learning_rate": 0.00026881127272110236, + "loss": 0.5024, + "step": 316 + }, + { + "epoch": 0.23, + "learning_rate": 0.00026859151506579795, + "loss": 0.5164, + "step": 317 + }, + { + "epoch": 0.24, + "learning_rate": 0.00026837107640945905, + "loss": 0.3513, + "step": 318 + }, + { + "epoch": 0.24, + "learning_rate": 0.0002681499580179347, + "loss": 0.2102, + "step": 319 + }, + { + "epoch": 0.24, + "learning_rate": 0.0002679281611609773, + "loss": 0.6105, + "step": 320 + }, + { + "epoch": 0.24, + "learning_rate": 0.00026770568711223514, + "loss": 0.4672, + "step": 321 + }, + { + "epoch": 0.24, + "learning_rate": 0.00026748253714924537, + "loss": 0.1856, + "step": 322 + }, + { + "epoch": 0.24, + "learning_rate": 0.00026725871255342635, + "loss": 0.2632, + "step": 323 + }, + { + "epoch": 0.24, + "learning_rate": 0.0002670342146100707, + "loss": 0.443, + "step": 324 + }, + { + "epoch": 0.24, + "learning_rate": 0.00026680904460833733, + "loss": 0.2832, + "step": 325 + }, + { + "epoch": 0.24, + "learning_rate": 0.0002665832038412447, + "loss": 0.4782, + "step": 326 + }, + { + "epoch": 0.24, + "learning_rate": 0.00026635669360566296, + "loss": 0.5187, + "step": 327 + }, + { + "epoch": 0.24, + "learning_rate": 0.0002661295152023066, + "loss": 0.3699, + "step": 328 + }, + { + "epoch": 0.24, + "learning_rate": 0.000265901669935727, + "loss": 0.3705, + "step": 329 + }, + { + "epoch": 0.24, + "learning_rate": 0.00026567315911430497, + "loss": 0.4908, + "step": 330 + }, + { + "epoch": 0.24, + "learning_rate": 0.0002654439840502433, + "loss": 0.4913, + "step": 331 + }, + { + "epoch": 0.25, + "learning_rate": 0.000265214146059559, + "loss": 0.3055, + "step": 332 + }, + { + "epoch": 0.25, + "learning_rate": 0.00026498364646207576, + "loss": 0.3872, + "step": 333 + }, + { + "epoch": 0.25, + "learning_rate": 0.00026475248658141686, + "loss": 0.3888, + "step": 334 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002645206677449968, + "loss": 0.5704, + "step": 335 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002642881912840144, + "loss": 0.4121, + "step": 336 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002640550585334446, + "loss": 0.1917, + "step": 337 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002638212708320312, + "loss": 0.7044, + "step": 338 + }, + { + "epoch": 0.25, + "learning_rate": 0.0002635868295222787, + "loss": 0.6185, + "step": 339 + }, + { + "epoch": 0.25, + "learning_rate": 0.00026335173595044527, + "loss": 0.2487, + "step": 340 + }, + { + "epoch": 0.25, + "learning_rate": 0.00026311599146653443, + "loss": 0.6175, + "step": 341 + }, + { + "epoch": 0.25, + "learning_rate": 0.00026287959742428744, + "loss": 0.4304, + "step": 342 + }, + { + "epoch": 0.25, + "learning_rate": 0.00026264255518117577, + "loss": 0.2045, + "step": 343 + }, + { + "epoch": 0.25, + "learning_rate": 0.00026240486609839297, + "loss": 0.5652, + "step": 344 + }, + { + "epoch": 0.26, + "learning_rate": 0.00026216653154084713, + "loss": 0.5883, + "step": 345 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002619275528771528, + "loss": 0.2128, + "step": 346 + }, + { + "epoch": 0.26, + "learning_rate": 0.00026168793147962334, + "loss": 0.3772, + "step": 347 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002614476687242629, + "loss": 0.4098, + "step": 348 + }, + { + "epoch": 0.26, + "learning_rate": 0.00026120676599075866, + "loss": 0.3191, + "step": 349 + }, + { + "epoch": 0.26, + "learning_rate": 0.00026096522466247265, + "loss": 0.7856, + "step": 350 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002607230461264342, + "loss": 0.5807, + "step": 351 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002604802317733315, + "loss": 0.3925, + "step": 352 + }, + { + "epoch": 0.26, + "learning_rate": 0.000260236782997504, + "loss": 0.3968, + "step": 353 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002599927011969343, + "loss": 0.6506, + "step": 354 + }, + { + "epoch": 0.26, + "learning_rate": 0.00025974798777324, + "loss": 0.9081, + "step": 355 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002595026441316657, + "loss": 0.5228, + "step": 356 + }, + { + "epoch": 0.26, + "learning_rate": 0.0002592566716810752, + "loss": 0.3975, + "step": 357 + }, + { + "epoch": 0.26, + "learning_rate": 0.00025901007183394293, + "loss": 0.4149, + "step": 358 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002587628460063462, + "loss": 0.3914, + "step": 359 + }, + { + "epoch": 0.27, + "learning_rate": 0.00025851499561795684, + "loss": 0.8105, + "step": 360 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002582665220920334, + "loss": 0.6406, + "step": 361 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002580174268554125, + "loss": 0.7106, + "step": 362 + }, + { + "epoch": 0.27, + "learning_rate": 0.000257767711338501, + "loss": 0.484, + "step": 363 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002575173769752677, + "loss": 0.375, + "step": 364 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002572664252032349, + "loss": 0.2236, + "step": 365 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002570148574634705, + "loss": 0.3452, + "step": 366 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002567626752005794, + "loss": 0.2653, + "step": 367 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002565098798626955, + "loss": 0.6595, + "step": 368 + }, + { + "epoch": 0.27, + "learning_rate": 0.00025625647290147303, + "loss": 0.4133, + "step": 369 + }, + { + "epoch": 0.27, + "learning_rate": 0.00025600245577207857, + "loss": 0.5276, + "step": 370 + }, + { + "epoch": 0.27, + "learning_rate": 0.0002557478299331825, + "loss": 0.3904, + "step": 371 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002554925968469506, + "loss": 0.3788, + "step": 372 + }, + { + "epoch": 0.28, + "learning_rate": 0.00025523675797903576, + "loss": 0.3834, + "step": 373 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002549803147985695, + "loss": 0.3585, + "step": 374 + }, + { + "epoch": 0.28, + "learning_rate": 0.00025472326877815354, + "loss": 0.2435, + "step": 375 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002544656213938514, + "loss": 0.4661, + "step": 376 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002542073741251799, + "loss": 0.6525, + "step": 377 + }, + { + "epoch": 0.28, + "learning_rate": 0.00025394852845510047, + "loss": 0.7845, + "step": 378 + }, + { + "epoch": 0.28, + "learning_rate": 0.000253689085870011, + "loss": 1.0962, + "step": 379 + }, + { + "epoch": 0.28, + "learning_rate": 0.00025342904785973706, + "loss": 0.6062, + "step": 380 + }, + { + "epoch": 0.28, + "learning_rate": 0.00025316841591752336, + "loss": 0.5632, + "step": 381 + }, + { + "epoch": 0.28, + "learning_rate": 0.00025290719154002513, + "loss": 0.2323, + "step": 382 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002526453762272998, + "loss": 0.6669, + "step": 383 + }, + { + "epoch": 0.28, + "learning_rate": 0.0002523829714827981, + "loss": 0.3308, + "step": 384 + }, + { + "epoch": 0.28, + "learning_rate": 0.00025211997881335543, + "loss": 0.641, + "step": 385 + }, + { + "epoch": 0.29, + "learning_rate": 0.00025185639972918346, + "loss": 0.5766, + "step": 386 + }, + { + "epoch": 0.29, + "learning_rate": 0.00025159223574386114, + "loss": 0.5476, + "step": 387 + }, + { + "epoch": 0.29, + "learning_rate": 0.00025132748837432625, + "loss": 0.3965, + "step": 388 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002510621591408666, + "loss": 0.6232, + "step": 389 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002507962495671113, + "loss": 0.4936, + "step": 390 + }, + { + "epoch": 0.29, + "learning_rate": 0.00025052976118002194, + "loss": 0.4642, + "step": 391 + }, + { + "epoch": 0.29, + "learning_rate": 0.00025026269550988396, + "loss": 0.488, + "step": 392 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002499950540902978, + "loss": 0.8166, + "step": 393 + }, + { + "epoch": 0.29, + "learning_rate": 0.00024972683845817005, + "loss": 0.404, + "step": 394 + }, + { + "epoch": 0.29, + "learning_rate": 0.00024945805015370474, + "loss": 0.469, + "step": 395 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002491886907203943, + "loss": 0.5544, + "step": 396 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002489187617050109, + "loss": 0.3086, + "step": 397 + }, + { + "epoch": 0.29, + "learning_rate": 0.0002486482646575975, + "loss": 0.4426, + "step": 398 + }, + { + "epoch": 0.29, + "learning_rate": 0.00024837720113145886, + "loss": 0.5296, + "step": 399 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002481055726831528, + "loss": 0.345, + "step": 400 + }, + { + "epoch": 0.3, + "eval_loss": 0.7337626814842224, + "eval_runtime": 13.0144, + "eval_samples_per_second": 0.845, + "eval_steps_per_second": 0.845, + "step": 400 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002478333808724809, + "loss": 0.4889, + "step": 401 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002475606272624801, + "loss": 0.2944, + "step": 402 + }, + { + "epoch": 0.3, + "learning_rate": 0.00024728731341941337, + "loss": 0.3656, + "step": 403 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002470134409127606, + "loss": 0.4308, + "step": 404 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002467390113152099, + "loss": 0.3265, + "step": 405 + }, + { + "epoch": 0.3, + "learning_rate": 0.00024646402620264834, + "loss": 0.166, + "step": 406 + }, + { + "epoch": 0.3, + "learning_rate": 0.00024618848715415306, + "loss": 0.4762, + "step": 407 + }, + { + "epoch": 0.3, + "learning_rate": 0.00024591239575198206, + "loss": 0.6291, + "step": 408 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002456357535815652, + "loss": 0.5504, + "step": 409 + }, + { + "epoch": 0.3, + "learning_rate": 0.00024535856223149524, + "loss": 0.3328, + "step": 410 + }, + { + "epoch": 0.3, + "learning_rate": 0.0002450808232935181, + "loss": 0.5982, + "step": 411 + }, + { + "epoch": 0.3, + "learning_rate": 0.00024480253836252477, + "loss": 0.3765, + "step": 412 + }, + { + "epoch": 0.31, + "learning_rate": 0.00024452370903654114, + "loss": 0.5533, + "step": 413 + }, + { + "epoch": 0.31, + "learning_rate": 0.00024424433691671936, + "loss": 0.3661, + "step": 414 + }, + { + "epoch": 0.31, + "learning_rate": 0.00024396442360732852, + "loss": 0.3451, + "step": 415 + }, + { + "epoch": 0.31, + "learning_rate": 0.00024368397071574546, + "loss": 0.1914, + "step": 416 + }, + { + "epoch": 0.31, + "learning_rate": 0.00024340297985244556, + "loss": 0.2141, + "step": 417 + }, + { + "epoch": 0.31, + "learning_rate": 0.00024312145263099327, + "loss": 0.2406, + "step": 418 + }, + { + "epoch": 0.31, + "learning_rate": 0.00024283939066803333, + "loss": 0.5337, + "step": 419 + }, + { + "epoch": 0.31, + "learning_rate": 0.00024255679558328087, + "loss": 0.3128, + "step": 420 + }, + { + "epoch": 0.31, + "learning_rate": 0.00024227366899951265, + "loss": 0.2907, + "step": 421 + }, + { + "epoch": 0.31, + "learning_rate": 0.00024199001254255744, + "loss": 0.4188, + "step": 422 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002417058278412867, + "loss": 0.4719, + "step": 423 + }, + { + "epoch": 0.31, + "learning_rate": 0.00024142111652760535, + "loss": 0.5886, + "step": 424 + }, + { + "epoch": 0.31, + "learning_rate": 0.00024113588023644237, + "loss": 0.5553, + "step": 425 + }, + { + "epoch": 0.31, + "learning_rate": 0.0002408501206057413, + "loss": 0.4051, + "step": 426 + }, + { + "epoch": 0.32, + "learning_rate": 0.00024056383927645094, + "loss": 0.4207, + "step": 427 + }, + { + "epoch": 0.32, + "learning_rate": 0.00024027703789251587, + "loss": 0.4121, + "step": 428 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002399897181008671, + "loss": 0.4287, + "step": 429 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002397018815514125, + "loss": 0.3167, + "step": 430 + }, + { + "epoch": 0.32, + "learning_rate": 0.00023941352989702733, + "loss": 0.4941, + "step": 431 + }, + { + "epoch": 0.32, + "learning_rate": 0.00023912466479354493, + "loss": 0.4612, + "step": 432 + }, + { + "epoch": 0.32, + "learning_rate": 0.000238835287899747, + "loss": 0.6483, + "step": 433 + }, + { + "epoch": 0.32, + "learning_rate": 0.00023854540087735407, + "loss": 0.7222, + "step": 434 + }, + { + "epoch": 0.32, + "learning_rate": 0.00023825500539101615, + "loss": 0.5733, + "step": 435 + }, + { + "epoch": 0.32, + "learning_rate": 0.000237964103108303, + "loss": 0.3045, + "step": 436 + }, + { + "epoch": 0.32, + "learning_rate": 0.0002376726956996947, + "loss": 0.6636, + "step": 437 + }, + { + "epoch": 0.32, + "learning_rate": 0.00023738078483857177, + "loss": 0.5089, + "step": 438 + }, + { + "epoch": 0.32, + "learning_rate": 0.00023708837220120602, + "loss": 0.2181, + "step": 439 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002367954594667505, + "loss": 0.4506, + "step": 440 + }, + { + "epoch": 0.33, + "learning_rate": 0.00023650204831723008, + "loss": 0.6349, + "step": 441 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002362081404375316, + "loss": 0.5496, + "step": 442 + }, + { + "epoch": 0.33, + "learning_rate": 0.00023591373751539454, + "loss": 0.6185, + "step": 443 + }, + { + "epoch": 0.33, + "learning_rate": 0.000235618841241401, + "loss": 0.4223, + "step": 444 + }, + { + "epoch": 0.33, + "learning_rate": 0.00023532345330896617, + "loss": 0.3419, + "step": 445 + }, + { + "epoch": 0.33, + "learning_rate": 0.00023502757541432837, + "loss": 0.239, + "step": 446 + }, + { + "epoch": 0.33, + "learning_rate": 0.00023473120925653973, + "loss": 0.6236, + "step": 447 + }, + { + "epoch": 0.33, + "learning_rate": 0.00023443435653745605, + "loss": 0.6115, + "step": 448 + }, + { + "epoch": 0.33, + "learning_rate": 0.00023413701896172714, + "loss": 0.416, + "step": 449 + }, + { + "epoch": 0.33, + "learning_rate": 0.00023383919823678713, + "loss": 0.3178, + "step": 450 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002335408960728445, + "loss": 0.3594, + "step": 451 + }, + { + "epoch": 0.33, + "learning_rate": 0.0002332421141828725, + "loss": 0.4471, + "step": 452 + }, + { + "epoch": 0.33, + "learning_rate": 0.00023294285428259904, + "loss": 0.2383, + "step": 453 + }, + { + "epoch": 0.34, + "learning_rate": 0.000232643118090497, + "loss": 0.3883, + "step": 454 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002323429073277743, + "loss": 0.5119, + "step": 455 + }, + { + "epoch": 0.34, + "learning_rate": 0.00023204222371836405, + "loss": 0.2078, + "step": 456 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002317410689889146, + "loss": 0.3848, + "step": 457 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002314394448687798, + "loss": 0.5993, + "step": 458 + }, + { + "epoch": 0.34, + "learning_rate": 0.00023113735309000886, + "loss": 0.539, + "step": 459 + }, + { + "epoch": 0.34, + "learning_rate": 0.00023083479538733634, + "loss": 0.4248, + "step": 460 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002305317734981725, + "loss": 0.3614, + "step": 461 + }, + { + "epoch": 0.34, + "learning_rate": 0.00023022828916259306, + "loss": 0.5128, + "step": 462 + }, + { + "epoch": 0.34, + "learning_rate": 0.00022992434412332942, + "loss": 0.4425, + "step": 463 + }, + { + "epoch": 0.34, + "learning_rate": 0.0002296199401257584, + "loss": 0.3218, + "step": 464 + }, + { + "epoch": 0.34, + "learning_rate": 0.00022931507891789239, + "loss": 0.3231, + "step": 465 + }, + { + "epoch": 0.34, + "learning_rate": 0.00022900976225036923, + "loss": 0.7137, + "step": 466 + }, + { + "epoch": 0.35, + "learning_rate": 0.00022870399187644225, + "loss": 0.3365, + "step": 467 + }, + { + "epoch": 0.35, + "learning_rate": 0.00022839776955197014, + "loss": 0.9577, + "step": 468 + }, + { + "epoch": 0.35, + "learning_rate": 0.00022809109703540682, + "loss": 0.2926, + "step": 469 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002277839760877915, + "loss": 0.2721, + "step": 470 + }, + { + "epoch": 0.35, + "learning_rate": 0.00022747640847273838, + "loss": 0.3, + "step": 471 + }, + { + "epoch": 0.35, + "learning_rate": 0.00022716839595642664, + "loss": 0.5151, + "step": 472 + }, + { + "epoch": 0.35, + "learning_rate": 0.0002268599403075903, + "loss": 0.4312, + "step": 473 + }, + { + "epoch": 0.35, + "learning_rate": 0.00022655104329750785, + "loss": 0.2957, + "step": 474 + }, + { + "epoch": 0.35, + "learning_rate": 0.00022624170669999252, + "loss": 0.5483, + "step": 475 + }, + { + "epoch": 0.35, + "learning_rate": 0.00022593193229138167, + "loss": 0.3159, + "step": 476 + }, + { + "epoch": 0.35, + "learning_rate": 0.00022562172185052677, + "loss": 0.2589, + "step": 477 + }, + { + "epoch": 0.35, + "learning_rate": 0.00022531107715878323, + "loss": 0.3596, + "step": 478 + }, + { + "epoch": 0.35, + "learning_rate": 0.000225, + "loss": 0.2969, + "step": 479 + }, + { + "epoch": 0.35, + "learning_rate": 0.00022468849216050952, + "loss": 0.5015, + "step": 480 + }, + { + "epoch": 0.36, + "learning_rate": 0.00022437655542911729, + "loss": 0.6868, + "step": 481 + }, + { + "epoch": 0.36, + "learning_rate": 0.00022406419159709184, + "loss": 0.3607, + "step": 482 + }, + { + "epoch": 0.36, + "learning_rate": 0.00022375140245815405, + "loss": 0.801, + "step": 483 + }, + { + "epoch": 0.36, + "learning_rate": 0.00022343818980846738, + "loss": 0.3641, + "step": 484 + }, + { + "epoch": 0.36, + "learning_rate": 0.000223124555446627, + "loss": 0.553, + "step": 485 + }, + { + "epoch": 0.36, + "learning_rate": 0.00022281050117364982, + "loss": 0.874, + "step": 486 + }, + { + "epoch": 0.36, + "learning_rate": 0.00022249602879296414, + "loss": 0.2427, + "step": 487 + }, + { + "epoch": 0.36, + "learning_rate": 0.0002221811401103991, + "loss": 0.2261, + "step": 488 + }, + { + "epoch": 0.36, + "learning_rate": 0.00022186583693417433, + "loss": 0.4712, + "step": 489 + }, + { + "epoch": 0.36, + "learning_rate": 0.00022155012107488988, + "loss": 0.6684, + "step": 490 + }, + { + "epoch": 0.36, + "learning_rate": 0.00022123399434551545, + "loss": 0.4416, + "step": 491 + }, + { + "epoch": 0.36, + "learning_rate": 0.00022091745856138014, + "loss": 0.349, + "step": 492 + }, + { + "epoch": 0.36, + "learning_rate": 0.000220600515540162, + "loss": 0.2384, + "step": 493 + }, + { + "epoch": 0.37, + "learning_rate": 0.00022028316710187768, + "loss": 0.4482, + "step": 494 + }, + { + "epoch": 0.37, + "learning_rate": 0.00021996541506887185, + "loss": 0.4404, + "step": 495 + }, + { + "epoch": 0.37, + "learning_rate": 0.00021964726126580668, + "loss": 0.2518, + "step": 496 + }, + { + "epoch": 0.37, + "learning_rate": 0.00021932870751965167, + "loss": 0.2758, + "step": 497 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002190097556596728, + "loss": 0.6988, + "step": 498 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002186904075174223, + "loss": 0.3609, + "step": 499 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002183706649267279, + "loss": 0.5053, + "step": 500 + }, + { + "epoch": 0.37, + "eval_loss": 0.7127830386161804, + "eval_runtime": 12.994, + "eval_samples_per_second": 0.847, + "eval_steps_per_second": 0.847, + "step": 500 + }, + { + "epoch": 0.37, + "learning_rate": 0.00021805052972368244, + "loss": 0.6729, + "step": 501 + }, + { + "epoch": 0.37, + "learning_rate": 0.0002177300037466334, + "loss": 0.529, + "step": 502 + }, + { + "epoch": 0.37, + "learning_rate": 0.00021740908883617217, + "loss": 0.5612, + "step": 503 + }, + { + "epoch": 0.37, + "learning_rate": 0.00021708778683512345, + "loss": 0.475, + "step": 504 + }, + { + "epoch": 0.37, + "learning_rate": 0.000216766099588535, + "loss": 0.548, + "step": 505 + }, + { + "epoch": 0.37, + "learning_rate": 0.00021644402894366668, + "loss": 0.3165, + "step": 506 + }, + { + "epoch": 0.37, + "learning_rate": 0.00021612157674998, + "loss": 0.2605, + "step": 507 + }, + { + "epoch": 0.38, + "learning_rate": 0.00021579874485912743, + "loss": 0.4271, + "step": 508 + }, + { + "epoch": 0.38, + "learning_rate": 0.00021547553512494186, + "loss": 0.4293, + "step": 509 + }, + { + "epoch": 0.38, + "learning_rate": 0.00021515194940342598, + "loss": 0.4034, + "step": 510 + }, + { + "epoch": 0.38, + "learning_rate": 0.00021482798955274146, + "loss": 0.3694, + "step": 511 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002145036574331984, + "loss": 0.1641, + "step": 512 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002141789549072446, + "loss": 0.6786, + "step": 513 + }, + { + "epoch": 0.38, + "learning_rate": 0.00021385388383945493, + "loss": 0.5022, + "step": 514 + }, + { + "epoch": 0.38, + "learning_rate": 0.00021352844609652056, + "loss": 0.5273, + "step": 515 + }, + { + "epoch": 0.38, + "learning_rate": 0.00021320264354723824, + "loss": 0.6755, + "step": 516 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002128764780624996, + "loss": 0.1676, + "step": 517 + }, + { + "epoch": 0.38, + "learning_rate": 0.00021254995151528041, + "loss": 0.4453, + "step": 518 + }, + { + "epoch": 0.38, + "learning_rate": 0.0002122230657806298, + "loss": 0.3622, + "step": 519 + }, + { + "epoch": 0.38, + "learning_rate": 0.00021189582273565942, + "loss": 0.8535, + "step": 520 + }, + { + "epoch": 0.39, + "learning_rate": 0.00021156822425953293, + "loss": 0.4528, + "step": 521 + }, + { + "epoch": 0.39, + "learning_rate": 0.00021124027223345487, + "loss": 0.3618, + "step": 522 + }, + { + "epoch": 0.39, + "learning_rate": 0.00021091196854066001, + "loss": 0.2572, + "step": 523 + }, + { + "epoch": 0.39, + "learning_rate": 0.00021058331506640255, + "loss": 0.5138, + "step": 524 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002102543136979454, + "loss": 0.4255, + "step": 525 + }, + { + "epoch": 0.39, + "learning_rate": 0.00020992496632454908, + "loss": 0.6983, + "step": 526 + }, + { + "epoch": 0.39, + "learning_rate": 0.00020959527483746098, + "loss": 0.4843, + "step": 527 + }, + { + "epoch": 0.39, + "learning_rate": 0.00020926524112990465, + "loss": 0.708, + "step": 528 + }, + { + "epoch": 0.39, + "learning_rate": 0.00020893486709706887, + "loss": 0.5213, + "step": 529 + }, + { + "epoch": 0.39, + "learning_rate": 0.00020860415463609647, + "loss": 0.5286, + "step": 530 + }, + { + "epoch": 0.39, + "learning_rate": 0.00020827310564607384, + "loss": 0.6407, + "step": 531 + }, + { + "epoch": 0.39, + "learning_rate": 0.00020794172202801976, + "loss": 0.7612, + "step": 532 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002076100056848748, + "loss": 0.2932, + "step": 533 + }, + { + "epoch": 0.39, + "learning_rate": 0.0002072779585214899, + "loss": 0.693, + "step": 534 + }, + { + "epoch": 0.4, + "learning_rate": 0.00020694558244461589, + "loss": 0.5049, + "step": 535 + }, + { + "epoch": 0.4, + "learning_rate": 0.00020661287936289213, + "loss": 0.2617, + "step": 536 + }, + { + "epoch": 0.4, + "learning_rate": 0.00020627985118683615, + "loss": 0.6146, + "step": 537 + }, + { + "epoch": 0.4, + "learning_rate": 0.00020594649982883198, + "loss": 0.2204, + "step": 538 + }, + { + "epoch": 0.4, + "learning_rate": 0.00020561282720311954, + "loss": 0.2412, + "step": 539 + }, + { + "epoch": 0.4, + "learning_rate": 0.0002052788352257838, + "loss": 0.6613, + "step": 540 + }, + { + "epoch": 0.4, + "learning_rate": 0.00020494452581474337, + "loss": 0.2795, + "step": 541 + }, + { + "epoch": 0.4, + "learning_rate": 0.00020460990088973975, + "loss": 0.2773, + "step": 542 + }, + { + "epoch": 0.4, + "learning_rate": 0.00020427496237232626, + "loss": 0.1297, + "step": 543 + }, + { + "epoch": 0.4, + "learning_rate": 0.00020393971218585704, + "loss": 0.5855, + "step": 544 + }, + { + "epoch": 0.4, + "learning_rate": 0.00020360415225547594, + "loss": 0.6115, + "step": 545 + }, + { + "epoch": 0.4, + "learning_rate": 0.00020326828450810544, + "loss": 0.6751, + "step": 546 + }, + { + "epoch": 0.4, + "learning_rate": 0.0002029321108724356, + "loss": 0.3769, + "step": 547 + }, + { + "epoch": 0.41, + "learning_rate": 0.00020259563327891316, + "loss": 0.4872, + "step": 548 + }, + { + "epoch": 0.41, + "learning_rate": 0.00020225885365973017, + "loss": 0.2956, + "step": 549 + }, + { + "epoch": 0.41, + "learning_rate": 0.00020192177394881317, + "loss": 0.1986, + "step": 550 + }, + { + "epoch": 0.41, + "learning_rate": 0.00020158439608181173, + "loss": 0.5691, + "step": 551 + }, + { + "epoch": 0.41, + "learning_rate": 0.00020124672199608787, + "loss": 0.4556, + "step": 552 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002009087536307043, + "loss": 0.5757, + "step": 553 + }, + { + "epoch": 0.41, + "learning_rate": 0.00020057049292641386, + "loss": 0.5785, + "step": 554 + }, + { + "epoch": 0.41, + "learning_rate": 0.0002002319418256479, + "loss": 0.7793, + "step": 555 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019989310227250555, + "loss": 0.6425, + "step": 556 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019955397621274217, + "loss": 0.326, + "step": 557 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019921456559375854, + "loss": 0.5331, + "step": 558 + }, + { + "epoch": 0.41, + "learning_rate": 0.0001988748723645893, + "loss": 0.4126, + "step": 559 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019853489847589212, + "loss": 0.2713, + "step": 560 + }, + { + "epoch": 0.41, + "learning_rate": 0.00019819464587993624, + "loss": 0.5311, + "step": 561 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019785411653059138, + "loss": 0.1802, + "step": 562 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019751331238331644, + "loss": 0.2863, + "step": 563 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019717223539514846, + "loss": 0.2984, + "step": 564 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001968308875246911, + "loss": 0.3554, + "step": 565 + }, + { + "epoch": 0.42, + "learning_rate": 0.0001964892707321036, + "loss": 0.7062, + "step": 566 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019614738697908946, + "loss": 0.5257, + "step": 567 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019580523822888512, + "loss": 0.4245, + "step": 568 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019546282644624885, + "loss": 0.6944, + "step": 569 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019512015359744923, + "loss": 0.7338, + "step": 570 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019477722165025418, + "loss": 0.3573, + "step": 571 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019443403257391923, + "loss": 0.2873, + "step": 572 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019409058833917665, + "loss": 0.7692, + "step": 573 + }, + { + "epoch": 0.42, + "learning_rate": 0.00019374689091822373, + "loss": 0.6185, + "step": 574 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019340294228471192, + "loss": 0.2507, + "step": 575 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019305874441373502, + "loss": 0.4036, + "step": 576 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019271429928181816, + "loss": 0.2384, + "step": 577 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019236960886690626, + "loss": 0.3416, + "step": 578 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001920246751483529, + "loss": 0.4421, + "step": 579 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019167950010690868, + "loss": 0.3397, + "step": 580 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019133408572470997, + "loss": 0.3132, + "step": 581 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019098843398526765, + "loss": 0.4452, + "step": 582 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019064254687345544, + "loss": 0.3581, + "step": 583 + }, + { + "epoch": 0.43, + "learning_rate": 0.00019029642637549885, + "loss": 0.2781, + "step": 584 + }, + { + "epoch": 0.43, + "learning_rate": 0.0001899500744789634, + "loss": 0.217, + "step": 585 + }, + { + "epoch": 0.43, + "learning_rate": 0.00018960349317274355, + "loss": 0.6642, + "step": 586 + }, + { + "epoch": 0.43, + "learning_rate": 0.00018925668444705104, + "loss": 0.7822, + "step": 587 + }, + { + "epoch": 0.43, + "learning_rate": 0.00018890965029340353, + "loss": 0.394, + "step": 588 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001885623927046133, + "loss": 0.6322, + "step": 589 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001882149136747755, + "loss": 0.3221, + "step": 590 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001878672151992571, + "loss": 0.5053, + "step": 591 + }, + { + "epoch": 0.44, + "learning_rate": 0.00018751929927468504, + "loss": 0.2439, + "step": 592 + }, + { + "epoch": 0.44, + "learning_rate": 0.000187171167898935, + "loss": 0.2368, + "step": 593 + }, + { + "epoch": 0.44, + "learning_rate": 0.00018682282307111987, + "loss": 0.541, + "step": 594 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001864742667915783, + "loss": 0.2163, + "step": 595 + }, + { + "epoch": 0.44, + "learning_rate": 0.00018612550106186318, + "loss": 0.3043, + "step": 596 + }, + { + "epoch": 0.44, + "learning_rate": 0.00018577652788473003, + "loss": 0.4656, + "step": 597 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001854273492641258, + "loss": 0.5907, + "step": 598 + }, + { + "epoch": 0.44, + "learning_rate": 0.0001850779672051771, + "loss": 0.2994, + "step": 599 + }, + { + "epoch": 0.44, + "learning_rate": 0.00018472838371417873, + "loss": 0.4392, + "step": 600 + }, + { + "epoch": 0.44, + "eval_loss": 0.7241503000259399, + "eval_runtime": 12.9988, + "eval_samples_per_second": 0.846, + "eval_steps_per_second": 0.846, + "step": 600 + }, + { + "epoch": 0.44, + "learning_rate": 0.00018437860079858226, + "loss": 0.1327, + "step": 601 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001840286204669843, + "loss": 0.7039, + "step": 602 + }, + { + "epoch": 0.45, + "learning_rate": 0.00018367844472911532, + "loss": 0.3143, + "step": 603 + }, + { + "epoch": 0.45, + "learning_rate": 0.00018332807559582774, + "loss": 0.4041, + "step": 604 + }, + { + "epoch": 0.45, + "learning_rate": 0.00018297751507908452, + "loss": 0.4026, + "step": 605 + }, + { + "epoch": 0.45, + "learning_rate": 0.00018262676519194778, + "loss": 0.3177, + "step": 606 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001822758279485669, + "loss": 0.4313, + "step": 607 + }, + { + "epoch": 0.45, + "learning_rate": 0.00018192470536416728, + "loss": 0.5299, + "step": 608 + }, + { + "epoch": 0.45, + "learning_rate": 0.00018157339945503847, + "loss": 0.3415, + "step": 609 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001812219122385229, + "loss": 0.4785, + "step": 610 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001808702457330041, + "loss": 0.373, + "step": 611 + }, + { + "epoch": 0.45, + "learning_rate": 0.00018051840195789506, + "loss": 0.7851, + "step": 612 + }, + { + "epoch": 0.45, + "learning_rate": 0.00018016638293362683, + "loss": 0.4122, + "step": 613 + }, + { + "epoch": 0.45, + "learning_rate": 0.0001798141906816368, + "loss": 0.473, + "step": 614 + }, + { + "epoch": 0.45, + "learning_rate": 0.00017946182722435703, + "loss": 0.5523, + "step": 615 + }, + { + "epoch": 0.46, + "learning_rate": 0.00017910929458520274, + "loss": 0.429, + "step": 616 + }, + { + "epoch": 0.46, + "learning_rate": 0.00017875659478856076, + "loss": 0.7252, + "step": 617 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001784037298597777, + "loss": 0.8911, + "step": 618 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001780507018251484, + "loss": 0.2871, + "step": 619 + }, + { + "epoch": 0.46, + "learning_rate": 0.00017769751271190444, + "loss": 0.3168, + "step": 620 + }, + { + "epoch": 0.46, + "learning_rate": 0.00017734416454820225, + "loss": 0.6512, + "step": 621 + }, + { + "epoch": 0.46, + "learning_rate": 0.00017699065936311174, + "loss": 0.7975, + "step": 622 + }, + { + "epoch": 0.46, + "learning_rate": 0.00017663699918660442, + "loss": 0.6918, + "step": 623 + }, + { + "epoch": 0.46, + "learning_rate": 0.00017628318604954175, + "loss": 0.5236, + "step": 624 + }, + { + "epoch": 0.46, + "learning_rate": 0.00017592922198366364, + "loss": 0.2322, + "step": 625 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001755751090215767, + "loss": 0.4731, + "step": 626 + }, + { + "epoch": 0.46, + "learning_rate": 0.00017522084919674254, + "loss": 0.2575, + "step": 627 + }, + { + "epoch": 0.46, + "learning_rate": 0.0001748664445434661, + "loss": 0.2666, + "step": 628 + }, + { + "epoch": 0.47, + "learning_rate": 0.00017451189709688403, + "loss": 0.1195, + "step": 629 + }, + { + "epoch": 0.47, + "learning_rate": 0.00017415720889295294, + "loss": 0.4373, + "step": 630 + }, + { + "epoch": 0.47, + "learning_rate": 0.00017380238196843767, + "loss": 0.3413, + "step": 631 + }, + { + "epoch": 0.47, + "learning_rate": 0.00017344741836089974, + "loss": 0.7706, + "step": 632 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001730923201086855, + "loss": 0.479, + "step": 633 + }, + { + "epoch": 0.47, + "learning_rate": 0.00017273708925091453, + "loss": 0.3097, + "step": 634 + }, + { + "epoch": 0.47, + "learning_rate": 0.00017238172782746782, + "loss": 0.3742, + "step": 635 + }, + { + "epoch": 0.47, + "learning_rate": 0.00017202623787897616, + "loss": 0.4239, + "step": 636 + }, + { + "epoch": 0.47, + "learning_rate": 0.0001716706214468084, + "loss": 0.5373, + "step": 637 + }, + { + "epoch": 0.47, + "learning_rate": 0.00017131488057305967, + "loss": 0.3712, + "step": 638 + }, + { + "epoch": 0.47, + "learning_rate": 0.00017095901730053976, + "loss": 0.4058, + "step": 639 + }, + { + "epoch": 0.47, + "learning_rate": 0.00017060303367276121, + "loss": 0.1766, + "step": 640 + }, + { + "epoch": 0.47, + "learning_rate": 0.00017024693173392784, + "loss": 0.8582, + "step": 641 + }, + { + "epoch": 0.47, + "learning_rate": 0.00016989071352892273, + "loss": 0.3212, + "step": 642 + }, + { + "epoch": 0.48, + "learning_rate": 0.00016953438110329675, + "loss": 0.1797, + "step": 643 + }, + { + "epoch": 0.48, + "learning_rate": 0.00016917793650325644, + "loss": 0.6375, + "step": 644 + }, + { + "epoch": 0.48, + "learning_rate": 0.00016882138177565272, + "loss": 0.9199, + "step": 645 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001684647189679688, + "loss": 0.3762, + "step": 646 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001681079501283086, + "loss": 0.5697, + "step": 647 + }, + { + "epoch": 0.48, + "learning_rate": 0.00016775107730538475, + "loss": 0.2191, + "step": 648 + }, + { + "epoch": 0.48, + "learning_rate": 0.00016739410254850725, + "loss": 0.3447, + "step": 649 + }, + { + "epoch": 0.48, + "learning_rate": 0.0001670370279075712, + "loss": 0.7809, + "step": 650 + }, + { + "epoch": 0.48, + "learning_rate": 0.00016667985543304548, + "loss": 0.5296, + "step": 651 + }, + { + "epoch": 0.48, + "learning_rate": 0.00016632258717596057, + "loss": 0.3256, + "step": 652 + }, + { + "epoch": 0.48, + "learning_rate": 0.00016596522518789718, + "loss": 0.5935, + "step": 653 + }, + { + "epoch": 0.48, + "learning_rate": 0.00016560777152097413, + "loss": 0.4552, + "step": 654 + }, + { + "epoch": 0.48, + "learning_rate": 0.00016525022822783667, + "loss": 0.6108, + "step": 655 + }, + { + "epoch": 0.48, + "learning_rate": 0.00016489259736164483, + "loss": 0.2999, + "step": 656 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001645348809760615, + "loss": 0.3985, + "step": 657 + }, + { + "epoch": 0.49, + "learning_rate": 0.00016417708112524056, + "loss": 0.3948, + "step": 658 + }, + { + "epoch": 0.49, + "learning_rate": 0.00016381919986381525, + "loss": 0.305, + "step": 659 + }, + { + "epoch": 0.49, + "learning_rate": 0.00016346123924688638, + "loss": 0.5075, + "step": 660 + }, + { + "epoch": 0.49, + "learning_rate": 0.00016310320133001028, + "loss": 0.4921, + "step": 661 + }, + { + "epoch": 0.49, + "learning_rate": 0.00016274508816918734, + "loss": 0.352, + "step": 662 + }, + { + "epoch": 0.49, + "learning_rate": 0.00016238690182084986, + "loss": 0.1758, + "step": 663 + }, + { + "epoch": 0.49, + "learning_rate": 0.00016202864434185053, + "loss": 0.5357, + "step": 664 + }, + { + "epoch": 0.49, + "learning_rate": 0.00016167031778945056, + "loss": 0.3795, + "step": 665 + }, + { + "epoch": 0.49, + "learning_rate": 0.00016131192422130764, + "loss": 0.3828, + "step": 666 + }, + { + "epoch": 0.49, + "learning_rate": 0.0001609534656954644, + "loss": 0.4688, + "step": 667 + }, + { + "epoch": 0.49, + "learning_rate": 0.00016059494427033647, + "loss": 0.5511, + "step": 668 + }, + { + "epoch": 0.49, + "learning_rate": 0.00016023636200470065, + "loss": 0.6362, + "step": 669 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001598777209576831, + "loss": 0.3887, + "step": 670 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015951902318874758, + "loss": 0.5006, + "step": 671 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015916027075768354, + "loss": 0.5082, + "step": 672 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015880146572459433, + "loss": 0.3726, + "step": 673 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015844261014988535, + "loss": 0.5567, + "step": 674 + }, + { + "epoch": 0.5, + "learning_rate": 0.0001580837060942522, + "loss": 0.7998, + "step": 675 + }, + { + "epoch": 0.5, + "learning_rate": 0.000157724755618669, + "loss": 0.2156, + "step": 676 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015736576078437625, + "loss": 0.3988, + "step": 677 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015700672365286932, + "loss": 0.898, + "step": 678 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015664764628588637, + "loss": 0.3017, + "step": 679 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015628853074539675, + "loss": 0.4727, + "step": 680 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015592937909358887, + "loss": 0.9664, + "step": 681 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015557019339285853, + "loss": 0.4321, + "step": 682 + }, + { + "epoch": 0.5, + "learning_rate": 0.00015521097570579716, + "loss": 0.3144, + "step": 683 + }, + { + "epoch": 0.51, + "learning_rate": 0.00015485172809517978, + "loss": 0.352, + "step": 684 + }, + { + "epoch": 0.51, + "learning_rate": 0.00015449245262395323, + "loss": 0.9209, + "step": 685 + }, + { + "epoch": 0.51, + "learning_rate": 0.00015413315135522432, + "loss": 0.3722, + "step": 686 + }, + { + "epoch": 0.51, + "learning_rate": 0.00015377382635224806, + "loss": 0.8169, + "step": 687 + }, + { + "epoch": 0.51, + "learning_rate": 0.00015341447967841582, + "loss": 0.586, + "step": 688 + }, + { + "epoch": 0.51, + "learning_rate": 0.00015305511339724331, + "loss": 0.4594, + "step": 689 + }, + { + "epoch": 0.51, + "learning_rate": 0.00015269572957235877, + "loss": 0.4635, + "step": 690 + }, + { + "epoch": 0.51, + "learning_rate": 0.00015233633026749132, + "loss": 0.4394, + "step": 691 + }, + { + "epoch": 0.51, + "learning_rate": 0.00015197691754645893, + "loss": 0.7235, + "step": 692 + }, + { + "epoch": 0.51, + "learning_rate": 0.00015161749347315657, + "loss": 0.4175, + "step": 693 + }, + { + "epoch": 0.51, + "learning_rate": 0.0001512580601115444, + "loss": 0.2788, + "step": 694 + }, + { + "epoch": 0.51, + "learning_rate": 0.00015089861952563604, + "loss": 0.4044, + "step": 695 + }, + { + "epoch": 0.51, + "learning_rate": 0.00015053917377948642, + "loss": 0.4199, + "step": 696 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001501797249371802, + "loss": 0.2754, + "step": 697 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001498202750628198, + "loss": 0.3196, + "step": 698 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014946082622051352, + "loss": 0.2727, + "step": 699 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014910138047436394, + "loss": 0.4222, + "step": 700 + }, + { + "epoch": 0.52, + "eval_loss": 0.718849241733551, + "eval_runtime": 12.9984, + "eval_samples_per_second": 0.846, + "eval_steps_per_second": 0.846, + "step": 700 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014874193988845553, + "loss": 0.5825, + "step": 701 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014838250652684344, + "loss": 0.4383, + "step": 702 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014802308245354104, + "loss": 0.3608, + "step": 703 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014766366973250865, + "loss": 0.6402, + "step": 704 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001473042704276412, + "loss": 0.6717, + "step": 705 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014694488660275666, + "loss": 0.4606, + "step": 706 + }, + { + "epoch": 0.52, + "learning_rate": 0.00014658552032158412, + "loss": 0.3563, + "step": 707 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001462261736477519, + "loss": 0.3345, + "step": 708 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001458668486447757, + "loss": 0.2947, + "step": 709 + }, + { + "epoch": 0.52, + "learning_rate": 0.0001455075473760468, + "loss": 0.4221, + "step": 710 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014514827190482022, + "loss": 0.7063, + "step": 711 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001447890242942028, + "loss": 0.4138, + "step": 712 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014442980660714147, + "loss": 0.2716, + "step": 713 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001440706209064111, + "loss": 0.522, + "step": 714 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014371146925460325, + "loss": 0.2471, + "step": 715 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014335235371411363, + "loss": 0.2515, + "step": 716 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014299327634713068, + "loss": 0.325, + "step": 717 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014263423921562375, + "loss": 0.3559, + "step": 718 + }, + { + "epoch": 0.53, + "learning_rate": 0.000142275244381331, + "loss": 0.311, + "step": 719 + }, + { + "epoch": 0.53, + "learning_rate": 0.0001419162939057478, + "loss": 0.3588, + "step": 720 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014155738985011462, + "loss": 0.4606, + "step": 721 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014119853427540564, + "loss": 0.4991, + "step": 722 + }, + { + "epoch": 0.53, + "learning_rate": 0.00014083972924231643, + "loss": 0.42, + "step": 723 + }, + { + "epoch": 0.54, + "learning_rate": 0.00014048097681125242, + "loss": 0.2433, + "step": 724 + }, + { + "epoch": 0.54, + "learning_rate": 0.00014012227904231688, + "loss": 0.5278, + "step": 725 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013976363799529936, + "loss": 0.4191, + "step": 726 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013940505572966353, + "loss": 0.6402, + "step": 727 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001390465343045356, + "loss": 0.2295, + "step": 728 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013868807577869233, + "loss": 0.3456, + "step": 729 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013832968221054944, + "loss": 0.3635, + "step": 730 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013797135565814944, + "loss": 0.4169, + "step": 731 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013761309817915014, + "loss": 0.441, + "step": 732 + }, + { + "epoch": 0.54, + "learning_rate": 0.0001372549118308127, + "loss": 0.4768, + "step": 733 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013689679866998972, + "loss": 0.8404, + "step": 734 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013653876075311365, + "loss": 0.5179, + "step": 735 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013618080013618473, + "loss": 0.4933, + "step": 736 + }, + { + "epoch": 0.54, + "learning_rate": 0.00013582291887475947, + "loss": 0.4747, + "step": 737 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013546511902393852, + "loss": 0.8171, + "step": 738 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013510740263835517, + "loss": 0.4309, + "step": 739 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013474977177216333, + "loss": 0.304, + "step": 740 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001343922284790259, + "loss": 0.469, + "step": 741 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013403477481210285, + "loss": 0.5643, + "step": 742 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013367741282403937, + "loss": 0.9558, + "step": 743 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001333201445669545, + "loss": 0.3965, + "step": 744 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013296297209242872, + "loss": 0.409, + "step": 745 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013260589745149273, + "loss": 0.5015, + "step": 746 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001322489226946152, + "loss": 0.4645, + "step": 747 + }, + { + "epoch": 0.55, + "learning_rate": 0.0001318920498716914, + "loss": 0.2994, + "step": 748 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013153528103203115, + "loss": 0.4572, + "step": 749 + }, + { + "epoch": 0.55, + "learning_rate": 0.00013117861822434723, + "loss": 0.556, + "step": 750 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013082206349674354, + "loss": 0.4779, + "step": 751 + }, + { + "epoch": 0.56, + "learning_rate": 0.00013046561889670327, + "loss": 0.3453, + "step": 752 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001301092864710772, + "loss": 1.0891, + "step": 753 + }, + { + "epoch": 0.56, + "learning_rate": 0.00012975306826607213, + "loss": 0.5277, + "step": 754 + }, + { + "epoch": 0.56, + "learning_rate": 0.00012939696632723876, + "loss": 0.3465, + "step": 755 + }, + { + "epoch": 0.56, + "learning_rate": 0.00012904098269946024, + "loss": 0.3665, + "step": 756 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001286851194269403, + "loss": 0.4635, + "step": 757 + }, + { + "epoch": 0.56, + "learning_rate": 0.00012832937855319158, + "loss": 0.4174, + "step": 758 + }, + { + "epoch": 0.56, + "learning_rate": 0.00012797376212102387, + "loss": 0.847, + "step": 759 + }, + { + "epoch": 0.56, + "learning_rate": 0.00012761827217253218, + "loss": 0.2295, + "step": 760 + }, + { + "epoch": 0.56, + "learning_rate": 0.00012726291074908544, + "loss": 0.6366, + "step": 761 + }, + { + "epoch": 0.56, + "learning_rate": 0.00012690767989131447, + "loss": 0.2403, + "step": 762 + }, + { + "epoch": 0.56, + "learning_rate": 0.00012655258163910024, + "loss": 0.6297, + "step": 763 + }, + { + "epoch": 0.56, + "learning_rate": 0.0001261976180315623, + "loss": 0.4264, + "step": 764 + }, + { + "epoch": 0.57, + "learning_rate": 0.00012584279110704704, + "loss": 0.7399, + "step": 765 + }, + { + "epoch": 0.57, + "learning_rate": 0.00012548810290311595, + "loss": 0.4965, + "step": 766 + }, + { + "epoch": 0.57, + "learning_rate": 0.00012513355545653391, + "loss": 0.5037, + "step": 767 + }, + { + "epoch": 0.57, + "learning_rate": 0.00012477915080325743, + "loss": 0.4081, + "step": 768 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001244248909784233, + "loss": 0.447, + "step": 769 + }, + { + "epoch": 0.57, + "learning_rate": 0.00012407077801633636, + "loss": 0.5114, + "step": 770 + }, + { + "epoch": 0.57, + "learning_rate": 0.00012371681395045826, + "loss": 0.4465, + "step": 771 + }, + { + "epoch": 0.57, + "learning_rate": 0.00012336300081339558, + "loss": 0.6708, + "step": 772 + }, + { + "epoch": 0.57, + "learning_rate": 0.00012300934063688823, + "loss": 0.4611, + "step": 773 + }, + { + "epoch": 0.57, + "learning_rate": 0.00012265583545179772, + "loss": 0.3486, + "step": 774 + }, + { + "epoch": 0.57, + "learning_rate": 0.00012230248728809556, + "loss": 0.4921, + "step": 775 + }, + { + "epoch": 0.57, + "learning_rate": 0.0001219492981748516, + "loss": 0.4282, + "step": 776 + }, + { + "epoch": 0.57, + "learning_rate": 0.00012159627014022233, + "loss": 0.7954, + "step": 777 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012124340521143926, + "loss": 0.6251, + "step": 778 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012089070541479726, + "loss": 0.4673, + "step": 779 + }, + { + "epoch": 0.58, + "learning_rate": 0.000120538172775643, + "loss": 0.3138, + "step": 780 + }, + { + "epoch": 0.58, + "learning_rate": 0.00012018580931836324, + "loss": 0.446, + "step": 781 + }, + { + "epoch": 0.58, + "learning_rate": 0.00011983361706637317, + "loss": 0.5042, + "step": 782 + }, + { + "epoch": 0.58, + "learning_rate": 0.00011948159804210495, + "loss": 0.3662, + "step": 783 + }, + { + "epoch": 0.58, + "learning_rate": 0.00011912975426699593, + "loss": 0.4054, + "step": 784 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001187780877614771, + "loss": 0.4903, + "step": 785 + }, + { + "epoch": 0.58, + "learning_rate": 0.00011842660054496153, + "loss": 0.1645, + "step": 786 + }, + { + "epoch": 0.58, + "learning_rate": 0.00011807529463583275, + "loss": 0.3971, + "step": 787 + }, + { + "epoch": 0.58, + "learning_rate": 0.00011772417205143306, + "loss": 0.4045, + "step": 788 + }, + { + "epoch": 0.58, + "learning_rate": 0.0001173732348080522, + "loss": 0.3736, + "step": 789 + }, + { + "epoch": 0.58, + "learning_rate": 0.00011702248492091544, + "loss": 0.3747, + "step": 790 + }, + { + "epoch": 0.58, + "learning_rate": 0.00011667192440417226, + "loss": 0.4383, + "step": 791 + }, + { + "epoch": 0.59, + "learning_rate": 0.00011632155527088465, + "loss": 0.2069, + "step": 792 + }, + { + "epoch": 0.59, + "learning_rate": 0.00011597137953301566, + "loss": 0.3446, + "step": 793 + }, + { + "epoch": 0.59, + "learning_rate": 0.00011562139920141774, + "loss": 0.4467, + "step": 794 + }, + { + "epoch": 0.59, + "learning_rate": 0.0001152716162858212, + "loss": 0.3552, + "step": 795 + }, + { + "epoch": 0.59, + "learning_rate": 0.00011492203279482283, + "loss": 0.3891, + "step": 796 + }, + { + "epoch": 0.59, + "learning_rate": 0.00011457265073587415, + "loss": 0.2504, + "step": 797 + }, + { + "epoch": 0.59, + "learning_rate": 0.00011422347211526993, + "loss": 0.4427, + "step": 798 + }, + { + "epoch": 0.59, + "learning_rate": 0.0001138744989381368, + "loss": 0.4597, + "step": 799 + }, + { + "epoch": 0.59, + "learning_rate": 0.00011352573320842167, + "loss": 0.6739, + "step": 800 + }, + { + "epoch": 0.59, + "eval_loss": 0.6845341324806213, + "eval_runtime": 13.0104, + "eval_samples_per_second": 0.845, + "eval_steps_per_second": 0.845, + "step": 800 + }, + { + "epoch": 0.59, + "learning_rate": 0.00011317717692888012, + "loss": 0.3329, + "step": 801 + }, + { + "epoch": 0.59, + "learning_rate": 0.00011282883210106502, + "loss": 0.3141, + "step": 802 + }, + { + "epoch": 0.59, + "learning_rate": 0.00011248070072531497, + "loss": 0.6187, + "step": 803 + }, + { + "epoch": 0.59, + "learning_rate": 0.00011213278480074288, + "loss": 0.3025, + "step": 804 + }, + { + "epoch": 0.6, + "learning_rate": 0.00011178508632522448, + "loss": 0.3407, + "step": 805 + }, + { + "epoch": 0.6, + "learning_rate": 0.00011143760729538668, + "loss": 0.2956, + "step": 806 + }, + { + "epoch": 0.6, + "learning_rate": 0.00011109034970659644, + "loss": 0.269, + "step": 807 + }, + { + "epoch": 0.6, + "learning_rate": 0.00011074331555294897, + "loss": 0.2612, + "step": 808 + }, + { + "epoch": 0.6, + "learning_rate": 0.00011039650682725646, + "loss": 0.3399, + "step": 809 + }, + { + "epoch": 0.6, + "learning_rate": 0.00011004992552103656, + "loss": 0.4474, + "step": 810 + }, + { + "epoch": 0.6, + "learning_rate": 0.00010970357362450115, + "loss": 0.6197, + "step": 811 + }, + { + "epoch": 0.6, + "learning_rate": 0.00010935745312654454, + "loss": 0.2347, + "step": 812 + }, + { + "epoch": 0.6, + "learning_rate": 0.00010901156601473236, + "loss": 0.5961, + "step": 813 + }, + { + "epoch": 0.6, + "learning_rate": 0.00010866591427528999, + "loss": 0.5043, + "step": 814 + }, + { + "epoch": 0.6, + "learning_rate": 0.00010832049989309131, + "loss": 0.5384, + "step": 815 + }, + { + "epoch": 0.6, + "learning_rate": 0.00010797532485164707, + "loss": 0.5576, + "step": 816 + }, + { + "epoch": 0.6, + "learning_rate": 0.00010763039113309371, + "loss": 0.4151, + "step": 817 + }, + { + "epoch": 0.6, + "learning_rate": 0.00010728570071818184, + "loss": 0.281, + "step": 818 + }, + { + "epoch": 0.61, + "learning_rate": 0.00010694125558626498, + "loss": 0.389, + "step": 819 + }, + { + "epoch": 0.61, + "learning_rate": 0.0001065970577152881, + "loss": 0.3531, + "step": 820 + }, + { + "epoch": 0.61, + "learning_rate": 0.00010625310908177625, + "loss": 0.2959, + "step": 821 + }, + { + "epoch": 0.61, + "learning_rate": 0.0001059094116608234, + "loss": 0.4179, + "step": 822 + }, + { + "epoch": 0.61, + "learning_rate": 0.00010556596742608078, + "loss": 0.4439, + "step": 823 + }, + { + "epoch": 0.61, + "learning_rate": 0.00010522277834974585, + "loss": 0.2222, + "step": 824 + }, + { + "epoch": 0.61, + "learning_rate": 0.00010487984640255073, + "loss": 0.5375, + "step": 825 + }, + { + "epoch": 0.61, + "learning_rate": 0.00010453717355375116, + "loss": 0.3486, + "step": 826 + }, + { + "epoch": 0.61, + "learning_rate": 0.00010419476177111488, + "loss": 0.4632, + "step": 827 + }, + { + "epoch": 0.61, + "learning_rate": 0.00010385261302091059, + "loss": 0.526, + "step": 828 + }, + { + "epoch": 0.61, + "learning_rate": 0.00010351072926789641, + "loss": 0.3489, + "step": 829 + }, + { + "epoch": 0.61, + "learning_rate": 0.00010316911247530891, + "loss": 0.3223, + "step": 830 + }, + { + "epoch": 0.61, + "learning_rate": 0.00010282776460485157, + "loss": 0.9132, + "step": 831 + }, + { + "epoch": 0.62, + "learning_rate": 0.00010248668761668357, + "loss": 0.7139, + "step": 832 + }, + { + "epoch": 0.62, + "learning_rate": 0.00010214588346940867, + "loss": 0.288, + "step": 833 + }, + { + "epoch": 0.62, + "learning_rate": 0.00010180535412006374, + "loss": 0.2817, + "step": 834 + }, + { + "epoch": 0.62, + "learning_rate": 0.00010146510152410786, + "loss": 0.4616, + "step": 835 + }, + { + "epoch": 0.62, + "learning_rate": 0.00010112512763541068, + "loss": 0.6379, + "step": 836 + }, + { + "epoch": 0.62, + "learning_rate": 0.00010078543440624147, + "loss": 0.3136, + "step": 837 + }, + { + "epoch": 0.62, + "learning_rate": 0.00010044602378725777, + "loss": 0.2365, + "step": 838 + }, + { + "epoch": 0.62, + "learning_rate": 0.00010010689772749444, + "loss": 0.3048, + "step": 839 + }, + { + "epoch": 0.62, + "learning_rate": 9.976805817435207e-05, + "loss": 0.3529, + "step": 840 + }, + { + "epoch": 0.62, + "learning_rate": 9.942950707358615e-05, + "loss": 0.342, + "step": 841 + }, + { + "epoch": 0.62, + "learning_rate": 9.909124636929564e-05, + "loss": 0.444, + "step": 842 + }, + { + "epoch": 0.62, + "learning_rate": 9.875327800391212e-05, + "loss": 0.3445, + "step": 843 + }, + { + "epoch": 0.62, + "learning_rate": 9.841560391818823e-05, + "loss": 0.3177, + "step": 844 + }, + { + "epoch": 0.62, + "learning_rate": 9.807822605118682e-05, + "loss": 0.244, + "step": 845 + }, + { + "epoch": 0.63, + "learning_rate": 9.774114634026979e-05, + "loss": 0.8378, + "step": 846 + }, + { + "epoch": 0.63, + "learning_rate": 9.740436672108685e-05, + "loss": 0.4002, + "step": 847 + }, + { + "epoch": 0.63, + "learning_rate": 9.706788912756442e-05, + "loss": 0.516, + "step": 848 + }, + { + "epoch": 0.63, + "learning_rate": 9.673171549189457e-05, + "loss": 0.4076, + "step": 849 + }, + { + "epoch": 0.63, + "learning_rate": 9.639584774452405e-05, + "loss": 0.3068, + "step": 850 + }, + { + "epoch": 0.63, + "learning_rate": 9.606028781414292e-05, + "loss": 0.3099, + "step": 851 + }, + { + "epoch": 0.63, + "learning_rate": 9.572503762767372e-05, + "loss": 0.5688, + "step": 852 + }, + { + "epoch": 0.63, + "learning_rate": 9.539009911026025e-05, + "loss": 0.4554, + "step": 853 + }, + { + "epoch": 0.63, + "learning_rate": 9.505547418525664e-05, + "loss": 0.2936, + "step": 854 + }, + { + "epoch": 0.63, + "learning_rate": 9.472116477421619e-05, + "loss": 0.2868, + "step": 855 + }, + { + "epoch": 0.63, + "learning_rate": 9.438717279688044e-05, + "loss": 0.7902, + "step": 856 + }, + { + "epoch": 0.63, + "learning_rate": 9.405350017116802e-05, + "loss": 0.5916, + "step": 857 + }, + { + "epoch": 0.63, + "learning_rate": 9.372014881316385e-05, + "loss": 0.1412, + "step": 858 + }, + { + "epoch": 0.64, + "learning_rate": 9.338712063710785e-05, + "loss": 0.8575, + "step": 859 + }, + { + "epoch": 0.64, + "learning_rate": 9.305441755538413e-05, + "loss": 0.593, + "step": 860 + }, + { + "epoch": 0.64, + "learning_rate": 9.272204147851008e-05, + "loss": 0.1031, + "step": 861 + }, + { + "epoch": 0.64, + "learning_rate": 9.238999431512519e-05, + "loss": 0.2781, + "step": 862 + }, + { + "epoch": 0.64, + "learning_rate": 9.20582779719802e-05, + "loss": 0.3534, + "step": 863 + }, + { + "epoch": 0.64, + "learning_rate": 9.172689435392617e-05, + "loss": 0.6448, + "step": 864 + }, + { + "epoch": 0.64, + "learning_rate": 9.139584536390353e-05, + "loss": 0.164, + "step": 865 + }, + { + "epoch": 0.64, + "learning_rate": 9.106513290293113e-05, + "loss": 0.3111, + "step": 866 + }, + { + "epoch": 0.64, + "learning_rate": 9.073475887009532e-05, + "loss": 0.4406, + "step": 867 + }, + { + "epoch": 0.64, + "learning_rate": 9.040472516253901e-05, + "loss": 0.5083, + "step": 868 + }, + { + "epoch": 0.64, + "learning_rate": 9.007503367545094e-05, + "loss": 0.3334, + "step": 869 + }, + { + "epoch": 0.64, + "learning_rate": 8.97456863020546e-05, + "loss": 0.6558, + "step": 870 + }, + { + "epoch": 0.64, + "learning_rate": 8.941668493359741e-05, + "loss": 0.3639, + "step": 871 + }, + { + "epoch": 0.64, + "learning_rate": 8.908803145934003e-05, + "loss": 0.5976, + "step": 872 + }, + { + "epoch": 0.65, + "learning_rate": 8.875972776654517e-05, + "loss": 0.5383, + "step": 873 + }, + { + "epoch": 0.65, + "learning_rate": 8.843177574046708e-05, + "loss": 0.3301, + "step": 874 + }, + { + "epoch": 0.65, + "learning_rate": 8.810417726434054e-05, + "loss": 0.3455, + "step": 875 + }, + { + "epoch": 0.65, + "learning_rate": 8.777693421937022e-05, + "loss": 0.296, + "step": 876 + }, + { + "epoch": 0.65, + "learning_rate": 8.745004848471957e-05, + "loss": 0.2113, + "step": 877 + }, + { + "epoch": 0.65, + "learning_rate": 8.712352193750043e-05, + "loss": 0.1236, + "step": 878 + }, + { + "epoch": 0.65, + "learning_rate": 8.679735645276172e-05, + "loss": 0.3651, + "step": 879 + }, + { + "epoch": 0.65, + "learning_rate": 8.647155390347942e-05, + "loss": 0.3829, + "step": 880 + }, + { + "epoch": 0.65, + "learning_rate": 8.614611616054502e-05, + "loss": 0.55, + "step": 881 + }, + { + "epoch": 0.65, + "learning_rate": 8.582104509275534e-05, + "loss": 0.2811, + "step": 882 + }, + { + "epoch": 0.65, + "learning_rate": 8.549634256680157e-05, + "loss": 0.2337, + "step": 883 + }, + { + "epoch": 0.65, + "learning_rate": 8.517201044725847e-05, + "loss": 0.2605, + "step": 884 + }, + { + "epoch": 0.65, + "learning_rate": 8.4848050596574e-05, + "loss": 0.5998, + "step": 885 + }, + { + "epoch": 0.66, + "learning_rate": 8.452446487505808e-05, + "loss": 0.3719, + "step": 886 + }, + { + "epoch": 0.66, + "learning_rate": 8.420125514087253e-05, + "loss": 0.3653, + "step": 887 + }, + { + "epoch": 0.66, + "learning_rate": 8.387842325002e-05, + "loss": 0.7566, + "step": 888 + }, + { + "epoch": 0.66, + "learning_rate": 8.355597105633328e-05, + "loss": 0.1803, + "step": 889 + }, + { + "epoch": 0.66, + "learning_rate": 8.323390041146493e-05, + "loss": 0.3518, + "step": 890 + }, + { + "epoch": 0.66, + "learning_rate": 8.291221316487653e-05, + "loss": 0.5731, + "step": 891 + }, + { + "epoch": 0.66, + "learning_rate": 8.259091116382784e-05, + "loss": 0.686, + "step": 892 + }, + { + "epoch": 0.66, + "learning_rate": 8.226999625336662e-05, + "loss": 0.1914, + "step": 893 + }, + { + "epoch": 0.66, + "learning_rate": 8.194947027631756e-05, + "loss": 0.4432, + "step": 894 + }, + { + "epoch": 0.66, + "learning_rate": 8.162933507327212e-05, + "loss": 0.3462, + "step": 895 + }, + { + "epoch": 0.66, + "learning_rate": 8.130959248257772e-05, + "loss": 0.2411, + "step": 896 + }, + { + "epoch": 0.66, + "learning_rate": 8.099024434032717e-05, + "loss": 0.7422, + "step": 897 + }, + { + "epoch": 0.66, + "learning_rate": 8.067129248034832e-05, + "loss": 0.5798, + "step": 898 + }, + { + "epoch": 0.66, + "learning_rate": 8.035273873419333e-05, + "loss": 0.2258, + "step": 899 + }, + { + "epoch": 0.67, + "learning_rate": 8.003458493112816e-05, + "loss": 0.4932, + "step": 900 + }, + { + "epoch": 0.67, + "eval_loss": 0.6635782122612, + "eval_runtime": 13.0807, + "eval_samples_per_second": 0.841, + "eval_steps_per_second": 0.841, + "step": 900 + }, + { + "epoch": 0.67, + "learning_rate": 7.971683289812229e-05, + "loss": 0.4165, + "step": 901 + }, + { + "epoch": 0.67, + "learning_rate": 7.939948445983799e-05, + "loss": 0.3897, + "step": 902 + }, + { + "epoch": 0.67, + "learning_rate": 7.908254143861986e-05, + "loss": 0.5963, + "step": 903 + }, + { + "epoch": 0.67, + "learning_rate": 7.876600565448457e-05, + "loss": 0.5027, + "step": 904 + }, + { + "epoch": 0.67, + "learning_rate": 7.844987892511011e-05, + "loss": 0.287, + "step": 905 + }, + { + "epoch": 0.67, + "learning_rate": 7.813416306582565e-05, + "loss": 0.7406, + "step": 906 + }, + { + "epoch": 0.67, + "learning_rate": 7.781885988960096e-05, + "loss": 0.3224, + "step": 907 + }, + { + "epoch": 0.67, + "learning_rate": 7.750397120703583e-05, + "loss": 0.3747, + "step": 908 + }, + { + "epoch": 0.67, + "learning_rate": 7.718949882635011e-05, + "loss": 1.083, + "step": 909 + }, + { + "epoch": 0.67, + "learning_rate": 7.6875444553373e-05, + "loss": 0.4304, + "step": 910 + }, + { + "epoch": 0.67, + "learning_rate": 7.656181019153258e-05, + "loss": 0.6364, + "step": 911 + }, + { + "epoch": 0.67, + "learning_rate": 7.624859754184588e-05, + "loss": 0.4104, + "step": 912 + }, + { + "epoch": 0.67, + "learning_rate": 7.593580840290818e-05, + "loss": 0.3649, + "step": 913 + }, + { + "epoch": 0.68, + "learning_rate": 7.562344457088269e-05, + "loss": 1.0562, + "step": 914 + }, + { + "epoch": 0.68, + "learning_rate": 7.531150783949052e-05, + "loss": 0.3616, + "step": 915 + }, + { + "epoch": 0.68, + "learning_rate": 7.500000000000002e-05, + "loss": 0.2649, + "step": 916 + }, + { + "epoch": 0.68, + "learning_rate": 7.468892284121677e-05, + "loss": 0.5163, + "step": 917 + }, + { + "epoch": 0.68, + "learning_rate": 7.437827814947323e-05, + "loss": 0.2815, + "step": 918 + }, + { + "epoch": 0.68, + "learning_rate": 7.406806770861833e-05, + "loss": 0.4637, + "step": 919 + }, + { + "epoch": 0.68, + "learning_rate": 7.375829330000747e-05, + "loss": 0.1545, + "step": 920 + }, + { + "epoch": 0.68, + "learning_rate": 7.344895670249218e-05, + "loss": 0.4964, + "step": 921 + }, + { + "epoch": 0.68, + "learning_rate": 7.314005969240974e-05, + "loss": 0.2742, + "step": 922 + }, + { + "epoch": 0.68, + "learning_rate": 7.283160404357333e-05, + "loss": 0.7472, + "step": 923 + }, + { + "epoch": 0.68, + "learning_rate": 7.252359152726156e-05, + "loss": 0.4042, + "step": 924 + }, + { + "epoch": 0.68, + "learning_rate": 7.221602391220844e-05, + "loss": 0.324, + "step": 925 + }, + { + "epoch": 0.68, + "learning_rate": 7.190890296459316e-05, + "loss": 0.616, + "step": 926 + }, + { + "epoch": 0.69, + "learning_rate": 7.160223044802984e-05, + "loss": 0.3656, + "step": 927 + }, + { + "epoch": 0.69, + "learning_rate": 7.129600812355775e-05, + "loss": 0.6666, + "step": 928 + }, + { + "epoch": 0.69, + "learning_rate": 7.099023774963076e-05, + "loss": 0.1535, + "step": 929 + }, + { + "epoch": 0.69, + "learning_rate": 7.068492108210756e-05, + "loss": 0.2176, + "step": 930 + }, + { + "epoch": 0.69, + "learning_rate": 7.038005987424158e-05, + "loss": 0.5321, + "step": 931 + }, + { + "epoch": 0.69, + "learning_rate": 7.007565587667053e-05, + "loss": 0.1454, + "step": 932 + }, + { + "epoch": 0.69, + "learning_rate": 6.977171083740686e-05, + "loss": 0.2402, + "step": 933 + }, + { + "epoch": 0.69, + "learning_rate": 6.946822650182751e-05, + "loss": 0.6484, + "step": 934 + }, + { + "epoch": 0.69, + "learning_rate": 6.916520461266366e-05, + "loss": 0.442, + "step": 935 + }, + { + "epoch": 0.69, + "learning_rate": 6.886264690999114e-05, + "loss": 0.1063, + "step": 936 + }, + { + "epoch": 0.69, + "learning_rate": 6.856055513122017e-05, + "loss": 0.454, + "step": 937 + }, + { + "epoch": 0.69, + "learning_rate": 6.825893101108536e-05, + "loss": 0.2908, + "step": 938 + }, + { + "epoch": 0.69, + "learning_rate": 6.795777628163599e-05, + "loss": 0.2057, + "step": 939 + }, + { + "epoch": 0.69, + "learning_rate": 6.765709267222571e-05, + "loss": 0.5314, + "step": 940 + }, + { + "epoch": 0.7, + "learning_rate": 6.735688190950297e-05, + "loss": 0.5713, + "step": 941 + }, + { + "epoch": 0.7, + "learning_rate": 6.705714571740096e-05, + "loss": 0.5988, + "step": 942 + }, + { + "epoch": 0.7, + "learning_rate": 6.675788581712747e-05, + "loss": 0.2251, + "step": 943 + }, + { + "epoch": 0.7, + "learning_rate": 6.645910392715544e-05, + "loss": 0.3555, + "step": 944 + }, + { + "epoch": 0.7, + "learning_rate": 6.616080176321291e-05, + "loss": 0.5266, + "step": 945 + }, + { + "epoch": 0.7, + "learning_rate": 6.586298103827286e-05, + "loss": 0.235, + "step": 946 + }, + { + "epoch": 0.7, + "learning_rate": 6.556564346254394e-05, + "loss": 0.158, + "step": 947 + }, + { + "epoch": 0.7, + "learning_rate": 6.526879074346027e-05, + "loss": 0.5409, + "step": 948 + }, + { + "epoch": 0.7, + "learning_rate": 6.49724245856716e-05, + "loss": 0.3644, + "step": 949 + }, + { + "epoch": 0.7, + "learning_rate": 6.467654669103386e-05, + "loss": 0.447, + "step": 950 + }, + { + "epoch": 0.7, + "learning_rate": 6.438115875859898e-05, + "loss": 0.3026, + "step": 951 + }, + { + "epoch": 0.7, + "learning_rate": 6.408626248460542e-05, + "loss": 0.2897, + "step": 952 + }, + { + "epoch": 0.7, + "learning_rate": 6.37918595624684e-05, + "loss": 0.311, + "step": 953 + }, + { + "epoch": 0.71, + "learning_rate": 6.349795168276994e-05, + "loss": 0.5142, + "step": 954 + }, + { + "epoch": 0.71, + "learning_rate": 6.320454053324945e-05, + "loss": 0.3723, + "step": 955 + }, + { + "epoch": 0.71, + "learning_rate": 6.291162779879396e-05, + "loss": 0.2556, + "step": 956 + }, + { + "epoch": 0.71, + "learning_rate": 6.261921516142819e-05, + "loss": 0.6081, + "step": 957 + }, + { + "epoch": 0.71, + "learning_rate": 6.232730430030535e-05, + "loss": 0.5688, + "step": 958 + }, + { + "epoch": 0.71, + "learning_rate": 6.2035896891697e-05, + "loss": 0.5, + "step": 959 + }, + { + "epoch": 0.71, + "learning_rate": 6.174499460898383e-05, + "loss": 0.5152, + "step": 960 + }, + { + "epoch": 0.71, + "learning_rate": 6.145459912264596e-05, + "loss": 0.3877, + "step": 961 + }, + { + "epoch": 0.71, + "learning_rate": 6.1164712100253e-05, + "loss": 0.6429, + "step": 962 + }, + { + "epoch": 0.71, + "learning_rate": 6.0875335206455044e-05, + "loss": 0.2803, + "step": 963 + }, + { + "epoch": 0.71, + "learning_rate": 6.0586470102972675e-05, + "loss": 0.6748, + "step": 964 + }, + { + "epoch": 0.71, + "learning_rate": 6.0298118448587524e-05, + "loss": 0.4075, + "step": 965 + }, + { + "epoch": 0.71, + "learning_rate": 6.0010281899132885e-05, + "loss": 0.4328, + "step": 966 + }, + { + "epoch": 0.71, + "learning_rate": 5.972296210748414e-05, + "loss": 0.2727, + "step": 967 + }, + { + "epoch": 0.72, + "learning_rate": 5.943616072354907e-05, + "loss": 0.5985, + "step": 968 + }, + { + "epoch": 0.72, + "learning_rate": 5.914987939425868e-05, + "loss": 0.2922, + "step": 969 + }, + { + "epoch": 0.72, + "learning_rate": 5.886411976355757e-05, + "loss": 0.1786, + "step": 970 + }, + { + "epoch": 0.72, + "learning_rate": 5.857888347239457e-05, + "loss": 0.4119, + "step": 971 + }, + { + "epoch": 0.72, + "learning_rate": 5.8294172158713285e-05, + "loss": 0.3754, + "step": 972 + }, + { + "epoch": 0.72, + "learning_rate": 5.800998745744253e-05, + "loss": 0.4393, + "step": 973 + }, + { + "epoch": 0.72, + "learning_rate": 5.7726331000487343e-05, + "loss": 0.1773, + "step": 974 + }, + { + "epoch": 0.72, + "learning_rate": 5.74432044167191e-05, + "loss": 0.3433, + "step": 975 + }, + { + "epoch": 0.72, + "learning_rate": 5.7160609331966646e-05, + "loss": 0.4045, + "step": 976 + }, + { + "epoch": 0.72, + "learning_rate": 5.68785473690067e-05, + "loss": 0.5428, + "step": 977 + }, + { + "epoch": 0.72, + "learning_rate": 5.659702014755443e-05, + "loss": 0.2591, + "step": 978 + }, + { + "epoch": 0.72, + "learning_rate": 5.631602928425448e-05, + "loss": 0.4271, + "step": 979 + }, + { + "epoch": 0.72, + "learning_rate": 5.603557639267146e-05, + "loss": 0.4947, + "step": 980 + }, + { + "epoch": 0.73, + "learning_rate": 5.5755663083280616e-05, + "loss": 0.6009, + "step": 981 + }, + { + "epoch": 0.73, + "learning_rate": 5.547629096345886e-05, + "loss": 0.5386, + "step": 982 + }, + { + "epoch": 0.73, + "learning_rate": 5.51974616374752e-05, + "loss": 0.8077, + "step": 983 + }, + { + "epoch": 0.73, + "learning_rate": 5.491917670648183e-05, + "loss": 0.5592, + "step": 984 + }, + { + "epoch": 0.73, + "learning_rate": 5.4641437768504824e-05, + "loss": 0.2332, + "step": 985 + }, + { + "epoch": 0.73, + "learning_rate": 5.436424641843475e-05, + "loss": 0.311, + "step": 986 + }, + { + "epoch": 0.73, + "learning_rate": 5.408760424801792e-05, + "loss": 0.4153, + "step": 987 + }, + { + "epoch": 0.73, + "learning_rate": 5.381151284584696e-05, + "loss": 0.3122, + "step": 988 + }, + { + "epoch": 0.73, + "learning_rate": 5.353597379735165e-05, + "loss": 0.9233, + "step": 989 + }, + { + "epoch": 0.73, + "learning_rate": 5.326098868479008e-05, + "loss": 0.2592, + "step": 990 + }, + { + "epoch": 0.73, + "learning_rate": 5.29865590872394e-05, + "loss": 0.3064, + "step": 991 + }, + { + "epoch": 0.73, + "learning_rate": 5.27126865805866e-05, + "loss": 0.2996, + "step": 992 + }, + { + "epoch": 0.73, + "learning_rate": 5.2439372737519876e-05, + "loss": 0.9199, + "step": 993 + }, + { + "epoch": 0.73, + "learning_rate": 5.2166619127519104e-05, + "loss": 0.2899, + "step": 994 + }, + { + "epoch": 0.74, + "learning_rate": 5.1894427316847226e-05, + "loss": 0.5593, + "step": 995 + }, + { + "epoch": 0.74, + "learning_rate": 5.1622798868541124e-05, + "loss": 0.3174, + "step": 996 + }, + { + "epoch": 0.74, + "learning_rate": 5.135173534240246e-05, + "loss": 0.4202, + "step": 997 + }, + { + "epoch": 0.74, + "learning_rate": 5.108123829498903e-05, + "loss": 0.417, + "step": 998 + }, + { + "epoch": 0.74, + "learning_rate": 5.0811309279605675e-05, + "loss": 0.2984, + "step": 999 + }, + { + "epoch": 0.74, + "learning_rate": 5.0541949846295234e-05, + "loss": 0.3554, + "step": 1000 + }, + { + "epoch": 0.74, + "eval_loss": 0.6549306511878967, + "eval_runtime": 12.9982, + "eval_samples_per_second": 0.846, + "eval_steps_per_second": 0.846, + "step": 1000 + } + ], + "logging_steps": 1, + "max_steps": 1352, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 2.8926123468907315e+17, + "trial_name": null, + "trial_params": null +}