{ "best_metric": 1.7484028447256605e-08, "best_model_checkpoint": "miner_id_24/checkpoint-400", "epoch": 1.339030892269092, "eval_steps": 25, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0033453557053996132, "grad_norm": 34.97755813598633, "learning_rate": 5.7142857142857145e-06, "loss": 8.3788, "step": 1 }, { "epoch": 0.0033453557053996132, "eval_loss": 8.701708793640137, "eval_runtime": 0.6981, "eval_samples_per_second": 71.624, "eval_steps_per_second": 18.622, "step": 1 }, { "epoch": 0.0066907114107992265, "grad_norm": 34.05320358276367, "learning_rate": 1.1428571428571429e-05, "loss": 8.4134, "step": 2 }, { "epoch": 0.01003606711619884, "grad_norm": 33.928340911865234, "learning_rate": 1.7142857142857145e-05, "loss": 8.5945, "step": 3 }, { "epoch": 0.013381422821598453, "grad_norm": 33.57594680786133, "learning_rate": 2.2857142857142858e-05, "loss": 8.5789, "step": 4 }, { "epoch": 0.016726778526998064, "grad_norm": 33.87229537963867, "learning_rate": 2.857142857142857e-05, "loss": 8.0631, "step": 5 }, { "epoch": 0.02007213423239768, "grad_norm": 32.26136016845703, "learning_rate": 3.428571428571429e-05, "loss": 7.0605, "step": 6 }, { "epoch": 0.02341748993779729, "grad_norm": 30.542055130004883, "learning_rate": 4e-05, "loss": 5.8203, "step": 7 }, { "epoch": 0.026762845643196906, "grad_norm": 25.88512420654297, "learning_rate": 4.5714285714285716e-05, "loss": 4.4277, "step": 8 }, { "epoch": 0.030108201348596517, "grad_norm": 23.75008201599121, "learning_rate": 5.142857142857143e-05, "loss": 3.2447, "step": 9 }, { "epoch": 0.03345355705399613, "grad_norm": 22.926939010620117, "learning_rate": 5.714285714285714e-05, "loss": 2.2102, "step": 10 }, { "epoch": 0.036798912759395744, "grad_norm": 20.52219581604004, "learning_rate": 6.285714285714286e-05, "loss": 1.2021, "step": 11 }, { "epoch": 0.04014426846479536, "grad_norm": 16.353342056274414, "learning_rate": 6.857142857142858e-05, "loss": 0.4418, "step": 12 }, { "epoch": 0.043489624170194974, "grad_norm": 3.0186972618103027, "learning_rate": 7.428571428571429e-05, "loss": 0.0675, "step": 13 }, { "epoch": 0.04683497987559458, "grad_norm": 0.8751585483551025, "learning_rate": 8e-05, "loss": 0.0163, "step": 14 }, { "epoch": 0.0501803355809942, "grad_norm": 0.1573815792798996, "learning_rate": 8.571428571428571e-05, "loss": 0.0023, "step": 15 }, { "epoch": 0.05352569128639381, "grad_norm": 0.026311950758099556, "learning_rate": 9.142857142857143e-05, "loss": 0.0003, "step": 16 }, { "epoch": 0.05687104699179343, "grad_norm": 0.005918859504163265, "learning_rate": 9.714285714285715e-05, "loss": 0.0001, "step": 17 }, { "epoch": 0.060216402697193035, "grad_norm": 0.0018804478459060192, "learning_rate": 0.00010285714285714286, "loss": 0.0, "step": 18 }, { "epoch": 0.06356175840259265, "grad_norm": 0.0009282229002565145, "learning_rate": 0.00010857142857142856, "loss": 0.0, "step": 19 }, { "epoch": 0.06690711410799226, "grad_norm": 0.0005526078166440129, "learning_rate": 0.00011428571428571428, "loss": 0.0, "step": 20 }, { "epoch": 0.07025246981339188, "grad_norm": 0.0004224847652949393, "learning_rate": 0.00012, "loss": 0.0, "step": 21 }, { "epoch": 0.07359782551879149, "grad_norm": 0.0002950274501927197, "learning_rate": 0.00012571428571428572, "loss": 0.0, "step": 22 }, { "epoch": 0.07694318122419111, "grad_norm": 0.000257886596955359, "learning_rate": 0.00013142857142857143, "loss": 0.0, "step": 23 }, { "epoch": 0.08028853692959072, "grad_norm": 0.00028627616120502353, "learning_rate": 0.00013714285714285716, "loss": 0.0, "step": 24 }, { "epoch": 0.08363389263499033, "grad_norm": 0.0014158413978293538, "learning_rate": 0.00014285714285714287, "loss": 0.0, "step": 25 }, { "epoch": 0.08363389263499033, "eval_loss": 8.41735800349852e-06, "eval_runtime": 0.6963, "eval_samples_per_second": 71.805, "eval_steps_per_second": 18.669, "step": 25 }, { "epoch": 0.08697924834038995, "grad_norm": 0.0031467110384255648, "learning_rate": 0.00014857142857142857, "loss": 0.0, "step": 26 }, { "epoch": 0.09032460404578956, "grad_norm": 0.013348601758480072, "learning_rate": 0.0001542857142857143, "loss": 0.0, "step": 27 }, { "epoch": 0.09366995975118916, "grad_norm": 0.01236347109079361, "learning_rate": 0.00016, "loss": 0.0, "step": 28 }, { "epoch": 0.09701531545658879, "grad_norm": 0.00939163751900196, "learning_rate": 0.00016571428571428575, "loss": 0.0, "step": 29 }, { "epoch": 0.1003606711619884, "grad_norm": 0.0040636854246258736, "learning_rate": 0.00017142857142857143, "loss": 0.0, "step": 30 }, { "epoch": 0.10370602686738802, "grad_norm": 0.0012760682730004191, "learning_rate": 0.00017714285714285713, "loss": 0.0, "step": 31 }, { "epoch": 0.10705138257278762, "grad_norm": 0.0007899609045125544, "learning_rate": 0.00018285714285714286, "loss": 0.0, "step": 32 }, { "epoch": 0.11039673827818723, "grad_norm": 0.00033409736352041364, "learning_rate": 0.00018857142857142857, "loss": 0.0, "step": 33 }, { "epoch": 0.11374209398358685, "grad_norm": 0.0002635143755469471, "learning_rate": 0.0001942857142857143, "loss": 0.0, "step": 34 }, { "epoch": 0.11708744968898646, "grad_norm": 0.00017794633458834141, "learning_rate": 0.0002, "loss": 0.0, "step": 35 }, { "epoch": 0.12043280539438607, "grad_norm": 0.00012796746159438044, "learning_rate": 0.00019999940228074457, "loss": 0.0, "step": 36 }, { "epoch": 0.12377816109978569, "grad_norm": 0.00011626619379967451, "learning_rate": 0.00019999760913091752, "loss": 0.0, "step": 37 }, { "epoch": 0.1271235168051853, "grad_norm": 0.00012612015416380018, "learning_rate": 0.00019999462057433665, "loss": 0.0, "step": 38 }, { "epoch": 0.13046887251058492, "grad_norm": 9.349120227852836e-05, "learning_rate": 0.00019999043665069795, "loss": 0.0, "step": 39 }, { "epoch": 0.13381422821598452, "grad_norm": 9.639420022722334e-05, "learning_rate": 0.00019998505741557492, "loss": 0.0, "step": 40 }, { "epoch": 0.13715958392138414, "grad_norm": 7.635910878889263e-05, "learning_rate": 0.00019997848294041817, "loss": 0.0, "step": 41 }, { "epoch": 0.14050493962678376, "grad_norm": 8.218162838602439e-05, "learning_rate": 0.00019997071331255407, "loss": 0.0, "step": 42 }, { "epoch": 0.14385029533218338, "grad_norm": 8.505357982357964e-05, "learning_rate": 0.00019996174863518392, "loss": 0.0, "step": 43 }, { "epoch": 0.14719565103758298, "grad_norm": 7.957301568239927e-05, "learning_rate": 0.00019995158902738237, "loss": 0.0, "step": 44 }, { "epoch": 0.1505410067429826, "grad_norm": 9.491829405305907e-05, "learning_rate": 0.0001999402346240959, "loss": 0.0, "step": 45 }, { "epoch": 0.15388636244838222, "grad_norm": 0.00010060270869871601, "learning_rate": 0.00019992768557614113, "loss": 0.0, "step": 46 }, { "epoch": 0.1572317181537818, "grad_norm": 9.678181231720373e-05, "learning_rate": 0.00019991394205020268, "loss": 0.0, "step": 47 }, { "epoch": 0.16057707385918144, "grad_norm": 8.876763604348525e-05, "learning_rate": 0.00019989900422883095, "loss": 0.0, "step": 48 }, { "epoch": 0.16392242956458106, "grad_norm": 9.82375568128191e-05, "learning_rate": 0.00019988287231043983, "loss": 0.0, "step": 49 }, { "epoch": 0.16726778526998065, "grad_norm": 0.00013796836719848216, "learning_rate": 0.00019986554650930396, "loss": 0.0, "step": 50 }, { "epoch": 0.16726778526998065, "eval_loss": 5.390903652369161e-07, "eval_runtime": 0.6972, "eval_samples_per_second": 71.711, "eval_steps_per_second": 18.645, "step": 50 }, { "epoch": 0.17061314097538027, "grad_norm": 0.00011019642261089757, "learning_rate": 0.00019984702705555583, "loss": 0.0, "step": 51 }, { "epoch": 0.1739584966807799, "grad_norm": 0.0001130071614170447, "learning_rate": 0.00019982731419518292, "loss": 0.0, "step": 52 }, { "epoch": 0.1773038523861795, "grad_norm": 0.00011905086284969002, "learning_rate": 0.00019980640819002425, "loss": 0.0, "step": 53 }, { "epoch": 0.1806492080915791, "grad_norm": 0.0001293181994697079, "learning_rate": 0.00019978430931776695, "loss": 0.0, "step": 54 }, { "epoch": 0.18399456379697873, "grad_norm": 0.00012432319635991007, "learning_rate": 0.00019976101787194266, "loss": 0.0, "step": 55 }, { "epoch": 0.18733991950237833, "grad_norm": 0.00011901009565917775, "learning_rate": 0.0001997365341619234, "loss": 0.0, "step": 56 }, { "epoch": 0.19068527520777795, "grad_norm": 0.00013753857638221234, "learning_rate": 0.00019971085851291787, "loss": 0.0, "step": 57 }, { "epoch": 0.19403063091317757, "grad_norm": 0.00013435599976219237, "learning_rate": 0.0001996839912659666, "loss": 0.0, "step": 58 }, { "epoch": 0.19737598661857716, "grad_norm": 0.00014754076255485415, "learning_rate": 0.0001996559327779379, "loss": 0.0, "step": 59 }, { "epoch": 0.2007213423239768, "grad_norm": 0.00015449147031176835, "learning_rate": 0.00019962668342152285, "loss": 0.0, "step": 60 }, { "epoch": 0.2040666980293764, "grad_norm": 0.0001306010817643255, "learning_rate": 0.0001995962435852304, "loss": 0.0, "step": 61 }, { "epoch": 0.20741205373477603, "grad_norm": 0.0001918449706863612, "learning_rate": 0.00019956461367338221, "loss": 0.0, "step": 62 }, { "epoch": 0.21075740944017562, "grad_norm": 0.00017537118401378393, "learning_rate": 0.0001995317941061074, "loss": 0.0, "step": 63 }, { "epoch": 0.21410276514557525, "grad_norm": 0.00011113757500424981, "learning_rate": 0.00019949778531933677, "loss": 0.0, "step": 64 }, { "epoch": 0.21744812085097487, "grad_norm": 0.00010741417645476758, "learning_rate": 0.00019946258776479711, "loss": 0.0, "step": 65 }, { "epoch": 0.22079347655637446, "grad_norm": 0.0001034390734275803, "learning_rate": 0.00019942620191000526, "loss": 0.0, "step": 66 }, { "epoch": 0.22413883226177408, "grad_norm": 9.823393338592723e-05, "learning_rate": 0.0001993886282382618, "loss": 0.0, "step": 67 }, { "epoch": 0.2274841879671737, "grad_norm": 7.32116459403187e-05, "learning_rate": 0.00019934986724864468, "loss": 0.0, "step": 68 }, { "epoch": 0.2308295436725733, "grad_norm": 6.675547774648294e-05, "learning_rate": 0.0001993099194560025, "loss": 0.0, "step": 69 }, { "epoch": 0.23417489937797292, "grad_norm": 7.420439214911312e-05, "learning_rate": 0.0001992687853909478, "loss": 0.0, "step": 70 }, { "epoch": 0.23752025508337254, "grad_norm": 6.421384023269638e-05, "learning_rate": 0.0001992264655998501, "loss": 0.0, "step": 71 }, { "epoch": 0.24086561078877214, "grad_norm": 6.552231207024306e-05, "learning_rate": 0.00019918296064482823, "loss": 0.0, "step": 72 }, { "epoch": 0.24421096649417176, "grad_norm": 5.69834592170082e-05, "learning_rate": 0.00019913827110374345, "loss": 0.0, "step": 73 }, { "epoch": 0.24755632219957138, "grad_norm": 6.455439142882824e-05, "learning_rate": 0.00019909239757019112, "loss": 0.0, "step": 74 }, { "epoch": 0.250901677904971, "grad_norm": 8.113776857499033e-05, "learning_rate": 0.00019904534065349344, "loss": 0.0, "step": 75 }, { "epoch": 0.250901677904971, "eval_loss": 2.551077784573863e-07, "eval_runtime": 0.6974, "eval_samples_per_second": 71.695, "eval_steps_per_second": 18.641, "step": 75 }, { "epoch": 0.2542470336103706, "grad_norm": 5.313381188898347e-05, "learning_rate": 0.00019899710097869095, "loss": 0.0, "step": 76 }, { "epoch": 0.2575923893157702, "grad_norm": 4.714027090813033e-05, "learning_rate": 0.0001989476791865344, "loss": 0.0, "step": 77 }, { "epoch": 0.26093774502116984, "grad_norm": 4.053263910464011e-05, "learning_rate": 0.00019889707593347613, "loss": 0.0, "step": 78 }, { "epoch": 0.26428310072656946, "grad_norm": 4.4172094931127504e-05, "learning_rate": 0.00019884529189166143, "loss": 0.0, "step": 79 }, { "epoch": 0.26762845643196903, "grad_norm": 3.566274244803935e-05, "learning_rate": 0.00019879232774891966, "loss": 0.0, "step": 80 }, { "epoch": 0.27097381213736865, "grad_norm": 3.396796091692522e-05, "learning_rate": 0.00019873818420875495, "loss": 0.0, "step": 81 }, { "epoch": 0.2743191678427683, "grad_norm": 3.43717765645124e-05, "learning_rate": 0.00019868286199033698, "loss": 0.0, "step": 82 }, { "epoch": 0.2776645235481679, "grad_norm": 3.289350570412353e-05, "learning_rate": 0.00019862636182849152, "loss": 0.0, "step": 83 }, { "epoch": 0.2810098792535675, "grad_norm": 3.4276279620826244e-05, "learning_rate": 0.0001985686844736904, "loss": 0.0, "step": 84 }, { "epoch": 0.28435523495896714, "grad_norm": 3.2609026675345376e-05, "learning_rate": 0.00019850983069204166, "loss": 0.0, "step": 85 }, { "epoch": 0.28770059066436676, "grad_norm": 2.833784674294293e-05, "learning_rate": 0.00019844980126527964, "loss": 0.0, "step": 86 }, { "epoch": 0.29104594636976633, "grad_norm": 3.57207354682032e-05, "learning_rate": 0.00019838859699075408, "loss": 0.0, "step": 87 }, { "epoch": 0.29439130207516595, "grad_norm": 3.2909458241192624e-05, "learning_rate": 0.00019832621868142006, "loss": 0.0, "step": 88 }, { "epoch": 0.2977366577805656, "grad_norm": 2.729486550379079e-05, "learning_rate": 0.0001982626671658267, "loss": 0.0, "step": 89 }, { "epoch": 0.3010820134859652, "grad_norm": 2.5987241315306164e-05, "learning_rate": 0.0001981979432881067, "loss": 0.0, "step": 90 }, { "epoch": 0.3044273691913648, "grad_norm": 2.5494544388493523e-05, "learning_rate": 0.0001981320479079646, "loss": 0.0, "step": 91 }, { "epoch": 0.30777272489676444, "grad_norm": 2.423186924715992e-05, "learning_rate": 0.00019806498190066564, "loss": 0.0, "step": 92 }, { "epoch": 0.311118080602164, "grad_norm": 2.174468318116851e-05, "learning_rate": 0.00019799674615702425, "loss": 0.0, "step": 93 }, { "epoch": 0.3144634363075636, "grad_norm": 2.084146035485901e-05, "learning_rate": 0.00019792734158339178, "loss": 0.0, "step": 94 }, { "epoch": 0.31780879201296325, "grad_norm": 2.2315543901640922e-05, "learning_rate": 0.00019785676910164503, "loss": 0.0, "step": 95 }, { "epoch": 0.32115414771836287, "grad_norm": 2.352862611587625e-05, "learning_rate": 0.00019778502964917357, "loss": 0.0, "step": 96 }, { "epoch": 0.3244995034237625, "grad_norm": 2.2483140128315426e-05, "learning_rate": 0.00019771212417886736, "loss": 0.0, "step": 97 }, { "epoch": 0.3278448591291621, "grad_norm": 1.9780287402682006e-05, "learning_rate": 0.00019763805365910432, "loss": 0.0, "step": 98 }, { "epoch": 0.3311902148345617, "grad_norm": 2.0650611986638978e-05, "learning_rate": 0.00019756281907373725, "loss": 0.0, "step": 99 }, { "epoch": 0.3345355705399613, "grad_norm": 2.816585219989065e-05, "learning_rate": 0.00019748642142208083, "loss": 0.0, "step": 100 }, { "epoch": 0.3345355705399613, "eval_loss": 1.2848110486629594e-07, "eval_runtime": 0.6967, "eval_samples_per_second": 71.768, "eval_steps_per_second": 18.66, "step": 100 }, { "epoch": 0.3378809262453609, "grad_norm": 2.172272616007831e-05, "learning_rate": 0.0001974088617188983, "loss": 0.0, "step": 101 }, { "epoch": 0.34122628195076055, "grad_norm": 1.918191264849156e-05, "learning_rate": 0.00019733014099438808, "loss": 0.0, "step": 102 }, { "epoch": 0.34457163765616017, "grad_norm": 1.76628655026434e-05, "learning_rate": 0.00019725026029416998, "loss": 0.0, "step": 103 }, { "epoch": 0.3479169933615598, "grad_norm": 1.906087709357962e-05, "learning_rate": 0.00019716922067927143, "loss": 0.0, "step": 104 }, { "epoch": 0.3512623490669594, "grad_norm": 1.781911305442918e-05, "learning_rate": 0.00019708702322611329, "loss": 0.0, "step": 105 }, { "epoch": 0.354607704772359, "grad_norm": 1.6582851458224468e-05, "learning_rate": 0.00019700366902649556, "loss": 0.0, "step": 106 }, { "epoch": 0.3579530604777586, "grad_norm": 1.7815795217757113e-05, "learning_rate": 0.00019691915918758292, "loss": 0.0, "step": 107 }, { "epoch": 0.3612984161831582, "grad_norm": 1.774485281202942e-05, "learning_rate": 0.00019683349483188995, "loss": 0.0, "step": 108 }, { "epoch": 0.36464377188855784, "grad_norm": 1.802383667381946e-05, "learning_rate": 0.00019674667709726636, "loss": 0.0, "step": 109 }, { "epoch": 0.36798912759395747, "grad_norm": 1.783985862857662e-05, "learning_rate": 0.0001966587071368817, "loss": 0.0, "step": 110 }, { "epoch": 0.3713344832993571, "grad_norm": 1.5682384400861338e-05, "learning_rate": 0.0001965695861192102, "loss": 0.0, "step": 111 }, { "epoch": 0.37467983900475665, "grad_norm": 1.927248013089411e-05, "learning_rate": 0.00019647931522801516, "loss": 0.0, "step": 112 }, { "epoch": 0.3780251947101563, "grad_norm": 2.0171553842374124e-05, "learning_rate": 0.00019638789566233327, "loss": 0.0, "step": 113 }, { "epoch": 0.3813705504155559, "grad_norm": 1.5699502910138108e-05, "learning_rate": 0.00019629532863645857, "loss": 0.0, "step": 114 }, { "epoch": 0.3847159061209555, "grad_norm": 1.578126466483809e-05, "learning_rate": 0.00019620161537992653, "loss": 0.0, "step": 115 }, { "epoch": 0.38806126182635514, "grad_norm": 1.5141530639084522e-05, "learning_rate": 0.00019610675713749748, "loss": 0.0, "step": 116 }, { "epoch": 0.39140661753175476, "grad_norm": 1.5276151316356845e-05, "learning_rate": 0.00019601075516914037, "loss": 0.0, "step": 117 }, { "epoch": 0.39475197323715433, "grad_norm": 1.4150657989375759e-05, "learning_rate": 0.0001959136107500157, "loss": 0.0, "step": 118 }, { "epoch": 0.39809732894255395, "grad_norm": 1.4487253793049604e-05, "learning_rate": 0.00019581532517045876, "loss": 0.0, "step": 119 }, { "epoch": 0.4014426846479536, "grad_norm": 1.532996611786075e-05, "learning_rate": 0.0001957158997359626, "loss": 0.0, "step": 120 }, { "epoch": 0.4047880403533532, "grad_norm": 1.602738120709546e-05, "learning_rate": 0.00019561533576716042, "loss": 0.0, "step": 121 }, { "epoch": 0.4081333960587528, "grad_norm": 1.5380253898911178e-05, "learning_rate": 0.00019551363459980825, "loss": 0.0, "step": 122 }, { "epoch": 0.41147875176415244, "grad_norm": 1.377376975142397e-05, "learning_rate": 0.0001954107975847671, "loss": 0.0, "step": 123 }, { "epoch": 0.41482410746955206, "grad_norm": 1.4542466487910133e-05, "learning_rate": 0.0001953068260879851, "loss": 0.0, "step": 124 }, { "epoch": 0.41816946317495163, "grad_norm": 1.86767902050633e-05, "learning_rate": 0.00019520172149047922, "loss": 0.0, "step": 125 }, { "epoch": 0.41816946317495163, "eval_loss": 9.377796317266984e-08, "eval_runtime": 0.6966, "eval_samples_per_second": 71.778, "eval_steps_per_second": 18.662, "step": 125 }, { "epoch": 0.42151481888035125, "grad_norm": 1.5794998034834862e-05, "learning_rate": 0.00019509548518831707, "loss": 0.0, "step": 126 }, { "epoch": 0.42486017458575087, "grad_norm": 1.375094689137768e-05, "learning_rate": 0.00019498811859259828, "loss": 0.0, "step": 127 }, { "epoch": 0.4282055302911505, "grad_norm": 1.2839677765441593e-05, "learning_rate": 0.0001948796231294358, "loss": 0.0, "step": 128 }, { "epoch": 0.4315508859965501, "grad_norm": 1.3645031685882714e-05, "learning_rate": 0.00019477000023993688, "loss": 0.0, "step": 129 }, { "epoch": 0.43489624170194974, "grad_norm": 1.3053410839347634e-05, "learning_rate": 0.00019465925138018397, "loss": 0.0, "step": 130 }, { "epoch": 0.4382415974073493, "grad_norm": 1.2471873560571112e-05, "learning_rate": 0.0001945473780212155, "loss": 0.0, "step": 131 }, { "epoch": 0.4415869531127489, "grad_norm": 1.3388749721343629e-05, "learning_rate": 0.00019443438164900613, "loss": 0.0, "step": 132 }, { "epoch": 0.44493230881814855, "grad_norm": 1.304280522163026e-05, "learning_rate": 0.00019432026376444713, "loss": 0.0, "step": 133 }, { "epoch": 0.44827766452354817, "grad_norm": 1.368090306641534e-05, "learning_rate": 0.0001942050258833264, "loss": 0.0, "step": 134 }, { "epoch": 0.4516230202289478, "grad_norm": 1.3070153727312572e-05, "learning_rate": 0.00019408866953630848, "loss": 0.0, "step": 135 }, { "epoch": 0.4549683759343474, "grad_norm": 1.2403514119796455e-05, "learning_rate": 0.00019397119626891394, "loss": 0.0, "step": 136 }, { "epoch": 0.45831373163974704, "grad_norm": 1.48671724673477e-05, "learning_rate": 0.0001938526076414991, "loss": 0.0, "step": 137 }, { "epoch": 0.4616590873451466, "grad_norm": 1.4370959434018005e-05, "learning_rate": 0.00019373290522923525, "loss": 0.0, "step": 138 }, { "epoch": 0.4650044430505462, "grad_norm": 1.2077974133717362e-05, "learning_rate": 0.0001936120906220876, "loss": 0.0, "step": 139 }, { "epoch": 0.46834979875594585, "grad_norm": 1.176060141006019e-05, "learning_rate": 0.00019349016542479432, "loss": 0.0, "step": 140 }, { "epoch": 0.47169515446134547, "grad_norm": 1.1930264918191824e-05, "learning_rate": 0.0001933671312568452, "loss": 0.0, "step": 141 }, { "epoch": 0.4750405101667451, "grad_norm": 1.1996191460639238e-05, "learning_rate": 0.00019324298975245997, "loss": 0.0, "step": 142 }, { "epoch": 0.4783858658721447, "grad_norm": 1.1079649993916973e-05, "learning_rate": 0.0001931177425605668, "loss": 0.0, "step": 143 }, { "epoch": 0.4817312215775443, "grad_norm": 1.1510542208270635e-05, "learning_rate": 0.0001929913913447804, "loss": 0.0, "step": 144 }, { "epoch": 0.4850765772829439, "grad_norm": 1.1645475751720369e-05, "learning_rate": 0.00019286393778337966, "loss": 0.0, "step": 145 }, { "epoch": 0.4884219329883435, "grad_norm": 1.2438331395969726e-05, "learning_rate": 0.0001927353835692857, "loss": 0.0, "step": 146 }, { "epoch": 0.49176728869374314, "grad_norm": 1.1748921679100022e-05, "learning_rate": 0.0001926057304100392, "loss": 0.0, "step": 147 }, { "epoch": 0.49511264439914277, "grad_norm": 1.1038876436941791e-05, "learning_rate": 0.00019247498002777764, "loss": 0.0, "step": 148 }, { "epoch": 0.4984580001045424, "grad_norm": 1.1409942999307532e-05, "learning_rate": 0.00019234313415921264, "loss": 0.0, "step": 149 }, { "epoch": 0.501803355809942, "grad_norm": 1.3146785022399854e-05, "learning_rate": 0.0001922101945556067, "loss": 0.0, "step": 150 }, { "epoch": 0.501803355809942, "eval_loss": 7.655884814994351e-08, "eval_runtime": 0.6979, "eval_samples_per_second": 71.647, "eval_steps_per_second": 18.628, "step": 150 }, { "epoch": 0.5051487115153416, "grad_norm": 1.1849874681502115e-05, "learning_rate": 0.00019207616298275016, "loss": 0.0, "step": 151 }, { "epoch": 0.5084940672207412, "grad_norm": 1.0913483492913656e-05, "learning_rate": 0.0001919410412209374, "loss": 0.0, "step": 152 }, { "epoch": 0.5118394229261408, "grad_norm": 1.0119189028046094e-05, "learning_rate": 0.00019180483106494354, "loss": 0.0, "step": 153 }, { "epoch": 0.5151847786315404, "grad_norm": 1.0729244422691409e-05, "learning_rate": 0.00019166753432400046, "loss": 0.0, "step": 154 }, { "epoch": 0.5185301343369401, "grad_norm": 1.064784919435624e-05, "learning_rate": 0.00019152915282177267, "loss": 0.0, "step": 155 }, { "epoch": 0.5218754900423397, "grad_norm": 9.912720088323113e-06, "learning_rate": 0.0001913896883963333, "loss": 0.0, "step": 156 }, { "epoch": 0.5252208457477393, "grad_norm": 1.0703661246225238e-05, "learning_rate": 0.0001912491429001395, "loss": 0.0, "step": 157 }, { "epoch": 0.5285662014531389, "grad_norm": 1.0600273526506498e-05, "learning_rate": 0.00019110751820000795, "loss": 0.0, "step": 158 }, { "epoch": 0.5319115571585386, "grad_norm": 1.1079074283770751e-05, "learning_rate": 0.00019096481617708998, "loss": 0.0, "step": 159 }, { "epoch": 0.5352569128639381, "grad_norm": 1.0442281563882716e-05, "learning_rate": 0.00019082103872684653, "loss": 0.0, "step": 160 }, { "epoch": 0.5386022685693377, "grad_norm": 1.0094959179696161e-05, "learning_rate": 0.00019067618775902328, "loss": 0.0, "step": 161 }, { "epoch": 0.5419476242747373, "grad_norm": 1.1112629181297962e-05, "learning_rate": 0.000190530265197625, "loss": 0.0, "step": 162 }, { "epoch": 0.5452929799801369, "grad_norm": 1.192512809211621e-05, "learning_rate": 0.00019038327298088987, "loss": 0.0, "step": 163 }, { "epoch": 0.5486383356855365, "grad_norm": 9.673723980085924e-06, "learning_rate": 0.00019023521306126419, "loss": 0.0, "step": 164 }, { "epoch": 0.5519836913909362, "grad_norm": 9.96262224361999e-06, "learning_rate": 0.000190086087405376, "loss": 0.0, "step": 165 }, { "epoch": 0.5553290470963358, "grad_norm": 9.270826922147535e-06, "learning_rate": 0.00018993589799400925, "loss": 0.0, "step": 166 }, { "epoch": 0.5586744028017354, "grad_norm": 1.005772537610028e-05, "learning_rate": 0.00018978464682207732, "loss": 0.0, "step": 167 }, { "epoch": 0.562019758507135, "grad_norm": 9.0111334429821e-06, "learning_rate": 0.00018963233589859666, "loss": 0.0, "step": 168 }, { "epoch": 0.5653651142125347, "grad_norm": 9.614282134862151e-06, "learning_rate": 0.0001894789672466599, "loss": 0.0, "step": 169 }, { "epoch": 0.5687104699179343, "grad_norm": 9.66140487435041e-06, "learning_rate": 0.00018932454290340923, "loss": 0.0, "step": 170 }, { "epoch": 0.5720558256233339, "grad_norm": 1.0295873835275415e-05, "learning_rate": 0.00018916906492000922, "loss": 0.0, "step": 171 }, { "epoch": 0.5754011813287335, "grad_norm": 9.303810657002032e-06, "learning_rate": 0.00018901253536161941, "loss": 0.0, "step": 172 }, { "epoch": 0.578746537034133, "grad_norm": 9.303402293880936e-06, "learning_rate": 0.0001888549563073672, "loss": 0.0, "step": 173 }, { "epoch": 0.5820918927395327, "grad_norm": 9.276480341213755e-06, "learning_rate": 0.00018869632985032007, "loss": 0.0, "step": 174 }, { "epoch": 0.5854372484449323, "grad_norm": 1.123367837863043e-05, "learning_rate": 0.00018853665809745774, "loss": 0.0, "step": 175 }, { "epoch": 0.5854372484449323, "eval_loss": 6.516773964904132e-08, "eval_runtime": 0.6966, "eval_samples_per_second": 71.775, "eval_steps_per_second": 18.661, "step": 175 }, { "epoch": 0.5887826041503319, "grad_norm": 9.455653525947127e-06, "learning_rate": 0.00018837594316964423, "loss": 0.0, "step": 176 }, { "epoch": 0.5921279598557315, "grad_norm": 9.022469384944998e-06, "learning_rate": 0.00018821418720159965, "loss": 0.0, "step": 177 }, { "epoch": 0.5954733155611311, "grad_norm": 8.155801879183855e-06, "learning_rate": 0.00018805139234187202, "loss": 0.0, "step": 178 }, { "epoch": 0.5988186712665308, "grad_norm": 9.403297553944867e-06, "learning_rate": 0.00018788756075280842, "loss": 0.0, "step": 179 }, { "epoch": 0.6021640269719304, "grad_norm": 8.975248420028947e-06, "learning_rate": 0.00018772269461052657, "loss": 0.0, "step": 180 }, { "epoch": 0.60550938267733, "grad_norm": 8.154027455020696e-06, "learning_rate": 0.00018755679610488573, "loss": 0.0, "step": 181 }, { "epoch": 0.6088547383827296, "grad_norm": 8.822008567221928e-06, "learning_rate": 0.0001873898674394577, "loss": 0.0, "step": 182 }, { "epoch": 0.6122000940881293, "grad_norm": 9.123429663304705e-06, "learning_rate": 0.0001872219108314976, "loss": 0.0, "step": 183 }, { "epoch": 0.6155454497935289, "grad_norm": 8.927063390729018e-06, "learning_rate": 0.00018705292851191424, "loss": 0.0, "step": 184 }, { "epoch": 0.6188908054989284, "grad_norm": 8.723560313228518e-06, "learning_rate": 0.00018688292272524067, "loss": 0.0, "step": 185 }, { "epoch": 0.622236161204328, "grad_norm": 8.473615707771387e-06, "learning_rate": 0.00018671189572960423, "loss": 0.0, "step": 186 }, { "epoch": 0.6255815169097276, "grad_norm": 9.322277946921531e-06, "learning_rate": 0.00018653984979669669, "loss": 0.0, "step": 187 }, { "epoch": 0.6289268726151273, "grad_norm": 9.733713341120165e-06, "learning_rate": 0.00018636678721174402, "loss": 0.0, "step": 188 }, { "epoch": 0.6322722283205269, "grad_norm": 8.013458682398777e-06, "learning_rate": 0.00018619271027347592, "loss": 0.0, "step": 189 }, { "epoch": 0.6356175840259265, "grad_norm": 8.55548023537267e-06, "learning_rate": 0.00018601762129409545, "loss": 0.0, "step": 190 }, { "epoch": 0.6389629397313261, "grad_norm": 8.246594006777741e-06, "learning_rate": 0.00018584152259924834, "loss": 0.0, "step": 191 }, { "epoch": 0.6423082954367257, "grad_norm": 8.456608156848233e-06, "learning_rate": 0.00018566441652799188, "loss": 0.0, "step": 192 }, { "epoch": 0.6456536511421254, "grad_norm": 7.874112270656042e-06, "learning_rate": 0.00018548630543276407, "loss": 0.0, "step": 193 }, { "epoch": 0.648999006847525, "grad_norm": 7.750689292151947e-06, "learning_rate": 0.00018530719167935227, "loss": 0.0, "step": 194 }, { "epoch": 0.6523443625529246, "grad_norm": 7.881866622483358e-06, "learning_rate": 0.0001851270776468618, "loss": 0.0, "step": 195 }, { "epoch": 0.6556897182583242, "grad_norm": 8.531223102181684e-06, "learning_rate": 0.00018494596572768432, "loss": 0.0, "step": 196 }, { "epoch": 0.6590350739637238, "grad_norm": 8.375371180591173e-06, "learning_rate": 0.0001847638583274661, "loss": 0.0, "step": 197 }, { "epoch": 0.6623804296691234, "grad_norm": 7.963157258927822e-06, "learning_rate": 0.00018458075786507606, "loss": 0.0, "step": 198 }, { "epoch": 0.665725785374523, "grad_norm": 7.747204108454753e-06, "learning_rate": 0.00018439666677257346, "loss": 0.0, "step": 199 }, { "epoch": 0.6690711410799226, "grad_norm": 9.520526873529889e-06, "learning_rate": 0.000184211587495176, "loss": 0.0, "step": 200 }, { "epoch": 0.6690711410799226, "eval_loss": 5.377663114813913e-08, "eval_runtime": 0.6967, "eval_samples_per_second": 71.762, "eval_steps_per_second": 18.658, "step": 200 }, { "epoch": 0.6724164967853222, "grad_norm": 8.370650903088972e-06, "learning_rate": 0.00018402552249122687, "loss": 0.0, "step": 201 }, { "epoch": 0.6757618524907218, "grad_norm": 7.880154953454621e-06, "learning_rate": 0.0001838384742321625, "loss": 0.0, "step": 202 }, { "epoch": 0.6791072081961215, "grad_norm": 7.438328793796245e-06, "learning_rate": 0.0001836504452024794, "loss": 0.0, "step": 203 }, { "epoch": 0.6824525639015211, "grad_norm": 7.994419320311863e-06, "learning_rate": 0.00018346143789970147, "loss": 0.0, "step": 204 }, { "epoch": 0.6857979196069207, "grad_norm": 7.780427949910518e-06, "learning_rate": 0.00018327145483434647, "loss": 0.0, "step": 205 }, { "epoch": 0.6891432753123203, "grad_norm": 7.139515673770802e-06, "learning_rate": 0.00018308049852989308, "loss": 0.0, "step": 206 }, { "epoch": 0.69248863101772, "grad_norm": 7.582210855616722e-06, "learning_rate": 0.00018288857152274704, "loss": 0.0, "step": 207 }, { "epoch": 0.6958339867231196, "grad_norm": 7.525003638875205e-06, "learning_rate": 0.00018269567636220764, "loss": 0.0, "step": 208 }, { "epoch": 0.6991793424285192, "grad_norm": 8.075163350440562e-06, "learning_rate": 0.0001825018156104338, "loss": 0.0, "step": 209 }, { "epoch": 0.7025246981339188, "grad_norm": 7.745608854747843e-06, "learning_rate": 0.0001823069918424101, "loss": 0.0, "step": 210 }, { "epoch": 0.7058700538393183, "grad_norm": 7.288683718797984e-06, "learning_rate": 0.00018211120764591242, "loss": 0.0, "step": 211 }, { "epoch": 0.709215409544718, "grad_norm": 8.243901902460493e-06, "learning_rate": 0.00018191446562147376, "loss": 0.0, "step": 212 }, { "epoch": 0.7125607652501176, "grad_norm": 8.35837363410974e-06, "learning_rate": 0.00018171676838234964, "loss": 0.0, "step": 213 }, { "epoch": 0.7159061209555172, "grad_norm": 7.451297278748825e-06, "learning_rate": 0.00018151811855448327, "loss": 0.0, "step": 214 }, { "epoch": 0.7192514766609168, "grad_norm": 7.314912181755062e-06, "learning_rate": 0.0001813185187764708, "loss": 0.0, "step": 215 }, { "epoch": 0.7225968323663164, "grad_norm": 7.161909252317855e-06, "learning_rate": 0.00018111797169952632, "loss": 0.0, "step": 216 }, { "epoch": 0.7259421880717161, "grad_norm": 7.5424572969495784e-06, "learning_rate": 0.0001809164799874464, "loss": 0.0, "step": 217 }, { "epoch": 0.7292875437771157, "grad_norm": 7.037332579784561e-06, "learning_rate": 0.00018071404631657504, "loss": 0.0, "step": 218 }, { "epoch": 0.7326328994825153, "grad_norm": 6.815141205152031e-06, "learning_rate": 0.00018051067337576776, "loss": 0.0, "step": 219 }, { "epoch": 0.7359782551879149, "grad_norm": 7.194412773969816e-06, "learning_rate": 0.00018030636386635624, "loss": 0.0, "step": 220 }, { "epoch": 0.7393236108933146, "grad_norm": 7.831909897504374e-06, "learning_rate": 0.00018010112050211222, "loss": 0.0, "step": 221 }, { "epoch": 0.7426689665987142, "grad_norm": 7.283876584551763e-06, "learning_rate": 0.00017989494600921147, "loss": 0.0, "step": 222 }, { "epoch": 0.7460143223041138, "grad_norm": 6.991411737544695e-06, "learning_rate": 0.00017968784312619765, "loss": 0.0, "step": 223 }, { "epoch": 0.7493596780095133, "grad_norm": 6.847162694612052e-06, "learning_rate": 0.0001794798146039459, "loss": 0.0, "step": 224 }, { "epoch": 0.7527050337149129, "grad_norm": 8.925090696720872e-06, "learning_rate": 0.00017927086320562626, "loss": 0.0, "step": 225 }, { "epoch": 0.7527050337149129, "eval_loss": 4.344515502907598e-08, "eval_runtime": 0.6965, "eval_samples_per_second": 71.787, "eval_steps_per_second": 18.665, "step": 225 }, { "epoch": 0.7560503894203126, "grad_norm": 7.99406370788347e-06, "learning_rate": 0.0001790609917066671, "loss": 0.0, "step": 226 }, { "epoch": 0.7593957451257122, "grad_norm": 7.21568358130753e-06, "learning_rate": 0.00017885020289471813, "loss": 0.0, "step": 227 }, { "epoch": 0.7627411008311118, "grad_norm": 6.836844477220438e-06, "learning_rate": 0.0001786384995696133, "loss": 0.0, "step": 228 }, { "epoch": 0.7660864565365114, "grad_norm": 6.983234925428405e-06, "learning_rate": 0.00017842588454333386, "loss": 0.0, "step": 229 }, { "epoch": 0.769431812241911, "grad_norm": 7.0652272370352875e-06, "learning_rate": 0.0001782123606399708, "loss": 0.0, "step": 230 }, { "epoch": 0.7727771679473107, "grad_norm": 6.449628926930018e-06, "learning_rate": 0.00017799793069568743, "loss": 0.0, "step": 231 }, { "epoch": 0.7761225236527103, "grad_norm": 6.692131137242541e-06, "learning_rate": 0.00017778259755868168, "loss": 0.0, "step": 232 }, { "epoch": 0.7794678793581099, "grad_norm": 6.838503395556472e-06, "learning_rate": 0.00017756636408914828, "loss": 0.0, "step": 233 }, { "epoch": 0.7828132350635095, "grad_norm": 7.440746685460908e-06, "learning_rate": 0.0001773492331592407, "loss": 0.0, "step": 234 }, { "epoch": 0.7861585907689091, "grad_norm": 6.901387223479105e-06, "learning_rate": 0.00017713120765303314, "loss": 0.0, "step": 235 }, { "epoch": 0.7895039464743087, "grad_norm": 6.498331458715256e-06, "learning_rate": 0.00017691229046648218, "loss": 0.0, "step": 236 }, { "epoch": 0.7928493021797083, "grad_norm": 7.556504897365812e-06, "learning_rate": 0.0001766924845073881, "loss": 0.0, "step": 237 }, { "epoch": 0.7961946578851079, "grad_norm": 7.73125884734327e-06, "learning_rate": 0.0001764717926953566, "loss": 0.0, "step": 238 }, { "epoch": 0.7995400135905075, "grad_norm": 7.303025540750241e-06, "learning_rate": 0.00017625021796175983, "loss": 0.0, "step": 239 }, { "epoch": 0.8028853692959071, "grad_norm": 6.573012797161937e-06, "learning_rate": 0.00017602776324969748, "loss": 0.0, "step": 240 }, { "epoch": 0.8062307250013068, "grad_norm": 6.783994649595115e-06, "learning_rate": 0.00017580443151395754, "loss": 0.0, "step": 241 }, { "epoch": 0.8095760807067064, "grad_norm": 6.804512850067113e-06, "learning_rate": 0.00017558022572097741, "loss": 0.0, "step": 242 }, { "epoch": 0.812921436412106, "grad_norm": 6.472089808085002e-06, "learning_rate": 0.0001753551488488042, "loss": 0.0, "step": 243 }, { "epoch": 0.8162667921175056, "grad_norm": 6.346880581986625e-06, "learning_rate": 0.00017512920388705518, "loss": 0.0, "step": 244 }, { "epoch": 0.8196121478229053, "grad_norm": 6.590293651242973e-06, "learning_rate": 0.00017490239383687828, "loss": 0.0, "step": 245 }, { "epoch": 0.8229575035283049, "grad_norm": 7.070681931509171e-06, "learning_rate": 0.00017467472171091202, "loss": 0.0, "step": 246 }, { "epoch": 0.8263028592337045, "grad_norm": 6.895753358548973e-06, "learning_rate": 0.0001744461905332456, "loss": 0.0, "step": 247 }, { "epoch": 0.8296482149391041, "grad_norm": 6.528034646180458e-06, "learning_rate": 0.00017421680333937868, "loss": 0.0, "step": 248 }, { "epoch": 0.8329935706445036, "grad_norm": 6.364792625390692e-06, "learning_rate": 0.00017398656317618114, "loss": 0.0, "step": 249 }, { "epoch": 0.8363389263499033, "grad_norm": 7.988395736902021e-06, "learning_rate": 0.00017375547310185247, "loss": 0.0, "step": 250 }, { "epoch": 0.8363389263499033, "eval_loss": 3.814696825088504e-08, "eval_runtime": 0.697, "eval_samples_per_second": 71.733, "eval_steps_per_second": 18.65, "step": 250 }, { "epoch": 0.8396842820553029, "grad_norm": 7.5617986112774815e-06, "learning_rate": 0.00017352353618588128, "loss": 0.0, "step": 251 }, { "epoch": 0.8430296377607025, "grad_norm": 6.9844604695390444e-06, "learning_rate": 0.0001732907555090045, "loss": 0.0, "step": 252 }, { "epoch": 0.8463749934661021, "grad_norm": 6.284686605795287e-06, "learning_rate": 0.00017305713416316637, "loss": 0.0, "step": 253 }, { "epoch": 0.8497203491715017, "grad_norm": 6.63768696540501e-06, "learning_rate": 0.00017282267525147756, "loss": 0.0, "step": 254 }, { "epoch": 0.8530657048769014, "grad_norm": 6.447197847592179e-06, "learning_rate": 0.00017258738188817365, "loss": 0.0, "step": 255 }, { "epoch": 0.856411060582301, "grad_norm": 6.166322691569803e-06, "learning_rate": 0.00017235125719857416, "loss": 0.0, "step": 256 }, { "epoch": 0.8597564162877006, "grad_norm": 6.398379809979815e-06, "learning_rate": 0.0001721143043190407, "loss": 0.0, "step": 257 }, { "epoch": 0.8631017719931002, "grad_norm": 6.675863460259279e-06, "learning_rate": 0.00017187652639693546, "loss": 0.0, "step": 258 }, { "epoch": 0.8664471276984999, "grad_norm": 6.698783181491308e-06, "learning_rate": 0.0001716379265905794, "loss": 0.0, "step": 259 }, { "epoch": 0.8697924834038995, "grad_norm": 6.247206783882575e-06, "learning_rate": 0.00017139850806921023, "loss": 0.0, "step": 260 }, { "epoch": 0.8731378391092991, "grad_norm": 6.243699317565188e-06, "learning_rate": 0.0001711582740129404, "loss": 0.0, "step": 261 }, { "epoch": 0.8764831948146986, "grad_norm": 7.0798228080093395e-06, "learning_rate": 0.0001709172276127149, "loss": 0.0, "step": 262 }, { "epoch": 0.8798285505200982, "grad_norm": 7.133877716114512e-06, "learning_rate": 0.00017067537207026863, "loss": 0.0, "step": 263 }, { "epoch": 0.8831739062254979, "grad_norm": 7.0753931140643544e-06, "learning_rate": 0.0001704327105980842, "loss": 0.0, "step": 264 }, { "epoch": 0.8865192619308975, "grad_norm": 6.391106126102386e-06, "learning_rate": 0.00017018924641934907, "loss": 0.0, "step": 265 }, { "epoch": 0.8898646176362971, "grad_norm": 6.623633908020565e-06, "learning_rate": 0.00016994498276791265, "loss": 0.0, "step": 266 }, { "epoch": 0.8932099733416967, "grad_norm": 6.427198059100192e-06, "learning_rate": 0.00016969992288824365, "loss": 0.0, "step": 267 }, { "epoch": 0.8965553290470963, "grad_norm": 6.255924290599069e-06, "learning_rate": 0.00016945407003538662, "loss": 0.0, "step": 268 }, { "epoch": 0.899900684752496, "grad_norm": 6.1228438426041976e-06, "learning_rate": 0.00016920742747491906, "loss": 0.0, "step": 269 }, { "epoch": 0.9032460404578956, "grad_norm": 6.345067959045991e-06, "learning_rate": 0.0001689599984829078, "loss": 0.0, "step": 270 }, { "epoch": 0.9065913961632952, "grad_norm": 6.835463409515796e-06, "learning_rate": 0.00016871178634586558, "loss": 0.0, "step": 271 }, { "epoch": 0.9099367518686948, "grad_norm": 6.422946171369404e-06, "learning_rate": 0.00016846279436070729, "loss": 0.0, "step": 272 }, { "epoch": 0.9132821075740944, "grad_norm": 6.158682026580209e-06, "learning_rate": 0.00016821302583470645, "loss": 0.0, "step": 273 }, { "epoch": 0.9166274632794941, "grad_norm": 6.219776423677104e-06, "learning_rate": 0.0001679624840854509, "loss": 0.0, "step": 274 }, { "epoch": 0.9199728189848936, "grad_norm": 7.286049822141649e-06, "learning_rate": 0.00016771117244079918, "loss": 0.0, "step": 275 }, { "epoch": 0.9199728189848936, "eval_loss": 3.1789141985427705e-08, "eval_runtime": 0.6972, "eval_samples_per_second": 71.714, "eval_steps_per_second": 18.646, "step": 275 }, { "epoch": 0.9233181746902932, "grad_norm": 7.206275768112391e-06, "learning_rate": 0.0001674590942388358, "loss": 0.0, "step": 276 }, { "epoch": 0.9266635303956928, "grad_norm": 6.634892542933812e-06, "learning_rate": 0.00016720625282782737, "loss": 0.0, "step": 277 }, { "epoch": 0.9300088861010924, "grad_norm": 6.141170615592273e-06, "learning_rate": 0.00016695265156617793, "loss": 0.0, "step": 278 }, { "epoch": 0.9333542418064921, "grad_norm": 6.318032319541089e-06, "learning_rate": 0.00016669829382238424, "loss": 0.0, "step": 279 }, { "epoch": 0.9366995975118917, "grad_norm": 6.153543836262543e-06, "learning_rate": 0.00016644318297499126, "loss": 0.0, "step": 280 }, { "epoch": 0.9400449532172913, "grad_norm": 5.963847797829658e-06, "learning_rate": 0.0001661873224125471, "loss": 0.0, "step": 281 }, { "epoch": 0.9433903089226909, "grad_norm": 6.171669610921526e-06, "learning_rate": 0.0001659307155335581, "loss": 0.0, "step": 282 }, { "epoch": 0.9467356646280906, "grad_norm": 6.437170213757781e-06, "learning_rate": 0.00016567336574644365, "loss": 0.0, "step": 283 }, { "epoch": 0.9500810203334902, "grad_norm": 6.470436346717179e-06, "learning_rate": 0.0001654152764694909, "loss": 0.0, "step": 284 }, { "epoch": 0.9534263760388898, "grad_norm": 6.304750968411099e-06, "learning_rate": 0.0001651564511308095, "loss": 0.0, "step": 285 }, { "epoch": 0.9567717317442894, "grad_norm": 6.059778570488561e-06, "learning_rate": 0.0001648968931682858, "loss": 0.0, "step": 286 }, { "epoch": 0.9601170874496889, "grad_norm": 6.769465471734293e-06, "learning_rate": 0.00016463660602953736, "loss": 0.0, "step": 287 }, { "epoch": 0.9634624431550886, "grad_norm": 6.5449926296423655e-06, "learning_rate": 0.00016437559317186725, "loss": 0.0, "step": 288 }, { "epoch": 0.9668077988604882, "grad_norm": 6.752244189556222e-06, "learning_rate": 0.00016411385806221795, "loss": 0.0, "step": 289 }, { "epoch": 0.9701531545658878, "grad_norm": 6.262746410357067e-06, "learning_rate": 0.00016385140417712527, "loss": 0.0, "step": 290 }, { "epoch": 0.9734985102712874, "grad_norm": 6.260275767999701e-06, "learning_rate": 0.0001635882350026724, "loss": 0.0, "step": 291 }, { "epoch": 0.976843865976687, "grad_norm": 6.173221208882751e-06, "learning_rate": 0.0001633243540344434, "loss": 0.0, "step": 292 }, { "epoch": 0.9801892216820867, "grad_norm": 5.703832357539795e-06, "learning_rate": 0.0001630597647774768, "loss": 0.0, "step": 293 }, { "epoch": 0.9835345773874863, "grad_norm": 5.890248303330736e-06, "learning_rate": 0.00016279447074621917, "loss": 0.0, "step": 294 }, { "epoch": 0.9868799330928859, "grad_norm": 6.308687716227723e-06, "learning_rate": 0.00016252847546447828, "loss": 0.0, "step": 295 }, { "epoch": 0.9902252887982855, "grad_norm": 6.428238066291669e-06, "learning_rate": 0.00016226178246537642, "loss": 0.0, "step": 296 }, { "epoch": 0.9935706445036852, "grad_norm": 6.214699624251807e-06, "learning_rate": 0.00016199439529130335, "loss": 0.0, "step": 297 }, { "epoch": 0.9969160002090848, "grad_norm": 6.178725925565232e-06, "learning_rate": 0.00016172631749386936, "loss": 0.0, "step": 298 }, { "epoch": 1.001149966023731, "grad_norm": 8.854520274326205e-06, "learning_rate": 0.00016145755263385808, "loss": 0.0, "step": 299 }, { "epoch": 1.0044953217291308, "grad_norm": 6.732758720318088e-06, "learning_rate": 0.00016118810428117909, "loss": 0.0, "step": 300 }, { "epoch": 1.0044953217291308, "eval_loss": 2.9404953139078316e-08, "eval_runtime": 0.6966, "eval_samples_per_second": 71.774, "eval_steps_per_second": 18.661, "step": 300 }, { "epoch": 1.0078406774345303, "grad_norm": 6.399338417395484e-06, "learning_rate": 0.00016091797601482056, "loss": 0.0, "step": 301 }, { "epoch": 1.01118603313993, "grad_norm": 6.112524260970531e-06, "learning_rate": 0.00016064717142280174, "loss": 0.0, "step": 302 }, { "epoch": 1.0145313888453296, "grad_norm": 6.190260137373116e-06, "learning_rate": 0.00016037569410212529, "loss": 0.0, "step": 303 }, { "epoch": 1.0178767445507293, "grad_norm": 5.951330422249157e-06, "learning_rate": 0.0001601035476587295, "loss": 0.0, "step": 304 }, { "epoch": 1.0212221002561288, "grad_norm": 5.760415206168545e-06, "learning_rate": 0.00015983073570744033, "loss": 0.0, "step": 305 }, { "epoch": 1.0245674559615283, "grad_norm": 6.1170244407549035e-06, "learning_rate": 0.00015955726187192348, "loss": 0.0, "step": 306 }, { "epoch": 1.027912811666928, "grad_norm": 6.2918843468651175e-06, "learning_rate": 0.0001592831297846362, "loss": 0.0, "step": 307 }, { "epoch": 1.0312581673723276, "grad_norm": 6.401286555046681e-06, "learning_rate": 0.00015900834308677915, "loss": 0.0, "step": 308 }, { "epoch": 1.0346035230777273, "grad_norm": 6.1624609770660754e-06, "learning_rate": 0.00015873290542824786, "loss": 0.0, "step": 309 }, { "epoch": 1.0379488787831268, "grad_norm": 6.015520739310887e-06, "learning_rate": 0.00015845682046758438, "loss": 0.0, "step": 310 }, { "epoch": 1.0412942344885265, "grad_norm": 6.757708433724474e-06, "learning_rate": 0.00015818009187192864, "loss": 0.0, "step": 311 }, { "epoch": 1.044639590193926, "grad_norm": 6.69797873342759e-06, "learning_rate": 0.0001579027233169698, "loss": 0.0, "step": 312 }, { "epoch": 1.0479849458993258, "grad_norm": 6.269850018725265e-06, "learning_rate": 0.00015762471848689726, "loss": 0.0, "step": 313 }, { "epoch": 1.0513303016047253, "grad_norm": 5.965060609014472e-06, "learning_rate": 0.00015734608107435198, "loss": 0.0, "step": 314 }, { "epoch": 1.054675657310125, "grad_norm": 6.1035466387693305e-06, "learning_rate": 0.00015706681478037718, "loss": 0.0, "step": 315 }, { "epoch": 1.0580210130155245, "grad_norm": 6.107516128395218e-06, "learning_rate": 0.00015678692331436934, "loss": 0.0, "step": 316 }, { "epoch": 1.061366368720924, "grad_norm": 5.722908099414781e-06, "learning_rate": 0.00015650641039402884, "loss": 0.0, "step": 317 }, { "epoch": 1.0647117244263238, "grad_norm": 6.076377303543268e-06, "learning_rate": 0.00015622527974531073, "loss": 0.0, "step": 318 }, { "epoch": 1.0680570801317233, "grad_norm": 6.243446023290744e-06, "learning_rate": 0.000155943535102375, "loss": 0.0, "step": 319 }, { "epoch": 1.071402435837123, "grad_norm": 6.261637281568255e-06, "learning_rate": 0.00015566118020753718, "loss": 0.0, "step": 320 }, { "epoch": 1.0747477915425225, "grad_norm": 6.09951439400902e-06, "learning_rate": 0.00015537821881121854, "loss": 0.0, "step": 321 }, { "epoch": 1.0780931472479223, "grad_norm": 6.236603894649306e-06, "learning_rate": 0.00015509465467189633, "loss": 0.0, "step": 322 }, { "epoch": 1.0814385029533218, "grad_norm": 6.094633135944605e-06, "learning_rate": 0.00015481049155605377, "loss": 0.0, "step": 323 }, { "epoch": 1.0847838586587215, "grad_norm": 6.641942945861956e-06, "learning_rate": 0.00015452573323813015, "loss": 0.0, "step": 324 }, { "epoch": 1.088129214364121, "grad_norm": 6.2910153246775735e-06, "learning_rate": 0.0001542403835004705, "loss": 0.0, "step": 325 }, { "epoch": 1.088129214364121, "eval_loss": 2.2517308906344624e-08, "eval_runtime": 0.6966, "eval_samples_per_second": 71.772, "eval_steps_per_second": 18.661, "step": 325 }, { "epoch": 1.0914745700695208, "grad_norm": 6.234090051293606e-06, "learning_rate": 0.00015395444613327562, "loss": 0.0, "step": 326 }, { "epoch": 1.0948199257749203, "grad_norm": 5.963534931652248e-06, "learning_rate": 0.00015366792493455147, "loss": 0.0, "step": 327 }, { "epoch": 1.09816528148032, "grad_norm": 5.969760422885884e-06, "learning_rate": 0.00015338082371005895, "loss": 0.0, "step": 328 }, { "epoch": 1.1015106371857195, "grad_norm": 5.873356258234708e-06, "learning_rate": 0.00015309314627326307, "loss": 0.0, "step": 329 }, { "epoch": 1.1048559928911192, "grad_norm": 5.8633954722608905e-06, "learning_rate": 0.00015280489644528265, "loss": 0.0, "step": 330 }, { "epoch": 1.1082013485965188, "grad_norm": 6.042057520971866e-06, "learning_rate": 0.00015251607805483929, "loss": 0.0, "step": 331 }, { "epoch": 1.1115467043019183, "grad_norm": 6.330176347546512e-06, "learning_rate": 0.0001522266949382066, "loss": 0.0, "step": 332 }, { "epoch": 1.114892060007318, "grad_norm": 6.0213574215595145e-06, "learning_rate": 0.00015193675093915927, "loss": 0.0, "step": 333 }, { "epoch": 1.1182374157127175, "grad_norm": 6.2009839894017205e-06, "learning_rate": 0.00015164624990892203, "loss": 0.0, "step": 334 }, { "epoch": 1.1215827714181172, "grad_norm": 5.93813774685259e-06, "learning_rate": 0.00015135519570611835, "loss": 0.0, "step": 335 }, { "epoch": 1.1249281271235168, "grad_norm": 6.523011961689917e-06, "learning_rate": 0.00015106359219671945, "loss": 0.0, "step": 336 }, { "epoch": 1.1282734828289165, "grad_norm": 6.297024356172187e-06, "learning_rate": 0.00015077144325399266, "loss": 0.0, "step": 337 }, { "epoch": 1.131618838534316, "grad_norm": 6.139112883829512e-06, "learning_rate": 0.00015047875275845025, "loss": 0.0, "step": 338 }, { "epoch": 1.1349641942397157, "grad_norm": 5.81321955905878e-06, "learning_rate": 0.00015018552459779755, "loss": 0.0, "step": 339 }, { "epoch": 1.1383095499451152, "grad_norm": 6.023450623615645e-06, "learning_rate": 0.00014989176266688175, "loss": 0.0, "step": 340 }, { "epoch": 1.141654905650515, "grad_norm": 5.928295195190003e-06, "learning_rate": 0.00014959747086763968, "loss": 0.0, "step": 341 }, { "epoch": 1.1450002613559145, "grad_norm": 5.596015853370773e-06, "learning_rate": 0.00014930265310904642, "loss": 0.0, "step": 342 }, { "epoch": 1.148345617061314, "grad_norm": 5.865449111297494e-06, "learning_rate": 0.0001490073133070631, "loss": 0.0, "step": 343 }, { "epoch": 1.1516909727667137, "grad_norm": 6.43144949208363e-06, "learning_rate": 0.00014871145538458493, "loss": 0.0, "step": 344 }, { "epoch": 1.1550363284721132, "grad_norm": 6.355532605084591e-06, "learning_rate": 0.0001484150832713892, "loss": 0.0, "step": 345 }, { "epoch": 1.158381684177513, "grad_norm": 6.2208705458033364e-06, "learning_rate": 0.00014811820090408306, "loss": 0.0, "step": 346 }, { "epoch": 1.1617270398829125, "grad_norm": 6.019817192282062e-06, "learning_rate": 0.00014782081222605104, "loss": 0.0, "step": 347 }, { "epoch": 1.1650723955883122, "grad_norm": 6.237503839656711e-06, "learning_rate": 0.00014752292118740292, "loss": 0.0, "step": 348 }, { "epoch": 1.1684177512937117, "grad_norm": 6.180914624565048e-06, "learning_rate": 0.00014722453174492118, "loss": 0.0, "step": 349 }, { "epoch": 1.1717631069991115, "grad_norm": 6.202710665093036e-06, "learning_rate": 0.0001469256478620083, "loss": 0.0, "step": 350 }, { "epoch": 1.1717631069991115, "eval_loss": 1.9603303869075717e-08, "eval_runtime": 0.696, "eval_samples_per_second": 71.844, "eval_steps_per_second": 18.68, "step": 350 }, { "epoch": 1.175108462704511, "grad_norm": 6.101056442275876e-06, "learning_rate": 0.0001466262735086344, "loss": 0.0, "step": 351 }, { "epoch": 1.1784538184099107, "grad_norm": 5.789248461951502e-06, "learning_rate": 0.00014632641266128428, "loss": 0.0, "step": 352 }, { "epoch": 1.1817991741153102, "grad_norm": 5.922979653405491e-06, "learning_rate": 0.00014602606930290456, "loss": 0.0, "step": 353 }, { "epoch": 1.1851445298207097, "grad_norm": 5.770634743385017e-06, "learning_rate": 0.0001457252474228511, "loss": 0.0, "step": 354 }, { "epoch": 1.1884898855261095, "grad_norm": 5.891377441002987e-06, "learning_rate": 0.00014542395101683561, "loss": 0.0, "step": 355 }, { "epoch": 1.1918352412315092, "grad_norm": 5.779205366707174e-06, "learning_rate": 0.00014512218408687286, "loss": 0.0, "step": 356 }, { "epoch": 1.1951805969369087, "grad_norm": 6.2929498199082445e-06, "learning_rate": 0.0001448199506412274, "loss": 0.0, "step": 357 }, { "epoch": 1.1985259526423082, "grad_norm": 5.800653980259085e-06, "learning_rate": 0.00014451725469436037, "loss": 0.0, "step": 358 }, { "epoch": 1.201871308347708, "grad_norm": 6.026995379215805e-06, "learning_rate": 0.00014421410026687609, "loss": 0.0, "step": 359 }, { "epoch": 1.2052166640531075, "grad_norm": 6.040312655386515e-06, "learning_rate": 0.00014391049138546872, "loss": 0.0, "step": 360 }, { "epoch": 1.2085620197585072, "grad_norm": 6.384302650985774e-06, "learning_rate": 0.00014360643208286887, "loss": 0.0, "step": 361 }, { "epoch": 1.2119073754639067, "grad_norm": 6.054286586731905e-06, "learning_rate": 0.00014330192639778986, "loss": 0.0, "step": 362 }, { "epoch": 1.2152527311693064, "grad_norm": 5.83691962674493e-06, "learning_rate": 0.00014299697837487414, "loss": 0.0, "step": 363 }, { "epoch": 1.218598086874706, "grad_norm": 5.789472197648138e-06, "learning_rate": 0.0001426915920646396, "loss": 0.0, "step": 364 }, { "epoch": 1.2219434425801057, "grad_norm": 6.030274562363047e-06, "learning_rate": 0.0001423857715234258, "loss": 0.0, "step": 365 }, { "epoch": 1.2252887982855052, "grad_norm": 6.083088010200299e-06, "learning_rate": 0.00014207952081333992, "loss": 0.0, "step": 366 }, { "epoch": 1.228634153990905, "grad_norm": 5.530882390303304e-06, "learning_rate": 0.00014177284400220306, "loss": 0.0, "step": 367 }, { "epoch": 1.2319795096963044, "grad_norm": 5.863201749889413e-06, "learning_rate": 0.00014146574516349595, "loss": 0.0, "step": 368 }, { "epoch": 1.235324865401704, "grad_norm": 6.111798938945867e-06, "learning_rate": 0.000141158228376305, "loss": 0.0, "step": 369 }, { "epoch": 1.2386702211071037, "grad_norm": 6.045273494237335e-06, "learning_rate": 0.00014085029772526814, "loss": 0.0, "step": 370 }, { "epoch": 1.2420155768125032, "grad_norm": 6.002165719110053e-06, "learning_rate": 0.0001405419573005205, "loss": 0.0, "step": 371 }, { "epoch": 1.245360932517903, "grad_norm": 6.039120307832491e-06, "learning_rate": 0.00014023321119764002, "loss": 0.0, "step": 372 }, { "epoch": 1.2487062882233024, "grad_norm": 6.003551789035555e-06, "learning_rate": 0.0001399240635175932, "loss": 0.0, "step": 373 }, { "epoch": 1.2520516439287022, "grad_norm": 5.915948804613436e-06, "learning_rate": 0.00013961451836668043, "loss": 0.0, "step": 374 }, { "epoch": 1.2553969996341017, "grad_norm": 5.859969860466663e-06, "learning_rate": 0.00013930457985648168, "loss": 0.0, "step": 375 }, { "epoch": 1.2553969996341017, "eval_loss": 1.7748938319073204e-08, "eval_runtime": 0.6961, "eval_samples_per_second": 71.824, "eval_steps_per_second": 18.674, "step": 375 }, { "epoch": 1.2587423553395014, "grad_norm": 5.691527803719509e-06, "learning_rate": 0.00013899425210380176, "loss": 0.0, "step": 376 }, { "epoch": 1.262087711044901, "grad_norm": 5.914226676395629e-06, "learning_rate": 0.00013868353923061563, "loss": 0.0, "step": 377 }, { "epoch": 1.2654330667503007, "grad_norm": 5.989952114759944e-06, "learning_rate": 0.0001383724453640137, "loss": 0.0, "step": 378 }, { "epoch": 1.2687784224557002, "grad_norm": 5.64041874895338e-06, "learning_rate": 0.00013806097463614692, "loss": 0.0, "step": 379 }, { "epoch": 1.2721237781610997, "grad_norm": 6.035153546690708e-06, "learning_rate": 0.00013774913118417195, "loss": 0.0, "step": 380 }, { "epoch": 1.2754691338664994, "grad_norm": 5.875488113815663e-06, "learning_rate": 0.0001374369191501963, "loss": 0.0, "step": 381 }, { "epoch": 1.2788144895718991, "grad_norm": 6.396473509084899e-06, "learning_rate": 0.00013712434268122324, "loss": 0.0, "step": 382 }, { "epoch": 1.2821598452772986, "grad_norm": 5.975215117359767e-06, "learning_rate": 0.00013681140592909652, "loss": 0.0, "step": 383 }, { "epoch": 1.2855052009826982, "grad_norm": 5.922964191995561e-06, "learning_rate": 0.00013649811305044558, "loss": 0.0, "step": 384 }, { "epoch": 1.288850556688098, "grad_norm": 5.88319653616054e-06, "learning_rate": 0.00013618446820663015, "loss": 0.0, "step": 385 }, { "epoch": 1.2921959123934974, "grad_norm": 6.589156328118406e-06, "learning_rate": 0.00013587047556368493, "loss": 0.0, "step": 386 }, { "epoch": 1.2955412680988971, "grad_norm": 5.7994789131043945e-06, "learning_rate": 0.00013555613929226433, "loss": 0.0, "step": 387 }, { "epoch": 1.2988866238042966, "grad_norm": 5.76379807171179e-06, "learning_rate": 0.00013524146356758704, "loss": 0.0, "step": 388 }, { "epoch": 1.3022319795096964, "grad_norm": 5.537129709409783e-06, "learning_rate": 0.00013492645256938068, "loss": 0.0, "step": 389 }, { "epoch": 1.305577335215096, "grad_norm": 5.805138698633527e-06, "learning_rate": 0.00013461111048182608, "loss": 0.0, "step": 390 }, { "epoch": 1.3089226909204954, "grad_norm": 5.993603735987563e-06, "learning_rate": 0.00013429544149350187, "loss": 0.0, "step": 391 }, { "epoch": 1.3122680466258951, "grad_norm": 5.531137048819801e-06, "learning_rate": 0.00013397944979732872, "loss": 0.0, "step": 392 }, { "epoch": 1.3156134023312949, "grad_norm": 5.714673534384929e-06, "learning_rate": 0.00013366313959051383, "loss": 0.0, "step": 393 }, { "epoch": 1.3189587580366944, "grad_norm": 5.987948497931939e-06, "learning_rate": 0.000133346515074495, "loss": 0.0, "step": 394 }, { "epoch": 1.322304113742094, "grad_norm": 6.1729906519758515e-06, "learning_rate": 0.00013302958045488493, "loss": 0.0, "step": 395 }, { "epoch": 1.3256494694474936, "grad_norm": 5.74833165956079e-06, "learning_rate": 0.00013271233994141516, "loss": 0.0, "step": 396 }, { "epoch": 1.3289948251528931, "grad_norm": 5.835729098180309e-06, "learning_rate": 0.0001323947977478806, "loss": 0.0, "step": 397 }, { "epoch": 1.3323401808582929, "grad_norm": 5.932717158430023e-06, "learning_rate": 0.00013207695809208295, "loss": 0.0, "step": 398 }, { "epoch": 1.3356855365636924, "grad_norm": 6.128618224465754e-06, "learning_rate": 0.00013175882519577526, "loss": 0.0, "step": 399 }, { "epoch": 1.339030892269092, "grad_norm": 5.671974122378742e-06, "learning_rate": 0.00013144040328460545, "loss": 0.0, "step": 400 }, { "epoch": 1.339030892269092, "eval_loss": 1.7484028447256605e-08, "eval_runtime": 0.6964, "eval_samples_per_second": 71.794, "eval_steps_per_second": 18.666, "step": 400 } ], "logging_steps": 1, "max_steps": 897, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.518464884165837e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }