diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6711 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 95491, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.001047219109654313, + "grad_norm": 6.445349216461182, + "learning_rate": 5.238344683080147e-08, + "loss": 1.2293, + "step": 100 + }, + { + "epoch": 0.002094438219308626, + "grad_norm": 7.579877853393555, + "learning_rate": 1.0476689366160294e-07, + "loss": 1.2053, + "step": 200 + }, + { + "epoch": 0.003141657328962939, + "grad_norm": 5.277140140533447, + "learning_rate": 1.5715034049240438e-07, + "loss": 1.1086, + "step": 300 + }, + { + "epoch": 0.004188876438617252, + "grad_norm": 3.0632076263427734, + "learning_rate": 2.0953378732320588e-07, + "loss": 1.0615, + "step": 400 + }, + { + "epoch": 0.005236095548271565, + "grad_norm": 8.091245651245117, + "learning_rate": 2.6191723415400735e-07, + "loss": 0.9659, + "step": 500 + }, + { + "epoch": 0.006283314657925878, + "grad_norm": 2.5814743041992188, + "learning_rate": 3.1430068098480877e-07, + "loss": 0.9656, + "step": 600 + }, + { + "epoch": 0.007330533767580191, + "grad_norm": 8.025248527526855, + "learning_rate": 3.6668412781561024e-07, + "loss": 0.9068, + "step": 700 + }, + { + "epoch": 0.008377752877234504, + "grad_norm": 2.8023085594177246, + "learning_rate": 4.1906757464641176e-07, + "loss": 0.8278, + "step": 800 + }, + { + "epoch": 0.009424971986888816, + "grad_norm": 2.9815306663513184, + "learning_rate": 4.714510214772132e-07, + "loss": 0.8097, + "step": 900 + }, + { + "epoch": 0.01047219109654313, + "grad_norm": 4.450624465942383, + "learning_rate": 5.238344683080147e-07, + "loss": 0.8611, + "step": 1000 + }, + { + "epoch": 0.011519410206197442, + "grad_norm": 2.9705615043640137, + "learning_rate": 5.762179151388162e-07, + "loss": 0.8217, + "step": 1100 + }, + { + "epoch": 0.012566629315851756, + "grad_norm": 5.060612678527832, + "learning_rate": 6.286013619696175e-07, + "loss": 0.8326, + "step": 1200 + }, + { + "epoch": 0.01361384842550607, + "grad_norm": 4.002683639526367, + "learning_rate": 6.809848088004191e-07, + "loss": 0.7742, + "step": 1300 + }, + { + "epoch": 0.014661067535160381, + "grad_norm": 3.3899588584899902, + "learning_rate": 7.333682556312205e-07, + "loss": 0.7594, + "step": 1400 + }, + { + "epoch": 0.015708286644814693, + "grad_norm": 4.091441631317139, + "learning_rate": 7.857517024620219e-07, + "loss": 0.7871, + "step": 1500 + }, + { + "epoch": 0.01675550575446901, + "grad_norm": 3.302689790725708, + "learning_rate": 8.381351492928235e-07, + "loss": 0.721, + "step": 1600 + }, + { + "epoch": 0.01780272486412332, + "grad_norm": 3.8457956314086914, + "learning_rate": 8.905185961236249e-07, + "loss": 0.6789, + "step": 1700 + }, + { + "epoch": 0.018849943973777632, + "grad_norm": 3.763422727584839, + "learning_rate": 9.429020429544264e-07, + "loss": 0.7035, + "step": 1800 + }, + { + "epoch": 0.019897163083431948, + "grad_norm": 2.3855648040771484, + "learning_rate": 9.95285489785228e-07, + "loss": 0.7331, + "step": 1900 + }, + { + "epoch": 0.02094438219308626, + "grad_norm": 3.0932857990264893, + "learning_rate": 9.999976668774249e-07, + "loss": 0.7123, + "step": 2000 + }, + { + "epoch": 0.02199160130274057, + "grad_norm": 2.939152956008911, + "learning_rate": 9.999897217221058e-07, + "loss": 0.6106, + "step": 2100 + }, + { + 
"epoch": 0.023038820412394884, + "grad_norm": 2.148160934448242, + "learning_rate": 9.999761418022958e-07, + "loss": 0.6828, + "step": 2200 + }, + { + "epoch": 0.0240860395220492, + "grad_norm": 2.302873134613037, + "learning_rate": 9.999569272710377e-07, + "loss": 0.6691, + "step": 2300 + }, + { + "epoch": 0.02513325863170351, + "grad_norm": 4.346377372741699, + "learning_rate": 9.999320783448744e-07, + "loss": 0.6698, + "step": 2400 + }, + { + "epoch": 0.026180477741357823, + "grad_norm": 2.157055616378784, + "learning_rate": 9.999015953038474e-07, + "loss": 0.6019, + "step": 2500 + }, + { + "epoch": 0.02722769685101214, + "grad_norm": 2.7303714752197266, + "learning_rate": 9.998654784914935e-07, + "loss": 0.5972, + "step": 2600 + }, + { + "epoch": 0.02827491596066645, + "grad_norm": 4.359681606292725, + "learning_rate": 9.9982372831484e-07, + "loss": 0.6381, + "step": 2700 + }, + { + "epoch": 0.029322135070320762, + "grad_norm": 3.2993288040161133, + "learning_rate": 9.997763452444018e-07, + "loss": 0.6093, + "step": 2800 + }, + { + "epoch": 0.030369354179975078, + "grad_norm": 3.061521053314209, + "learning_rate": 9.99723329814175e-07, + "loss": 0.6875, + "step": 2900 + }, + { + "epoch": 0.031416573289629386, + "grad_norm": 2.3765642642974854, + "learning_rate": 9.996646826216302e-07, + "loss": 0.6031, + "step": 3000 + }, + { + "epoch": 0.0324637923992837, + "grad_norm": 2.144615411758423, + "learning_rate": 9.996004043277078e-07, + "loss": 0.637, + "step": 3100 + }, + { + "epoch": 0.03351101150893802, + "grad_norm": 3.2836430072784424, + "learning_rate": 9.995304956568083e-07, + "loss": 0.6425, + "step": 3200 + }, + { + "epoch": 0.034558230618592325, + "grad_norm": 2.8710663318634033, + "learning_rate": 9.99454957396786e-07, + "loss": 0.6199, + "step": 3300 + }, + { + "epoch": 0.03560544972824664, + "grad_norm": 2.5998404026031494, + "learning_rate": 9.993737903989387e-07, + "loss": 0.5903, + "step": 3400 + }, + { + "epoch": 0.036652668837900956, + "grad_norm": 2.677945613861084, + "learning_rate": 9.992869955779995e-07, + "loss": 0.6473, + "step": 3500 + }, + { + "epoch": 0.037699887947555265, + "grad_norm": 3.9936769008636475, + "learning_rate": 9.991945739121251e-07, + "loss": 0.5847, + "step": 3600 + }, + { + "epoch": 0.03874710705720958, + "grad_norm": 2.839268207550049, + "learning_rate": 9.990965264428851e-07, + "loss": 0.5893, + "step": 3700 + }, + { + "epoch": 0.039794326166863896, + "grad_norm": 2.4763646125793457, + "learning_rate": 9.989928542752516e-07, + "loss": 0.5865, + "step": 3800 + }, + { + "epoch": 0.040841545276518204, + "grad_norm": 4.822995662689209, + "learning_rate": 9.98883558577585e-07, + "loss": 0.579, + "step": 3900 + }, + { + "epoch": 0.04188876438617252, + "grad_norm": 2.6188089847564697, + "learning_rate": 9.987686405816216e-07, + "loss": 0.6065, + "step": 4000 + }, + { + "epoch": 0.042935983495826835, + "grad_norm": 2.550874710083008, + "learning_rate": 9.986481015824592e-07, + "loss": 0.5911, + "step": 4100 + }, + { + "epoch": 0.04398320260548114, + "grad_norm": 2.973268985748291, + "learning_rate": 9.985219429385443e-07, + "loss": 0.6216, + "step": 4200 + }, + { + "epoch": 0.04503042171513546, + "grad_norm": 6.536316394805908, + "learning_rate": 9.98390166071654e-07, + "loss": 0.5904, + "step": 4300 + }, + { + "epoch": 0.04607764082478977, + "grad_norm": 2.6079025268554688, + "learning_rate": 9.982527724668825e-07, + "loss": 0.5942, + "step": 4400 + }, + { + "epoch": 0.04712485993444408, + "grad_norm": 2.2787749767303467, + "learning_rate": 
9.981097636726227e-07, + "loss": 0.6174, + "step": 4500 + }, + { + "epoch": 0.0481720790440984, + "grad_norm": 1.995902419090271, + "learning_rate": 9.979611413005493e-07, + "loss": 0.5698, + "step": 4600 + }, + { + "epoch": 0.04921929815375271, + "grad_norm": 3.4670004844665527, + "learning_rate": 9.97806907025601e-07, + "loss": 0.5871, + "step": 4700 + }, + { + "epoch": 0.05026651726340702, + "grad_norm": 2.329735279083252, + "learning_rate": 9.97647062585961e-07, + "loss": 0.6061, + "step": 4800 + }, + { + "epoch": 0.05131373637306134, + "grad_norm": 2.4299092292785645, + "learning_rate": 9.97481609783038e-07, + "loss": 0.5944, + "step": 4900 + }, + { + "epoch": 0.052360955482715646, + "grad_norm": 4.186954498291016, + "learning_rate": 9.973105504814458e-07, + "loss": 0.6131, + "step": 5000 + }, + { + "epoch": 0.05340817459236996, + "grad_norm": 2.038557767868042, + "learning_rate": 9.971338866089812e-07, + "loss": 0.5668, + "step": 5100 + }, + { + "epoch": 0.05445539370202428, + "grad_norm": 2.6505930423736572, + "learning_rate": 9.96951620156604e-07, + "loss": 0.5697, + "step": 5200 + }, + { + "epoch": 0.055502612811678585, + "grad_norm": 3.494474411010742, + "learning_rate": 9.967637531784138e-07, + "loss": 0.6061, + "step": 5300 + }, + { + "epoch": 0.0565498319213329, + "grad_norm": 1.573089599609375, + "learning_rate": 9.965702877916262e-07, + "loss": 0.5714, + "step": 5400 + }, + { + "epoch": 0.057597051030987216, + "grad_norm": 3.103743553161621, + "learning_rate": 9.963712261765495e-07, + "loss": 0.6045, + "step": 5500 + }, + { + "epoch": 0.058644270140641525, + "grad_norm": 2.182767152786255, + "learning_rate": 9.96166570576561e-07, + "loss": 0.6209, + "step": 5600 + }, + { + "epoch": 0.05969148925029584, + "grad_norm": 2.818512439727783, + "learning_rate": 9.959563232980801e-07, + "loss": 0.5825, + "step": 5700 + }, + { + "epoch": 0.060738708359950155, + "grad_norm": 6.24643611907959, + "learning_rate": 9.957404867105435e-07, + "loss": 0.5645, + "step": 5800 + }, + { + "epoch": 0.061785927469604464, + "grad_norm": 2.866800308227539, + "learning_rate": 9.955190632463774e-07, + "loss": 0.5826, + "step": 5900 + }, + { + "epoch": 0.06283314657925877, + "grad_norm": 1.9323431253433228, + "learning_rate": 9.952920554009715e-07, + "loss": 0.5706, + "step": 6000 + }, + { + "epoch": 0.06388036568891309, + "grad_norm": 2.389801263809204, + "learning_rate": 9.9505946573265e-07, + "loss": 0.5888, + "step": 6100 + }, + { + "epoch": 0.0649275847985674, + "grad_norm": 2.6937005519866943, + "learning_rate": 9.948212968626429e-07, + "loss": 0.5848, + "step": 6200 + }, + { + "epoch": 0.06597480390822172, + "grad_norm": 3.2649362087249756, + "learning_rate": 9.945775514750558e-07, + "loss": 0.5746, + "step": 6300 + }, + { + "epoch": 0.06702202301787603, + "grad_norm": 3.9703376293182373, + "learning_rate": 9.943282323168416e-07, + "loss": 0.5219, + "step": 6400 + }, + { + "epoch": 0.06806924212753035, + "grad_norm": 3.0078823566436768, + "learning_rate": 9.94073342197767e-07, + "loss": 0.5867, + "step": 6500 + }, + { + "epoch": 0.06911646123718465, + "grad_norm": 2.0793182849884033, + "learning_rate": 9.938128839903829e-07, + "loss": 0.5757, + "step": 6600 + }, + { + "epoch": 0.07016368034683897, + "grad_norm": 1.7143627405166626, + "learning_rate": 9.935468606299908e-07, + "loss": 0.5753, + "step": 6700 + }, + { + "epoch": 0.07121089945649328, + "grad_norm": 1.6375339031219482, + "learning_rate": 9.932752751146102e-07, + "loss": 0.5875, + "step": 6800 + }, + { + "epoch": 0.0722581185661476, + 
"grad_norm": 3.0804569721221924, + "learning_rate": 9.929981305049452e-07, + "loss": 0.5399, + "step": 6900 + }, + { + "epoch": 0.07330533767580191, + "grad_norm": 1.8709744215011597, + "learning_rate": 9.92715429924349e-07, + "loss": 0.5555, + "step": 7000 + }, + { + "epoch": 0.07435255678545621, + "grad_norm": 2.213629722595215, + "learning_rate": 9.924271765587897e-07, + "loss": 0.5536, + "step": 7100 + }, + { + "epoch": 0.07539977589511053, + "grad_norm": 1.5812900066375732, + "learning_rate": 9.921333736568133e-07, + "loss": 0.5973, + "step": 7200 + }, + { + "epoch": 0.07644699500476484, + "grad_norm": 1.2580069303512573, + "learning_rate": 9.918340245295086e-07, + "loss": 0.549, + "step": 7300 + }, + { + "epoch": 0.07749421411441916, + "grad_norm": 3.4917242527008057, + "learning_rate": 9.915291325504685e-07, + "loss": 0.5493, + "step": 7400 + }, + { + "epoch": 0.07854143322407348, + "grad_norm": 3.6106157302856445, + "learning_rate": 9.912187011557523e-07, + "loss": 0.5367, + "step": 7500 + }, + { + "epoch": 0.07958865233372779, + "grad_norm": 2.585413694381714, + "learning_rate": 9.90902733843848e-07, + "loss": 0.5242, + "step": 7600 + }, + { + "epoch": 0.08063587144338209, + "grad_norm": 2.1417288780212402, + "learning_rate": 9.905812341756314e-07, + "loss": 0.5657, + "step": 7700 + }, + { + "epoch": 0.08168309055303641, + "grad_norm": 2.6701626777648926, + "learning_rate": 9.902542057743267e-07, + "loss": 0.533, + "step": 7800 + }, + { + "epoch": 0.08273030966269072, + "grad_norm": 2.7961204051971436, + "learning_rate": 9.899216523254657e-07, + "loss": 0.5833, + "step": 7900 + }, + { + "epoch": 0.08377752877234504, + "grad_norm": 3.9673585891723633, + "learning_rate": 9.895835775768464e-07, + "loss": 0.5548, + "step": 8000 + }, + { + "epoch": 0.08482474788199935, + "grad_norm": 2.384716272354126, + "learning_rate": 9.892399853384903e-07, + "loss": 0.5802, + "step": 8100 + }, + { + "epoch": 0.08587196699165367, + "grad_norm": 2.7740979194641113, + "learning_rate": 9.888908794825994e-07, + "loss": 0.5565, + "step": 8200 + }, + { + "epoch": 0.08691918610130797, + "grad_norm": 2.4571990966796875, + "learning_rate": 9.885362639435133e-07, + "loss": 0.5538, + "step": 8300 + }, + { + "epoch": 0.08796640521096229, + "grad_norm": 2.063465118408203, + "learning_rate": 9.88176142717664e-07, + "loss": 0.603, + "step": 8400 + }, + { + "epoch": 0.0890136243206166, + "grad_norm": 1.9801498651504517, + "learning_rate": 9.878105198635321e-07, + "loss": 0.5479, + "step": 8500 + }, + { + "epoch": 0.09006084343027092, + "grad_norm": 2.044619083404541, + "learning_rate": 9.87439399501599e-07, + "loss": 0.5446, + "step": 8600 + }, + { + "epoch": 0.09110806253992523, + "grad_norm": 2.573242664337158, + "learning_rate": 9.87062785814303e-07, + "loss": 0.5347, + "step": 8700 + }, + { + "epoch": 0.09215528164957953, + "grad_norm": 2.520949125289917, + "learning_rate": 9.866806830459898e-07, + "loss": 0.5467, + "step": 8800 + }, + { + "epoch": 0.09320250075923385, + "grad_norm": 2.924830913543701, + "learning_rate": 9.86293095502866e-07, + "loss": 0.5187, + "step": 8900 + }, + { + "epoch": 0.09424971986888817, + "grad_norm": 2.2049362659454346, + "learning_rate": 9.859000275529507e-07, + "loss": 0.5549, + "step": 9000 + }, + { + "epoch": 0.09529693897854248, + "grad_norm": 2.932223320007324, + "learning_rate": 9.855014836260256e-07, + "loss": 0.5723, + "step": 9100 + }, + { + "epoch": 0.0963441580881968, + "grad_norm": 2.659306526184082, + "learning_rate": 9.850974682135855e-07, + "loss": 0.5471, + "step": 
9200 + }, + { + "epoch": 0.09739137719785111, + "grad_norm": 3.1078333854675293, + "learning_rate": 9.84687985868787e-07, + "loss": 0.5498, + "step": 9300 + }, + { + "epoch": 0.09843859630750541, + "grad_norm": 2.73991322517395, + "learning_rate": 9.842730412063984e-07, + "loss": 0.5509, + "step": 9400 + }, + { + "epoch": 0.09948581541715973, + "grad_norm": 2.288360595703125, + "learning_rate": 9.83852638902747e-07, + "loss": 0.5311, + "step": 9500 + }, + { + "epoch": 0.10053303452681404, + "grad_norm": 2.391042947769165, + "learning_rate": 9.834267836956652e-07, + "loss": 0.569, + "step": 9600 + }, + { + "epoch": 0.10158025363646836, + "grad_norm": 2.225496292114258, + "learning_rate": 9.829954803844404e-07, + "loss": 0.5432, + "step": 9700 + }, + { + "epoch": 0.10262747274612267, + "grad_norm": 1.877164363861084, + "learning_rate": 9.82558733829757e-07, + "loss": 0.5795, + "step": 9800 + }, + { + "epoch": 0.10367469185577699, + "grad_norm": 2.455549478530884, + "learning_rate": 9.82116548953644e-07, + "loss": 0.577, + "step": 9900 + }, + { + "epoch": 0.10472191096543129, + "grad_norm": 3.1859889030456543, + "learning_rate": 9.816689307394198e-07, + "loss": 0.5742, + "step": 10000 + }, + { + "epoch": 0.10576913007508561, + "grad_norm": 2.9405317306518555, + "learning_rate": 9.812158842316341e-07, + "loss": 0.5674, + "step": 10100 + }, + { + "epoch": 0.10681634918473992, + "grad_norm": 2.1740851402282715, + "learning_rate": 9.807574145360125e-07, + "loss": 0.5219, + "step": 10200 + }, + { + "epoch": 0.10786356829439424, + "grad_norm": 2.1551525592803955, + "learning_rate": 9.80293526819399e-07, + "loss": 0.5378, + "step": 10300 + }, + { + "epoch": 0.10891078740404855, + "grad_norm": 1.479442834854126, + "learning_rate": 9.798242263096968e-07, + "loss": 0.5137, + "step": 10400 + }, + { + "epoch": 0.10995800651370287, + "grad_norm": 2.2272469997406006, + "learning_rate": 9.793495182958107e-07, + "loss": 0.5469, + "step": 10500 + }, + { + "epoch": 0.11100522562335717, + "grad_norm": 1.9610800743103027, + "learning_rate": 9.78869408127586e-07, + "loss": 0.5685, + "step": 10600 + }, + { + "epoch": 0.11205244473301149, + "grad_norm": 2.2086081504821777, + "learning_rate": 9.7838390121575e-07, + "loss": 0.5505, + "step": 10700 + }, + { + "epoch": 0.1130996638426658, + "grad_norm": 3.1201093196868896, + "learning_rate": 9.778930030318488e-07, + "loss": 0.5829, + "step": 10800 + }, + { + "epoch": 0.11414688295232012, + "grad_norm": 2.6629204750061035, + "learning_rate": 9.773967191081875e-07, + "loss": 0.5925, + "step": 10900 + }, + { + "epoch": 0.11519410206197443, + "grad_norm": 2.593073844909668, + "learning_rate": 9.768950550377674e-07, + "loss": 0.572, + "step": 11000 + }, + { + "epoch": 0.11624132117162873, + "grad_norm": 4.5134687423706055, + "learning_rate": 9.763880164742224e-07, + "loss": 0.5106, + "step": 11100 + }, + { + "epoch": 0.11728854028128305, + "grad_norm": 3.3710708618164062, + "learning_rate": 9.758756091317557e-07, + "loss": 0.567, + "step": 11200 + }, + { + "epoch": 0.11833575939093736, + "grad_norm": 3.414686679840088, + "learning_rate": 9.753578387850754e-07, + "loss": 0.578, + "step": 11300 + }, + { + "epoch": 0.11938297850059168, + "grad_norm": 2.6787045001983643, + "learning_rate": 9.748347112693294e-07, + "loss": 0.5587, + "step": 11400 + }, + { + "epoch": 0.120430197610246, + "grad_norm": 2.505725860595703, + "learning_rate": 9.743062324800395e-07, + "loss": 0.5513, + "step": 11500 + }, + { + "epoch": 0.12147741671990031, + "grad_norm": 2.5358970165252686, + 
"learning_rate": 9.737724083730354e-07, + "loss": 0.5378, + "step": 11600 + }, + { + "epoch": 0.12252463582955461, + "grad_norm": 1.6748542785644531, + "learning_rate": 9.732332449643868e-07, + "loss": 0.5062, + "step": 11700 + }, + { + "epoch": 0.12357185493920893, + "grad_norm": 2.4574966430664062, + "learning_rate": 9.726887483303364e-07, + "loss": 0.5721, + "step": 11800 + }, + { + "epoch": 0.12461907404886324, + "grad_norm": 2.737337589263916, + "learning_rate": 9.721389246072307e-07, + "loss": 0.5963, + "step": 11900 + }, + { + "epoch": 0.12566629315851754, + "grad_norm": 2.453996181488037, + "learning_rate": 9.715837799914517e-07, + "loss": 0.5917, + "step": 12000 + }, + { + "epoch": 0.12671351226817187, + "grad_norm": 2.9003748893737793, + "learning_rate": 9.710233207393463e-07, + "loss": 0.5603, + "step": 12100 + }, + { + "epoch": 0.12776073137782618, + "grad_norm": 2.409175395965576, + "learning_rate": 9.704575531671562e-07, + "loss": 0.568, + "step": 12200 + }, + { + "epoch": 0.1288079504874805, + "grad_norm": 3.183899402618408, + "learning_rate": 9.698864836509463e-07, + "loss": 0.5702, + "step": 12300 + }, + { + "epoch": 0.1298551695971348, + "grad_norm": 2.7574760913848877, + "learning_rate": 9.693101186265336e-07, + "loss": 0.5394, + "step": 12400 + }, + { + "epoch": 0.1309023887067891, + "grad_norm": 2.9319100379943848, + "learning_rate": 9.687284645894139e-07, + "loss": 0.5504, + "step": 12500 + }, + { + "epoch": 0.13194960781644344, + "grad_norm": 2.8977279663085938, + "learning_rate": 9.681415280946887e-07, + "loss": 0.611, + "step": 12600 + }, + { + "epoch": 0.13299682692609774, + "grad_norm": 1.9469819068908691, + "learning_rate": 9.675493157569922e-07, + "loss": 0.5621, + "step": 12700 + }, + { + "epoch": 0.13404404603575207, + "grad_norm": 2.0829553604125977, + "learning_rate": 9.669518342504155e-07, + "loss": 0.5305, + "step": 12800 + }, + { + "epoch": 0.13509126514540637, + "grad_norm": 3.0171096324920654, + "learning_rate": 9.663490903084324e-07, + "loss": 0.5666, + "step": 12900 + }, + { + "epoch": 0.1361384842550607, + "grad_norm": 3.0453896522521973, + "learning_rate": 9.657410907238224e-07, + "loss": 0.5332, + "step": 13000 + }, + { + "epoch": 0.137185703364715, + "grad_norm": 2.2059998512268066, + "learning_rate": 9.651278423485958e-07, + "loss": 0.5859, + "step": 13100 + }, + { + "epoch": 0.1382329224743693, + "grad_norm": 2.076673746109009, + "learning_rate": 9.645093520939146e-07, + "loss": 0.5048, + "step": 13200 + }, + { + "epoch": 0.13928014158402363, + "grad_norm": 1.7987829446792603, + "learning_rate": 9.638856269300163e-07, + "loss": 0.5501, + "step": 13300 + }, + { + "epoch": 0.14032736069367793, + "grad_norm": 3.1706273555755615, + "learning_rate": 9.63256673886134e-07, + "loss": 0.5389, + "step": 13400 + }, + { + "epoch": 0.14137457980333226, + "grad_norm": 2.9992752075195312, + "learning_rate": 9.626225000504177e-07, + "loss": 0.5517, + "step": 13500 + }, + { + "epoch": 0.14242179891298656, + "grad_norm": 1.2536182403564453, + "learning_rate": 9.619831125698552e-07, + "loss": 0.5304, + "step": 13600 + }, + { + "epoch": 0.14346901802264087, + "grad_norm": 2.491206645965576, + "learning_rate": 9.6133851865019e-07, + "loss": 0.5001, + "step": 13700 + }, + { + "epoch": 0.1445162371322952, + "grad_norm": 2.180227518081665, + "learning_rate": 9.606887255558417e-07, + "loss": 0.5149, + "step": 13800 + }, + { + "epoch": 0.1455634562419495, + "grad_norm": 1.546883463859558, + "learning_rate": 9.60033740609823e-07, + "loss": 0.5566, + "step": 13900 + }, + 
{ + "epoch": 0.14661067535160383, + "grad_norm": 2.402559757232666, + "learning_rate": 9.593735711936567e-07, + "loss": 0.5343, + "step": 14000 + }, + { + "epoch": 0.14765789446125813, + "grad_norm": 4.94249153137207, + "learning_rate": 9.587082247472948e-07, + "loss": 0.516, + "step": 14100 + }, + { + "epoch": 0.14870511357091243, + "grad_norm": 1.760003924369812, + "learning_rate": 9.580377087690324e-07, + "loss": 0.5395, + "step": 14200 + }, + { + "epoch": 0.14975233268056676, + "grad_norm": 2.1215927600860596, + "learning_rate": 9.573620308154238e-07, + "loss": 0.55, + "step": 14300 + }, + { + "epoch": 0.15079955179022106, + "grad_norm": 2.929760217666626, + "learning_rate": 9.566811985011981e-07, + "loss": 0.5571, + "step": 14400 + }, + { + "epoch": 0.1518467708998754, + "grad_norm": 2.7724721431732178, + "learning_rate": 9.559952194991726e-07, + "loss": 0.5712, + "step": 14500 + }, + { + "epoch": 0.1528939900095297, + "grad_norm": 2.270812749862671, + "learning_rate": 9.55304101540166e-07, + "loss": 0.5355, + "step": 14600 + }, + { + "epoch": 0.15394120911918402, + "grad_norm": 2.3572235107421875, + "learning_rate": 9.546078524129127e-07, + "loss": 0.5595, + "step": 14700 + }, + { + "epoch": 0.15498842822883832, + "grad_norm": 1.5402534008026123, + "learning_rate": 9.539064799639735e-07, + "loss": 0.5561, + "step": 14800 + }, + { + "epoch": 0.15603564733849262, + "grad_norm": 3.2286136150360107, + "learning_rate": 9.531999920976481e-07, + "loss": 0.4951, + "step": 14900 + }, + { + "epoch": 0.15708286644814695, + "grad_norm": 1.4825396537780762, + "learning_rate": 9.524883967758858e-07, + "loss": 0.5099, + "step": 15000 + }, + { + "epoch": 0.15813008555780125, + "grad_norm": 1.649629831314087, + "learning_rate": 9.517717020181953e-07, + "loss": 0.5694, + "step": 15100 + }, + { + "epoch": 0.15917730466745558, + "grad_norm": 1.8996721506118774, + "learning_rate": 9.510499159015553e-07, + "loss": 0.5364, + "step": 15200 + }, + { + "epoch": 0.16022452377710988, + "grad_norm": 3.648730993270874, + "learning_rate": 9.50323046560322e-07, + "loss": 0.5276, + "step": 15300 + }, + { + "epoch": 0.16127174288676419, + "grad_norm": 2.633986473083496, + "learning_rate": 9.495911021861396e-07, + "loss": 0.5399, + "step": 15400 + }, + { + "epoch": 0.16231896199641851, + "grad_norm": 1.8254631757736206, + "learning_rate": 9.488540910278455e-07, + "loss": 0.5484, + "step": 15500 + }, + { + "epoch": 0.16336618110607282, + "grad_norm": 2.676395893096924, + "learning_rate": 9.481120213913794e-07, + "loss": 0.5741, + "step": 15600 + }, + { + "epoch": 0.16441340021572715, + "grad_norm": 3.6794283390045166, + "learning_rate": 9.47364901639688e-07, + "loss": 0.5481, + "step": 15700 + }, + { + "epoch": 0.16546061932538145, + "grad_norm": 1.8362795114517212, + "learning_rate": 9.466127401926326e-07, + "loss": 0.5704, + "step": 15800 + }, + { + "epoch": 0.16650783843503575, + "grad_norm": 2.256762742996216, + "learning_rate": 9.458555455268924e-07, + "loss": 0.5159, + "step": 15900 + }, + { + "epoch": 0.16755505754469008, + "grad_norm": 2.6386005878448486, + "learning_rate": 9.450933261758702e-07, + "loss": 0.4916, + "step": 16000 + }, + { + "epoch": 0.16860227665434438, + "grad_norm": 2.635512113571167, + "learning_rate": 9.443260907295955e-07, + "loss": 0.508, + "step": 16100 + }, + { + "epoch": 0.1696494957639987, + "grad_norm": 1.6727428436279297, + "learning_rate": 9.435538478346282e-07, + "loss": 0.5282, + "step": 16200 + }, + { + "epoch": 0.170696714873653, + "grad_norm": 2.1256072521209717, + 
"learning_rate": 9.42776606193961e-07, + "loss": 0.5878, + "step": 16300 + }, + { + "epoch": 0.17174393398330734, + "grad_norm": 2.557060956954956, + "learning_rate": 9.419943745669209e-07, + "loss": 0.5392, + "step": 16400 + }, + { + "epoch": 0.17279115309296164, + "grad_norm": 2.912794828414917, + "learning_rate": 9.412071617690713e-07, + "loss": 0.5631, + "step": 16500 + }, + { + "epoch": 0.17383837220261594, + "grad_norm": 2.380751132965088, + "learning_rate": 9.40414976672112e-07, + "loss": 0.5518, + "step": 16600 + }, + { + "epoch": 0.17488559131227027, + "grad_norm": 2.5645503997802734, + "learning_rate": 9.396178282037795e-07, + "loss": 0.5377, + "step": 16700 + }, + { + "epoch": 0.17593281042192457, + "grad_norm": 2.270052433013916, + "learning_rate": 9.388157253477459e-07, + "loss": 0.524, + "step": 16800 + }, + { + "epoch": 0.1769800295315789, + "grad_norm": 2.3046374320983887, + "learning_rate": 9.380086771435187e-07, + "loss": 0.5224, + "step": 16900 + }, + { + "epoch": 0.1780272486412332, + "grad_norm": 1.9633408784866333, + "learning_rate": 9.371966926863381e-07, + "loss": 0.5241, + "step": 17000 + }, + { + "epoch": 0.1790744677508875, + "grad_norm": 2.206256628036499, + "learning_rate": 9.363797811270743e-07, + "loss": 0.5599, + "step": 17100 + }, + { + "epoch": 0.18012168686054184, + "grad_norm": 2.883242607116699, + "learning_rate": 9.355579516721251e-07, + "loss": 0.5472, + "step": 17200 + }, + { + "epoch": 0.18116890597019614, + "grad_norm": 3.9055755138397217, + "learning_rate": 9.34731213583312e-07, + "loss": 0.5463, + "step": 17300 + }, + { + "epoch": 0.18221612507985047, + "grad_norm": 2.9254720211029053, + "learning_rate": 9.338995761777751e-07, + "loss": 0.5385, + "step": 17400 + }, + { + "epoch": 0.18326334418950477, + "grad_norm": 2.070220947265625, + "learning_rate": 9.33063048827869e-07, + "loss": 0.597, + "step": 17500 + }, + { + "epoch": 0.18431056329915907, + "grad_norm": 2.241502285003662, + "learning_rate": 9.322216409610566e-07, + "loss": 0.4954, + "step": 17600 + }, + { + "epoch": 0.1853577824088134, + "grad_norm": 2.7689974308013916, + "learning_rate": 9.313753620598035e-07, + "loss": 0.5536, + "step": 17700 + }, + { + "epoch": 0.1864050015184677, + "grad_norm": 2.5464389324188232, + "learning_rate": 9.3052422166147e-07, + "loss": 0.5342, + "step": 17800 + }, + { + "epoch": 0.18745222062812203, + "grad_norm": 1.727013111114502, + "learning_rate": 9.296682293582049e-07, + "loss": 0.5383, + "step": 17900 + }, + { + "epoch": 0.18849943973777633, + "grad_norm": 4.623219966888428, + "learning_rate": 9.288073947968364e-07, + "loss": 0.5305, + "step": 18000 + }, + { + "epoch": 0.18954665884743066, + "grad_norm": 1.5261229276657104, + "learning_rate": 9.27941727678764e-07, + "loss": 0.5235, + "step": 18100 + }, + { + "epoch": 0.19059387795708496, + "grad_norm": 1.9866268634796143, + "learning_rate": 9.270712377598491e-07, + "loss": 0.5217, + "step": 18200 + }, + { + "epoch": 0.19164109706673926, + "grad_norm": 3.0393967628479004, + "learning_rate": 9.261959348503046e-07, + "loss": 0.5241, + "step": 18300 + }, + { + "epoch": 0.1926883161763936, + "grad_norm": 2.8217124938964844, + "learning_rate": 9.253158288145848e-07, + "loss": 0.5713, + "step": 18400 + }, + { + "epoch": 0.1937355352860479, + "grad_norm": 2.327930450439453, + "learning_rate": 9.24430929571274e-07, + "loss": 0.5191, + "step": 18500 + }, + { + "epoch": 0.19478275439570222, + "grad_norm": 2.090432643890381, + "learning_rate": 9.235412470929748e-07, + "loss": 0.5285, + "step": 18600 + }, + { + 
"epoch": 0.19582997350535652, + "grad_norm": 2.427619457244873, + "learning_rate": 9.226467914061962e-07, + "loss": 0.5157, + "step": 18700 + }, + { + "epoch": 0.19687719261501083, + "grad_norm": 3.4102041721343994, + "learning_rate": 9.217475725912391e-07, + "loss": 0.52, + "step": 18800 + }, + { + "epoch": 0.19792441172466516, + "grad_norm": 1.7967109680175781, + "learning_rate": 9.208436007820848e-07, + "loss": 0.514, + "step": 18900 + }, + { + "epoch": 0.19897163083431946, + "grad_norm": 2.5887088775634766, + "learning_rate": 9.19934886166279e-07, + "loss": 0.4798, + "step": 19000 + }, + { + "epoch": 0.2000188499439738, + "grad_norm": 2.08363676071167, + "learning_rate": 9.190214389848181e-07, + "loss": 0.5348, + "step": 19100 + }, + { + "epoch": 0.2010660690536281, + "grad_norm": 2.4554569721221924, + "learning_rate": 9.18103269532033e-07, + "loss": 0.4976, + "step": 19200 + }, + { + "epoch": 0.20211328816328242, + "grad_norm": 2.604750633239746, + "learning_rate": 9.171803881554736e-07, + "loss": 0.5048, + "step": 19300 + }, + { + "epoch": 0.20316050727293672, + "grad_norm": 1.9831663370132446, + "learning_rate": 9.162528052557925e-07, + "loss": 0.5618, + "step": 19400 + }, + { + "epoch": 0.20420772638259102, + "grad_norm": 2.6448137760162354, + "learning_rate": 9.153205312866265e-07, + "loss": 0.5382, + "step": 19500 + }, + { + "epoch": 0.20525494549224535, + "grad_norm": 2.27817964553833, + "learning_rate": 9.143835767544805e-07, + "loss": 0.5189, + "step": 19600 + }, + { + "epoch": 0.20630216460189965, + "grad_norm": 1.8295369148254395, + "learning_rate": 9.134419522186075e-07, + "loss": 0.5083, + "step": 19700 + }, + { + "epoch": 0.20734938371155398, + "grad_norm": 3.7082695960998535, + "learning_rate": 9.124956682908908e-07, + "loss": 0.4839, + "step": 19800 + }, + { + "epoch": 0.20839660282120828, + "grad_norm": 2.17672061920166, + "learning_rate": 9.115447356357238e-07, + "loss": 0.5203, + "step": 19900 + }, + { + "epoch": 0.20944382193086258, + "grad_norm": 2.759127378463745, + "learning_rate": 9.105891649698898e-07, + "loss": 0.5339, + "step": 20000 + }, + { + "epoch": 0.2104910410405169, + "grad_norm": 2.4461498260498047, + "learning_rate": 9.096289670624416e-07, + "loss": 0.5536, + "step": 20100 + }, + { + "epoch": 0.21153826015017121, + "grad_norm": 2.8688385486602783, + "learning_rate": 9.086641527345796e-07, + "loss": 0.5266, + "step": 20200 + }, + { + "epoch": 0.21258547925982554, + "grad_norm": 2.589167356491089, + "learning_rate": 9.076947328595306e-07, + "loss": 0.5031, + "step": 20300 + }, + { + "epoch": 0.21363269836947985, + "grad_norm": 3.033956289291382, + "learning_rate": 9.067207183624243e-07, + "loss": 0.5288, + "step": 20400 + }, + { + "epoch": 0.21467991747913415, + "grad_norm": 2.5122592449188232, + "learning_rate": 9.057421202201714e-07, + "loss": 0.5002, + "step": 20500 + }, + { + "epoch": 0.21572713658878848, + "grad_norm": 2.099766731262207, + "learning_rate": 9.047589494613381e-07, + "loss": 0.5389, + "step": 20600 + }, + { + "epoch": 0.21677435569844278, + "grad_norm": 2.65134596824646, + "learning_rate": 9.037712171660241e-07, + "loss": 0.5537, + "step": 20700 + }, + { + "epoch": 0.2178215748080971, + "grad_norm": 2.301417589187622, + "learning_rate": 9.027789344657357e-07, + "loss": 0.5554, + "step": 20800 + }, + { + "epoch": 0.2188687939177514, + "grad_norm": 2.6696295738220215, + "learning_rate": 9.017821125432612e-07, + "loss": 0.5191, + "step": 20900 + }, + { + "epoch": 0.21991601302740574, + "grad_norm": 2.455559015274048, + "learning_rate": 
9.007807626325455e-07, + "loss": 0.5053, + "step": 21000 + }, + { + "epoch": 0.22096323213706004, + "grad_norm": 2.676161289215088, + "learning_rate": 8.997748960185622e-07, + "loss": 0.518, + "step": 21100 + }, + { + "epoch": 0.22201045124671434, + "grad_norm": 2.6200263500213623, + "learning_rate": 8.987645240371873e-07, + "loss": 0.4884, + "step": 21200 + }, + { + "epoch": 0.22305767035636867, + "grad_norm": 3.8255863189697266, + "learning_rate": 8.977496580750712e-07, + "loss": 0.5348, + "step": 21300 + }, + { + "epoch": 0.22410488946602297, + "grad_norm": 2.0892577171325684, + "learning_rate": 8.967303095695105e-07, + "loss": 0.5178, + "step": 21400 + }, + { + "epoch": 0.2251521085756773, + "grad_norm": 2.40419864654541, + "learning_rate": 8.957064900083187e-07, + "loss": 0.584, + "step": 21500 + }, + { + "epoch": 0.2261993276853316, + "grad_norm": 3.042703628540039, + "learning_rate": 8.946782109296973e-07, + "loss": 0.5267, + "step": 21600 + }, + { + "epoch": 0.2272465467949859, + "grad_norm": 1.6234790086746216, + "learning_rate": 8.936454839221054e-07, + "loss": 0.5217, + "step": 21700 + }, + { + "epoch": 0.22829376590464023, + "grad_norm": 1.706650972366333, + "learning_rate": 8.926083206241291e-07, + "loss": 0.5242, + "step": 21800 + }, + { + "epoch": 0.22934098501429453, + "grad_norm": 4.158198833465576, + "learning_rate": 8.915667327243506e-07, + "loss": 0.524, + "step": 21900 + }, + { + "epoch": 0.23038820412394886, + "grad_norm": 2.2484548091888428, + "learning_rate": 8.905207319612163e-07, + "loss": 0.5347, + "step": 22000 + }, + { + "epoch": 0.23143542323360317, + "grad_norm": 2.990169048309326, + "learning_rate": 8.894703301229043e-07, + "loss": 0.5408, + "step": 22100 + }, + { + "epoch": 0.23248264234325747, + "grad_norm": 3.9766592979431152, + "learning_rate": 8.884155390471919e-07, + "loss": 0.5046, + "step": 22200 + }, + { + "epoch": 0.2335298614529118, + "grad_norm": 2.5463485717773438, + "learning_rate": 8.873563706213221e-07, + "loss": 0.4881, + "step": 22300 + }, + { + "epoch": 0.2345770805625661, + "grad_norm": 2.7277047634124756, + "learning_rate": 8.862928367818696e-07, + "loss": 0.5228, + "step": 22400 + }, + { + "epoch": 0.23562429967222043, + "grad_norm": 1.9528217315673828, + "learning_rate": 8.852249495146063e-07, + "loss": 0.5056, + "step": 22500 + }, + { + "epoch": 0.23667151878187473, + "grad_norm": 2.527414083480835, + "learning_rate": 8.841527208543658e-07, + "loss": 0.5186, + "step": 22600 + }, + { + "epoch": 0.23771873789152906, + "grad_norm": 1.9525986909866333, + "learning_rate": 8.830761628849087e-07, + "loss": 0.5195, + "step": 22700 + }, + { + "epoch": 0.23876595700118336, + "grad_norm": 1.6230095624923706, + "learning_rate": 8.819952877387855e-07, + "loss": 0.4834, + "step": 22800 + }, + { + "epoch": 0.23981317611083766, + "grad_norm": 2.2290198802948, + "learning_rate": 8.809101075972005e-07, + "loss": 0.5207, + "step": 22900 + }, + { + "epoch": 0.240860395220492, + "grad_norm": 3.419203996658325, + "learning_rate": 8.798206346898743e-07, + "loss": 0.5064, + "step": 23000 + }, + { + "epoch": 0.2419076143301463, + "grad_norm": 2.360508441925049, + "learning_rate": 8.787268812949054e-07, + "loss": 0.5011, + "step": 23100 + }, + { + "epoch": 0.24295483343980062, + "grad_norm": 1.8023535013198853, + "learning_rate": 8.77628859738633e-07, + "loss": 0.5099, + "step": 23200 + }, + { + "epoch": 0.24400205254945492, + "grad_norm": 1.9575679302215576, + "learning_rate": 8.765265823954972e-07, + "loss": 0.5361, + "step": 23300 + }, + { + "epoch": 
0.24504927165910922, + "grad_norm": 1.5841313600540161, + "learning_rate": 8.754200616879001e-07, + "loss": 0.541, + "step": 23400 + }, + { + "epoch": 0.24609649076876355, + "grad_norm": 2.8605728149414062, + "learning_rate": 8.743093100860648e-07, + "loss": 0.5541, + "step": 23500 + }, + { + "epoch": 0.24714370987841786, + "grad_norm": 1.696733832359314, + "learning_rate": 8.731943401078961e-07, + "loss": 0.511, + "step": 23600 + }, + { + "epoch": 0.24819092898807218, + "grad_norm": 2.1618356704711914, + "learning_rate": 8.720751643188389e-07, + "loss": 0.5066, + "step": 23700 + }, + { + "epoch": 0.2492381480977265, + "grad_norm": 2.721067428588867, + "learning_rate": 8.709517953317365e-07, + "loss": 0.5398, + "step": 23800 + }, + { + "epoch": 0.2502853672073808, + "grad_norm": 1.8457568883895874, + "learning_rate": 8.698242458066882e-07, + "loss": 0.4879, + "step": 23900 + }, + { + "epoch": 0.2513325863170351, + "grad_norm": 2.435941696166992, + "learning_rate": 8.686925284509077e-07, + "loss": 0.531, + "step": 24000 + }, + { + "epoch": 0.2523798054266894, + "grad_norm": 2.617920160293579, + "learning_rate": 8.675566560185786e-07, + "loss": 0.5189, + "step": 24100 + }, + { + "epoch": 0.25342702453634375, + "grad_norm": 2.538632869720459, + "learning_rate": 8.664166413107109e-07, + "loss": 0.5433, + "step": 24200 + }, + { + "epoch": 0.2544742436459981, + "grad_norm": 2.3944451808929443, + "learning_rate": 8.65272497174998e-07, + "loss": 0.5401, + "step": 24300 + }, + { + "epoch": 0.25552146275565235, + "grad_norm": 3.6203765869140625, + "learning_rate": 8.641242365056705e-07, + "loss": 0.544, + "step": 24400 + }, + { + "epoch": 0.2565686818653067, + "grad_norm": 2.866250991821289, + "learning_rate": 8.629718722433507e-07, + "loss": 0.5357, + "step": 24500 + }, + { + "epoch": 0.257615900974961, + "grad_norm": 3.3872838020324707, + "learning_rate": 8.618154173749088e-07, + "loss": 0.5261, + "step": 24600 + }, + { + "epoch": 0.2586631200846153, + "grad_norm": 2.269967794418335, + "learning_rate": 8.606548849333138e-07, + "loss": 0.5128, + "step": 24700 + }, + { + "epoch": 0.2597103391942696, + "grad_norm": 2.1335697174072266, + "learning_rate": 8.594902879974888e-07, + "loss": 0.5645, + "step": 24800 + }, + { + "epoch": 0.26075755830392394, + "grad_norm": 2.443239212036133, + "learning_rate": 8.583216396921624e-07, + "loss": 0.4806, + "step": 24900 + }, + { + "epoch": 0.2618047774135782, + "grad_norm": 2.713833808898926, + "learning_rate": 8.571489531877214e-07, + "loss": 0.5271, + "step": 25000 + }, + { + "epoch": 0.26285199652323255, + "grad_norm": 3.485182046890259, + "learning_rate": 8.559722417000619e-07, + "loss": 0.4962, + "step": 25100 + }, + { + "epoch": 0.2638992156328869, + "grad_norm": 2.306403160095215, + "learning_rate": 8.547915184904409e-07, + "loss": 0.5122, + "step": 25200 + }, + { + "epoch": 0.2649464347425412, + "grad_norm": 2.6151928901672363, + "learning_rate": 8.536067968653261e-07, + "loss": 0.5316, + "step": 25300 + }, + { + "epoch": 0.2659936538521955, + "grad_norm": 2.3466389179229736, + "learning_rate": 8.524180901762469e-07, + "loss": 0.4991, + "step": 25400 + }, + { + "epoch": 0.2670408729618498, + "grad_norm": 2.0926601886749268, + "learning_rate": 8.512254118196429e-07, + "loss": 0.5254, + "step": 25500 + }, + { + "epoch": 0.26808809207150414, + "grad_norm": 1.9708478450775146, + "learning_rate": 8.500287752367142e-07, + "loss": 0.507, + "step": 25600 + }, + { + "epoch": 0.2691353111811584, + "grad_norm": 2.028843879699707, + "learning_rate": 
8.48828193913268e-07, + "loss": 0.5066, + "step": 25700 + }, + { + "epoch": 0.27018253029081274, + "grad_norm": 2.9337289333343506, + "learning_rate": 8.47623681379569e-07, + "loss": 0.5023, + "step": 25800 + }, + { + "epoch": 0.27122974940046707, + "grad_norm": 2.8608200550079346, + "learning_rate": 8.464152512101848e-07, + "loss": 0.5417, + "step": 25900 + }, + { + "epoch": 0.2722769685101214, + "grad_norm": 3.0925405025482178, + "learning_rate": 8.452029170238344e-07, + "loss": 0.5415, + "step": 26000 + }, + { + "epoch": 0.27332418761977567, + "grad_norm": 1.9558321237564087, + "learning_rate": 8.439866924832338e-07, + "loss": 0.519, + "step": 26100 + }, + { + "epoch": 0.27437140672943, + "grad_norm": 1.5545213222503662, + "learning_rate": 8.427665912949425e-07, + "loss": 0.5441, + "step": 26200 + }, + { + "epoch": 0.27541862583908433, + "grad_norm": 3.6202712059020996, + "learning_rate": 8.415426272092089e-07, + "loss": 0.5559, + "step": 26300 + }, + { + "epoch": 0.2764658449487386, + "grad_norm": 1.8004056215286255, + "learning_rate": 8.403148140198151e-07, + "loss": 0.5034, + "step": 26400 + }, + { + "epoch": 0.27751306405839293, + "grad_norm": 2.5597338676452637, + "learning_rate": 8.390831655639223e-07, + "loss": 0.5294, + "step": 26500 + }, + { + "epoch": 0.27856028316804726, + "grad_norm": 2.014400005340576, + "learning_rate": 8.378476957219134e-07, + "loss": 0.5663, + "step": 26600 + }, + { + "epoch": 0.27960750227770154, + "grad_norm": 2.069840669631958, + "learning_rate": 8.366084184172377e-07, + "loss": 0.5007, + "step": 26700 + }, + { + "epoch": 0.28065472138735587, + "grad_norm": 5.621069431304932, + "learning_rate": 8.353653476162543e-07, + "loss": 0.5263, + "step": 26800 + }, + { + "epoch": 0.2817019404970102, + "grad_norm": 3.1065540313720703, + "learning_rate": 8.341184973280732e-07, + "loss": 0.5048, + "step": 26900 + }, + { + "epoch": 0.2827491596066645, + "grad_norm": 2.579742431640625, + "learning_rate": 8.328678816043988e-07, + "loss": 0.5272, + "step": 27000 + }, + { + "epoch": 0.2837963787163188, + "grad_norm": 2.476778030395508, + "learning_rate": 8.31613514539371e-07, + "loss": 0.4944, + "step": 27100 + }, + { + "epoch": 0.2848435978259731, + "grad_norm": 2.7026314735412598, + "learning_rate": 8.303554102694065e-07, + "loss": 0.5257, + "step": 27200 + }, + { + "epoch": 0.28589081693562746, + "grad_norm": 2.1597368717193604, + "learning_rate": 8.290935829730391e-07, + "loss": 0.5282, + "step": 27300 + }, + { + "epoch": 0.28693803604528173, + "grad_norm": 2.447305202484131, + "learning_rate": 8.278280468707606e-07, + "loss": 0.5295, + "step": 27400 + }, + { + "epoch": 0.28798525515493606, + "grad_norm": 2.806995391845703, + "learning_rate": 8.265588162248597e-07, + "loss": 0.4933, + "step": 27500 + }, + { + "epoch": 0.2890324742645904, + "grad_norm": 2.1765849590301514, + "learning_rate": 8.252859053392622e-07, + "loss": 0.5486, + "step": 27600 + }, + { + "epoch": 0.2900796933742447, + "grad_norm": 2.122382640838623, + "learning_rate": 8.240093285593692e-07, + "loss": 0.5255, + "step": 27700 + }, + { + "epoch": 0.291126912483899, + "grad_norm": 2.136657476425171, + "learning_rate": 8.22729100271895e-07, + "loss": 0.5214, + "step": 27800 + }, + { + "epoch": 0.2921741315935533, + "grad_norm": 2.033987522125244, + "learning_rate": 8.214452349047065e-07, + "loss": 0.5065, + "step": 27900 + }, + { + "epoch": 0.29322135070320765, + "grad_norm": 3.346703290939331, + "learning_rate": 8.20157746926659e-07, + "loss": 0.5349, + "step": 28000 + }, + { + "epoch": 
0.2942685698128619, + "grad_norm": 2.63242244720459, + "learning_rate": 8.188666508474335e-07, + "loss": 0.5264, + "step": 28100 + }, + { + "epoch": 0.29531578892251625, + "grad_norm": 2.475911855697632, + "learning_rate": 8.175719612173741e-07, + "loss": 0.5186, + "step": 28200 + }, + { + "epoch": 0.2963630080321706, + "grad_norm": 1.5967457294464111, + "learning_rate": 8.162736926273231e-07, + "loss": 0.5321, + "step": 28300 + }, + { + "epoch": 0.29741022714182486, + "grad_norm": 1.6950793266296387, + "learning_rate": 8.149718597084565e-07, + "loss": 0.5028, + "step": 28400 + }, + { + "epoch": 0.2984574462514792, + "grad_norm": 1.8821123838424683, + "learning_rate": 8.136664771321198e-07, + "loss": 0.5147, + "step": 28500 + }, + { + "epoch": 0.2995046653611335, + "grad_norm": 3.8432750701904297, + "learning_rate": 8.123575596096624e-07, + "loss": 0.5055, + "step": 28600 + }, + { + "epoch": 0.30055188447078784, + "grad_norm": 2.2065136432647705, + "learning_rate": 8.110451218922711e-07, + "loss": 0.4804, + "step": 28700 + }, + { + "epoch": 0.3015991035804421, + "grad_norm": 3.215104103088379, + "learning_rate": 8.097291787708052e-07, + "loss": 0.508, + "step": 28800 + }, + { + "epoch": 0.30264632269009645, + "grad_norm": 2.6659111976623535, + "learning_rate": 8.084097450756286e-07, + "loss": 0.5058, + "step": 28900 + }, + { + "epoch": 0.3036935417997508, + "grad_norm": 3.1594624519348145, + "learning_rate": 8.070868356764431e-07, + "loss": 0.4819, + "step": 29000 + }, + { + "epoch": 0.30474076090940505, + "grad_norm": 3.2502479553222656, + "learning_rate": 8.05760465482121e-07, + "loss": 0.5132, + "step": 29100 + }, + { + "epoch": 0.3057879800190594, + "grad_norm": 2.3569111824035645, + "learning_rate": 8.044306494405372e-07, + "loss": 0.4989, + "step": 29200 + }, + { + "epoch": 0.3068351991287137, + "grad_norm": 2.7516555786132812, + "learning_rate": 8.030974025384e-07, + "loss": 0.4982, + "step": 29300 + }, + { + "epoch": 0.30788241823836804, + "grad_norm": 2.388401508331299, + "learning_rate": 8.017607398010829e-07, + "loss": 0.492, + "step": 29400 + }, + { + "epoch": 0.3089296373480223, + "grad_norm": 2.49920392036438, + "learning_rate": 8.004206762924548e-07, + "loss": 0.4729, + "step": 29500 + }, + { + "epoch": 0.30997685645767664, + "grad_norm": 2.528714179992676, + "learning_rate": 7.99077227114711e-07, + "loss": 0.5229, + "step": 29600 + }, + { + "epoch": 0.31102407556733097, + "grad_norm": 2.0866329669952393, + "learning_rate": 7.977304074082021e-07, + "loss": 0.483, + "step": 29700 + }, + { + "epoch": 0.31207129467698524, + "grad_norm": 3.1670796871185303, + "learning_rate": 7.963802323512638e-07, + "loss": 0.4816, + "step": 29800 + }, + { + "epoch": 0.3131185137866396, + "grad_norm": 1.9715406894683838, + "learning_rate": 7.950267171600458e-07, + "loss": 0.4666, + "step": 29900 + }, + { + "epoch": 0.3141657328962939, + "grad_norm": 1.6176679134368896, + "learning_rate": 7.936698770883404e-07, + "loss": 0.4886, + "step": 30000 + }, + { + "epoch": 0.3152129520059482, + "grad_norm": 2.4239096641540527, + "learning_rate": 7.923097274274103e-07, + "loss": 0.5085, + "step": 30100 + }, + { + "epoch": 0.3162601711156025, + "grad_norm": 1.8292428255081177, + "learning_rate": 7.909462835058169e-07, + "loss": 0.538, + "step": 30200 + }, + { + "epoch": 0.31730739022525684, + "grad_norm": 2.2372076511383057, + "learning_rate": 7.895795606892466e-07, + "loss": 0.5099, + "step": 30300 + }, + { + "epoch": 0.31835460933491116, + "grad_norm": 1.9392811059951782, + "learning_rate": 
7.882095743803386e-07, + "loss": 0.4947, + "step": 30400 + }, + { + "epoch": 0.31940182844456544, + "grad_norm": 2.645183801651001, + "learning_rate": 7.868363400185106e-07, + "loss": 0.5012, + "step": 30500 + }, + { + "epoch": 0.32044904755421977, + "grad_norm": 3.2452821731567383, + "learning_rate": 7.85459873079785e-07, + "loss": 0.4696, + "step": 30600 + }, + { + "epoch": 0.3214962666638741, + "grad_norm": 1.310027003288269, + "learning_rate": 7.84080189076615e-07, + "loss": 0.5183, + "step": 30700 + }, + { + "epoch": 0.32254348577352837, + "grad_norm": 2.6369211673736572, + "learning_rate": 7.826973035577091e-07, + "loss": 0.5135, + "step": 30800 + }, + { + "epoch": 0.3235907048831827, + "grad_norm": 2.9246723651885986, + "learning_rate": 7.813112321078559e-07, + "loss": 0.527, + "step": 30900 + }, + { + "epoch": 0.32463792399283703, + "grad_norm": 3.309020519256592, + "learning_rate": 7.799219903477489e-07, + "loss": 0.5322, + "step": 31000 + }, + { + "epoch": 0.32568514310249136, + "grad_norm": 2.4480512142181396, + "learning_rate": 7.785295939338105e-07, + "loss": 0.5234, + "step": 31100 + }, + { + "epoch": 0.32673236221214563, + "grad_norm": 1.7909550666809082, + "learning_rate": 7.771340585580149e-07, + "loss": 0.4938, + "step": 31200 + }, + { + "epoch": 0.32777958132179996, + "grad_norm": 2.6975667476654053, + "learning_rate": 7.757353999477114e-07, + "loss": 0.491, + "step": 31300 + }, + { + "epoch": 0.3288268004314543, + "grad_norm": 2.4480390548706055, + "learning_rate": 7.743336338654483e-07, + "loss": 0.538, + "step": 31400 + }, + { + "epoch": 0.32987401954110857, + "grad_norm": 1.8292025327682495, + "learning_rate": 7.729287761087935e-07, + "loss": 0.4906, + "step": 31500 + }, + { + "epoch": 0.3309212386507629, + "grad_norm": 1.5502568483352661, + "learning_rate": 7.715208425101576e-07, + "loss": 0.459, + "step": 31600 + }, + { + "epoch": 0.3319684577604172, + "grad_norm": 2.6698973178863525, + "learning_rate": 7.701098489366156e-07, + "loss": 0.5086, + "step": 31700 + }, + { + "epoch": 0.3330156768700715, + "grad_norm": 2.4431324005126953, + "learning_rate": 7.686958112897271e-07, + "loss": 0.4843, + "step": 31800 + }, + { + "epoch": 0.3340628959797258, + "grad_norm": 2.875575065612793, + "learning_rate": 7.67278745505358e-07, + "loss": 0.5171, + "step": 31900 + }, + { + "epoch": 0.33511011508938016, + "grad_norm": 2.196960210800171, + "learning_rate": 7.658586675535005e-07, + "loss": 0.5026, + "step": 32000 + }, + { + "epoch": 0.3361573341990345, + "grad_norm": 2.801039457321167, + "learning_rate": 7.644355934380933e-07, + "loss": 0.5175, + "step": 32100 + }, + { + "epoch": 0.33720455330868876, + "grad_norm": 2.4252429008483887, + "learning_rate": 7.630095391968407e-07, + "loss": 0.492, + "step": 32200 + }, + { + "epoch": 0.3382517724183431, + "grad_norm": 1.9080466032028198, + "learning_rate": 7.615805209010334e-07, + "loss": 0.5203, + "step": 32300 + }, + { + "epoch": 0.3392989915279974, + "grad_norm": 1.8371050357818604, + "learning_rate": 7.601485546553647e-07, + "loss": 0.5028, + "step": 32400 + }, + { + "epoch": 0.3403462106376517, + "grad_norm": 3.5394959449768066, + "learning_rate": 7.587136565977522e-07, + "loss": 0.5203, + "step": 32500 + }, + { + "epoch": 0.341393429747306, + "grad_norm": 2.381826400756836, + "learning_rate": 7.572758428991532e-07, + "loss": 0.5254, + "step": 32600 + }, + { + "epoch": 0.34244064885696035, + "grad_norm": 1.7615987062454224, + "learning_rate": 7.55835129763384e-07, + "loss": 0.5091, + "step": 32700 + }, + { + "epoch": 
0.3434878679666147, + "grad_norm": 2.329334020614624, + "learning_rate": 7.543915334269365e-07, + "loss": 0.5004, + "step": 32800 + }, + { + "epoch": 0.34453508707626895, + "grad_norm": 2.9679040908813477, + "learning_rate": 7.529450701587963e-07, + "loss": 0.5114, + "step": 32900 + }, + { + "epoch": 0.3455823061859233, + "grad_norm": 3.3162288665771484, + "learning_rate": 7.514957562602582e-07, + "loss": 0.5055, + "step": 33000 + }, + { + "epoch": 0.3466295252955776, + "grad_norm": 2.0709986686706543, + "learning_rate": 7.500436080647428e-07, + "loss": 0.5574, + "step": 33100 + }, + { + "epoch": 0.3476767444052319, + "grad_norm": 2.1400296688079834, + "learning_rate": 7.485886419376126e-07, + "loss": 0.5777, + "step": 33200 + }, + { + "epoch": 0.3487239635148862, + "grad_norm": 2.4479362964630127, + "learning_rate": 7.471308742759879e-07, + "loss": 0.5378, + "step": 33300 + }, + { + "epoch": 0.34977118262454054, + "grad_norm": 2.2012875080108643, + "learning_rate": 7.456703215085609e-07, + "loss": 0.4941, + "step": 33400 + }, + { + "epoch": 0.3508184017341948, + "grad_norm": 2.5233943462371826, + "learning_rate": 7.44207000095412e-07, + "loss": 0.547, + "step": 33500 + }, + { + "epoch": 0.35186562084384915, + "grad_norm": 2.050294876098633, + "learning_rate": 7.427409265278235e-07, + "loss": 0.5326, + "step": 33600 + }, + { + "epoch": 0.3529128399535035, + "grad_norm": 1.9416810274124146, + "learning_rate": 7.412721173280931e-07, + "loss": 0.5373, + "step": 33700 + }, + { + "epoch": 0.3539600590631578, + "grad_norm": 2.4550209045410156, + "learning_rate": 7.398005890493493e-07, + "loss": 0.5025, + "step": 33800 + }, + { + "epoch": 0.3550072781728121, + "grad_norm": 2.1860315799713135, + "learning_rate": 7.383263582753633e-07, + "loss": 0.4961, + "step": 33900 + }, + { + "epoch": 0.3560544972824664, + "grad_norm": 3.3393681049346924, + "learning_rate": 7.368494416203632e-07, + "loss": 0.5014, + "step": 34000 + }, + { + "epoch": 0.35710171639212074, + "grad_norm": 2.2855758666992188, + "learning_rate": 7.353698557288462e-07, + "loss": 0.5179, + "step": 34100 + }, + { + "epoch": 0.358148935501775, + "grad_norm": 2.719910144805908, + "learning_rate": 7.338876172753913e-07, + "loss": 0.5151, + "step": 34200 + }, + { + "epoch": 0.35919615461142934, + "grad_norm": 2.3122212886810303, + "learning_rate": 7.324027429644709e-07, + "loss": 0.5075, + "step": 34300 + }, + { + "epoch": 0.36024337372108367, + "grad_norm": 2.5901198387145996, + "learning_rate": 7.309152495302631e-07, + "loss": 0.5185, + "step": 34400 + }, + { + "epoch": 0.361290592830738, + "grad_norm": 2.749903440475464, + "learning_rate": 7.294251537364629e-07, + "loss": 0.4728, + "step": 34500 + }, + { + "epoch": 0.3623378119403923, + "grad_norm": 2.453977108001709, + "learning_rate": 7.279324723760932e-07, + "loss": 0.5197, + "step": 34600 + }, + { + "epoch": 0.3633850310500466, + "grad_norm": 3.2406835556030273, + "learning_rate": 7.264372222713157e-07, + "loss": 0.4856, + "step": 34700 + }, + { + "epoch": 0.36443225015970093, + "grad_norm": 2.1802427768707275, + "learning_rate": 7.249394202732414e-07, + "loss": 0.4996, + "step": 34800 + }, + { + "epoch": 0.3654794692693552, + "grad_norm": 1.560670256614685, + "learning_rate": 7.234390832617399e-07, + "loss": 0.5032, + "step": 34900 + }, + { + "epoch": 0.36652668837900954, + "grad_norm": 2.8153815269470215, + "learning_rate": 7.219362281452504e-07, + "loss": 0.4882, + "step": 35000 + }, + { + "epoch": 0.36757390748866386, + "grad_norm": 3.205367088317871, + "learning_rate": 
7.204308718605906e-07, + "loss": 0.5232, + "step": 35100 + }, + { + "epoch": 0.36862112659831814, + "grad_norm": 1.6098523139953613, + "learning_rate": 7.189230313727651e-07, + "loss": 0.488, + "step": 35200 + }, + { + "epoch": 0.36966834570797247, + "grad_norm": 2.2674808502197266, + "learning_rate": 7.174127236747756e-07, + "loss": 0.5026, + "step": 35300 + }, + { + "epoch": 0.3707155648176268, + "grad_norm": 2.0923283100128174, + "learning_rate": 7.158999657874283e-07, + "loss": 0.5292, + "step": 35400 + }, + { + "epoch": 0.3717627839272811, + "grad_norm": 2.078521251678467, + "learning_rate": 7.143847747591423e-07, + "loss": 0.5002, + "step": 35500 + }, + { + "epoch": 0.3728100030369354, + "grad_norm": 2.299473285675049, + "learning_rate": 7.128671676657579e-07, + "loss": 0.5132, + "step": 35600 + }, + { + "epoch": 0.37385722214658973, + "grad_norm": 1.3978760242462158, + "learning_rate": 7.113471616103441e-07, + "loss": 0.5182, + "step": 35700 + }, + { + "epoch": 0.37490444125624406, + "grad_norm": 2.559293746948242, + "learning_rate": 7.098247737230052e-07, + "loss": 0.5202, + "step": 35800 + }, + { + "epoch": 0.37595166036589833, + "grad_norm": 2.457498788833618, + "learning_rate": 7.083000211606881e-07, + "loss": 0.4946, + "step": 35900 + }, + { + "epoch": 0.37699887947555266, + "grad_norm": 1.9849262237548828, + "learning_rate": 7.067729211069892e-07, + "loss": 0.4932, + "step": 36000 + }, + { + "epoch": 0.378046098585207, + "grad_norm": 2.242328405380249, + "learning_rate": 7.05243490771961e-07, + "loss": 0.4853, + "step": 36100 + }, + { + "epoch": 0.3790933176948613, + "grad_norm": 4.18756103515625, + "learning_rate": 7.037117473919169e-07, + "loss": 0.5271, + "step": 36200 + }, + { + "epoch": 0.3801405368045156, + "grad_norm": 2.454249382019043, + "learning_rate": 7.021777082292384e-07, + "loss": 0.5208, + "step": 36300 + }, + { + "epoch": 0.3811877559141699, + "grad_norm": 1.5989599227905273, + "learning_rate": 7.006413905721796e-07, + "loss": 0.5252, + "step": 36400 + }, + { + "epoch": 0.38223497502382425, + "grad_norm": 3.1384224891662598, + "learning_rate": 6.991028117346727e-07, + "loss": 0.5231, + "step": 36500 + }, + { + "epoch": 0.3832821941334785, + "grad_norm": 3.674887180328369, + "learning_rate": 6.975619890561331e-07, + "loss": 0.5338, + "step": 36600 + }, + { + "epoch": 0.38432941324313286, + "grad_norm": 2.8714184761047363, + "learning_rate": 6.960189399012635e-07, + "loss": 0.4667, + "step": 36700 + }, + { + "epoch": 0.3853766323527872, + "grad_norm": 2.0271899700164795, + "learning_rate": 6.944736816598585e-07, + "loss": 0.5439, + "step": 36800 + }, + { + "epoch": 0.38642385146244146, + "grad_norm": 2.3302154541015625, + "learning_rate": 6.929262317466087e-07, + "loss": 0.5085, + "step": 36900 + }, + { + "epoch": 0.3874710705720958, + "grad_norm": 1.89630126953125, + "learning_rate": 6.913766076009042e-07, + "loss": 0.489, + "step": 37000 + }, + { + "epoch": 0.3885182896817501, + "grad_norm": 3.864342212677002, + "learning_rate": 6.898248266866383e-07, + "loss": 0.4782, + "step": 37100 + }, + { + "epoch": 0.38956550879140445, + "grad_norm": 3.6760518550872803, + "learning_rate": 6.882709064920104e-07, + "loss": 0.5387, + "step": 37200 + }, + { + "epoch": 0.3906127279010587, + "grad_norm": 2.225639581680298, + "learning_rate": 6.867148645293292e-07, + "loss": 0.5417, + "step": 37300 + }, + { + "epoch": 0.39165994701071305, + "grad_norm": 1.6425765752792358, + "learning_rate": 6.85156718334815e-07, + "loss": 0.501, + "step": 37400 + }, + { + "epoch": 
0.3927071661203674, + "grad_norm": 2.095388650894165, + "learning_rate": 6.835964854684027e-07, + "loss": 0.5244, + "step": 37500 + }, + { + "epoch": 0.39375438523002165, + "grad_norm": 1.9956177473068237, + "learning_rate": 6.820341835135434e-07, + "loss": 0.4862, + "step": 37600 + }, + { + "epoch": 0.394801604339676, + "grad_norm": 2.3689606189727783, + "learning_rate": 6.804698300770058e-07, + "loss": 0.5174, + "step": 37700 + }, + { + "epoch": 0.3958488234493303, + "grad_norm": 2.4154350757598877, + "learning_rate": 6.789034427886788e-07, + "loss": 0.5232, + "step": 37800 + }, + { + "epoch": 0.39689604255898464, + "grad_norm": 2.841860055923462, + "learning_rate": 6.773350393013725e-07, + "loss": 0.4952, + "step": 37900 + }, + { + "epoch": 0.3979432616686389, + "grad_norm": 1.6685402393341064, + "learning_rate": 6.757646372906183e-07, + "loss": 0.5136, + "step": 38000 + }, + { + "epoch": 0.39899048077829324, + "grad_norm": 2.3947384357452393, + "learning_rate": 6.741922544544716e-07, + "loss": 0.4728, + "step": 38100 + }, + { + "epoch": 0.4000376998879476, + "grad_norm": 1.9924613237380981, + "learning_rate": 6.726179085133102e-07, + "loss": 0.5101, + "step": 38200 + }, + { + "epoch": 0.40108491899760185, + "grad_norm": 2.3830676078796387, + "learning_rate": 6.710416172096361e-07, + "loss": 0.489, + "step": 38300 + }, + { + "epoch": 0.4021321381072562, + "grad_norm": 2.6001055240631104, + "learning_rate": 6.69463398307875e-07, + "loss": 0.5337, + "step": 38400 + }, + { + "epoch": 0.4031793572169105, + "grad_norm": 2.329277753829956, + "learning_rate": 6.678832695941763e-07, + "loss": 0.469, + "step": 38500 + }, + { + "epoch": 0.40422657632656483, + "grad_norm": 2.2831122875213623, + "learning_rate": 6.663012488762123e-07, + "loss": 0.5279, + "step": 38600 + }, + { + "epoch": 0.4052737954362191, + "grad_norm": 2.813821315765381, + "learning_rate": 6.647173539829778e-07, + "loss": 0.4873, + "step": 38700 + }, + { + "epoch": 0.40632101454587344, + "grad_norm": 2.3835694789886475, + "learning_rate": 6.631316027645892e-07, + "loss": 0.4991, + "step": 38800 + }, + { + "epoch": 0.40736823365552777, + "grad_norm": 2.7960257530212402, + "learning_rate": 6.615440130920833e-07, + "loss": 0.5366, + "step": 38900 + }, + { + "epoch": 0.40841545276518204, + "grad_norm": 1.9220885038375854, + "learning_rate": 6.599546028572153e-07, + "loss": 0.5111, + "step": 39000 + }, + { + "epoch": 0.40946267187483637, + "grad_norm": 2.636683464050293, + "learning_rate": 6.583633899722587e-07, + "loss": 0.5058, + "step": 39100 + }, + { + "epoch": 0.4105098909844907, + "grad_norm": 2.0583505630493164, + "learning_rate": 6.567703923698013e-07, + "loss": 0.4796, + "step": 39200 + }, + { + "epoch": 0.411557110094145, + "grad_norm": 3.092818021774292, + "learning_rate": 6.551756280025453e-07, + "loss": 0.5181, + "step": 39300 + }, + { + "epoch": 0.4126043292037993, + "grad_norm": 2.689857006072998, + "learning_rate": 6.535791148431031e-07, + "loss": 0.5424, + "step": 39400 + }, + { + "epoch": 0.41365154831345363, + "grad_norm": 1.4727122783660889, + "learning_rate": 6.519808708837958e-07, + "loss": 0.5257, + "step": 39500 + }, + { + "epoch": 0.41469876742310796, + "grad_norm": 2.4704394340515137, + "learning_rate": 6.503809141364506e-07, + "loss": 0.5043, + "step": 39600 + }, + { + "epoch": 0.41574598653276224, + "grad_norm": 2.2205686569213867, + "learning_rate": 6.487792626321969e-07, + "loss": 0.4732, + "step": 39700 + }, + { + "epoch": 0.41679320564241656, + "grad_norm": 4.539642333984375, + "learning_rate": 
6.471759344212637e-07, + "loss": 0.5028, + "step": 39800 + }, + { + "epoch": 0.4178404247520709, + "grad_norm": 3.22900652885437, + "learning_rate": 6.455709475727764e-07, + "loss": 0.4802, + "step": 39900 + }, + { + "epoch": 0.41888764386172517, + "grad_norm": 1.7866666316986084, + "learning_rate": 6.439643201745524e-07, + "loss": 0.4677, + "step": 40000 + }, + { + "epoch": 0.4199348629713795, + "grad_norm": 1.5298930406570435, + "learning_rate": 6.423560703328981e-07, + "loss": 0.4663, + "step": 40100 + }, + { + "epoch": 0.4209820820810338, + "grad_norm": 2.7381436824798584, + "learning_rate": 6.407462161724042e-07, + "loss": 0.5032, + "step": 40200 + }, + { + "epoch": 0.42202930119068816, + "grad_norm": 1.915801763534546, + "learning_rate": 6.391347758357418e-07, + "loss": 0.4876, + "step": 40300 + }, + { + "epoch": 0.42307652030034243, + "grad_norm": 2.128645658493042, + "learning_rate": 6.375217674834578e-07, + "loss": 0.4947, + "step": 40400 + }, + { + "epoch": 0.42412373940999676, + "grad_norm": 2.3809661865234375, + "learning_rate": 6.359072092937702e-07, + "loss": 0.5207, + "step": 40500 + }, + { + "epoch": 0.4251709585196511, + "grad_norm": 2.089869976043701, + "learning_rate": 6.342911194623636e-07, + "loss": 0.5179, + "step": 40600 + }, + { + "epoch": 0.42621817762930536, + "grad_norm": 2.531280040740967, + "learning_rate": 6.326735162021832e-07, + "loss": 0.5003, + "step": 40700 + }, + { + "epoch": 0.4272653967389597, + "grad_norm": 1.5095371007919312, + "learning_rate": 6.310544177432308e-07, + "loss": 0.475, + "step": 40800 + }, + { + "epoch": 0.428312615848614, + "grad_norm": 3.487618923187256, + "learning_rate": 6.294338423323584e-07, + "loss": 0.5382, + "step": 40900 + }, + { + "epoch": 0.4293598349582683, + "grad_norm": 3.1474342346191406, + "learning_rate": 6.27811808233063e-07, + "loss": 0.5147, + "step": 41000 + }, + { + "epoch": 0.4304070540679226, + "grad_norm": 3.5564653873443604, + "learning_rate": 6.261883337252808e-07, + "loss": 0.5062, + "step": 41100 + }, + { + "epoch": 0.43145427317757695, + "grad_norm": 2.47421932220459, + "learning_rate": 6.245634371051808e-07, + "loss": 0.5364, + "step": 41200 + }, + { + "epoch": 0.4325014922872313, + "grad_norm": 1.5858722925186157, + "learning_rate": 6.22937136684959e-07, + "loss": 0.5319, + "step": 41300 + }, + { + "epoch": 0.43354871139688556, + "grad_norm": 2.9193403720855713, + "learning_rate": 6.21309450792632e-07, + "loss": 0.486, + "step": 41400 + }, + { + "epoch": 0.4345959305065399, + "grad_norm": 1.9017012119293213, + "learning_rate": 6.1968039777183e-07, + "loss": 0.5445, + "step": 41500 + }, + { + "epoch": 0.4356431496161942, + "grad_norm": 2.5207788944244385, + "learning_rate": 6.180499959815908e-07, + "loss": 0.5274, + "step": 41600 + }, + { + "epoch": 0.4366903687258485, + "grad_norm": 2.239696979522705, + "learning_rate": 6.164182637961521e-07, + "loss": 0.5056, + "step": 41700 + }, + { + "epoch": 0.4377375878355028, + "grad_norm": 2.565997838973999, + "learning_rate": 6.147852196047455e-07, + "loss": 0.508, + "step": 41800 + }, + { + "epoch": 0.43878480694515715, + "grad_norm": 1.4207922220230103, + "learning_rate": 6.131508818113878e-07, + "loss": 0.4964, + "step": 41900 + }, + { + "epoch": 0.4398320260548115, + "grad_norm": 2.6042516231536865, + "learning_rate": 6.11515268834675e-07, + "loss": 0.5008, + "step": 42000 + }, + { + "epoch": 0.44087924516446575, + "grad_norm": 2.077496290206909, + "learning_rate": 6.098783991075736e-07, + "loss": 0.4964, + "step": 42100 + }, + { + "epoch": 
0.4419264642741201, + "grad_norm": 2.444882392883301, + "learning_rate": 6.082402910772137e-07, + "loss": 0.493, + "step": 42200 + }, + { + "epoch": 0.4429736833837744, + "grad_norm": 3.973526954650879, + "learning_rate": 6.066009632046809e-07, + "loss": 0.5078, + "step": 42300 + }, + { + "epoch": 0.4440209024934287, + "grad_norm": 2.283217430114746, + "learning_rate": 6.049604339648078e-07, + "loss": 0.4756, + "step": 42400 + }, + { + "epoch": 0.445068121603083, + "grad_norm": 1.3749598264694214, + "learning_rate": 6.033187218459665e-07, + "loss": 0.494, + "step": 42500 + }, + { + "epoch": 0.44611534071273734, + "grad_norm": 3.739201068878174, + "learning_rate": 6.016758453498592e-07, + "loss": 0.4977, + "step": 42600 + }, + { + "epoch": 0.4471625598223916, + "grad_norm": 2.5676069259643555, + "learning_rate": 6.00031822991311e-07, + "loss": 0.4691, + "step": 42700 + }, + { + "epoch": 0.44820977893204594, + "grad_norm": 2.269869089126587, + "learning_rate": 5.983866732980607e-07, + "loss": 0.5088, + "step": 42800 + }, + { + "epoch": 0.4492569980417003, + "grad_norm": 1.8404080867767334, + "learning_rate": 5.96740414810551e-07, + "loss": 0.4666, + "step": 42900 + }, + { + "epoch": 0.4503042171513546, + "grad_norm": 2.3597822189331055, + "learning_rate": 5.950930660817214e-07, + "loss": 0.4976, + "step": 43000 + }, + { + "epoch": 0.4513514362610089, + "grad_norm": 1.5849223136901855, + "learning_rate": 5.934446456767977e-07, + "loss": 0.5176, + "step": 43100 + }, + { + "epoch": 0.4523986553706632, + "grad_norm": 1.3389567136764526, + "learning_rate": 5.917951721730834e-07, + "loss": 0.5244, + "step": 43200 + }, + { + "epoch": 0.45344587448031753, + "grad_norm": 2.6399717330932617, + "learning_rate": 5.901446641597498e-07, + "loss": 0.5227, + "step": 43300 + }, + { + "epoch": 0.4544930935899718, + "grad_norm": 2.2782344818115234, + "learning_rate": 5.884931402376274e-07, + "loss": 0.5351, + "step": 43400 + }, + { + "epoch": 0.45554031269962614, + "grad_norm": 4.411149024963379, + "learning_rate": 5.868406190189955e-07, + "loss": 0.4855, + "step": 43500 + }, + { + "epoch": 0.45658753180928047, + "grad_norm": 2.243643045425415, + "learning_rate": 5.851871191273726e-07, + "loss": 0.5299, + "step": 43600 + }, + { + "epoch": 0.4576347509189348, + "grad_norm": 2.678518533706665, + "learning_rate": 5.835326591973068e-07, + "loss": 0.5615, + "step": 43700 + }, + { + "epoch": 0.45868197002858907, + "grad_norm": 2.2850341796875, + "learning_rate": 5.818772578741654e-07, + "loss": 0.5314, + "step": 43800 + }, + { + "epoch": 0.4597291891382434, + "grad_norm": 2.199620246887207, + "learning_rate": 5.802209338139253e-07, + "loss": 0.4905, + "step": 43900 + }, + { + "epoch": 0.46077640824789773, + "grad_norm": 2.532054901123047, + "learning_rate": 5.785637056829619e-07, + "loss": 0.5143, + "step": 44000 + }, + { + "epoch": 0.461823627357552, + "grad_norm": 1.9873905181884766, + "learning_rate": 5.769055921578399e-07, + "loss": 0.5128, + "step": 44100 + }, + { + "epoch": 0.46287084646720633, + "grad_norm": 2.033123254776001, + "learning_rate": 5.752466119251018e-07, + "loss": 0.5027, + "step": 44200 + }, + { + "epoch": 0.46391806557686066, + "grad_norm": 1.890243649482727, + "learning_rate": 5.735867836810575e-07, + "loss": 0.4893, + "step": 44300 + }, + { + "epoch": 0.46496528468651493, + "grad_norm": 2.7789084911346436, + "learning_rate": 5.719261261315742e-07, + "loss": 0.4804, + "step": 44400 + }, + { + "epoch": 0.46601250379616926, + "grad_norm": 2.320241928100586, + "learning_rate": 
5.702646579918651e-07, + "loss": 0.4727, + "step": 44500 + }, + { + "epoch": 0.4670597229058236, + "grad_norm": 2.557783603668213, + "learning_rate": 5.686023979862784e-07, + "loss": 0.4802, + "step": 44600 + }, + { + "epoch": 0.4681069420154779, + "grad_norm": 2.0354034900665283, + "learning_rate": 5.669393648480861e-07, + "loss": 0.4409, + "step": 44700 + }, + { + "epoch": 0.4691541611251322, + "grad_norm": 2.6490516662597656, + "learning_rate": 5.652755773192742e-07, + "loss": 0.5116, + "step": 44800 + }, + { + "epoch": 0.4702013802347865, + "grad_norm": 1.9367735385894775, + "learning_rate": 5.636110541503299e-07, + "loss": 0.51, + "step": 44900 + }, + { + "epoch": 0.47124859934444085, + "grad_norm": 2.3540682792663574, + "learning_rate": 5.619458141000305e-07, + "loss": 0.5053, + "step": 45000 + }, + { + "epoch": 0.47229581845409513, + "grad_norm": 2.308772325515747, + "learning_rate": 5.602798759352328e-07, + "loss": 0.4857, + "step": 45100 + }, + { + "epoch": 0.47334303756374946, + "grad_norm": 2.775662899017334, + "learning_rate": 5.586132584306617e-07, + "loss": 0.5039, + "step": 45200 + }, + { + "epoch": 0.4743902566734038, + "grad_norm": 2.4968132972717285, + "learning_rate": 5.569459803686971e-07, + "loss": 0.5047, + "step": 45300 + }, + { + "epoch": 0.4754374757830581, + "grad_norm": 2.3723912239074707, + "learning_rate": 5.552780605391637e-07, + "loss": 0.5022, + "step": 45400 + }, + { + "epoch": 0.4764846948927124, + "grad_norm": 2.080238103866577, + "learning_rate": 5.53609517739119e-07, + "loss": 0.5139, + "step": 45500 + }, + { + "epoch": 0.4775319140023667, + "grad_norm": 2.763566732406616, + "learning_rate": 5.519403707726409e-07, + "loss": 0.5269, + "step": 45600 + }, + { + "epoch": 0.47857913311202105, + "grad_norm": 2.2503960132598877, + "learning_rate": 5.502706384506162e-07, + "loss": 0.5049, + "step": 45700 + }, + { + "epoch": 0.4796263522216753, + "grad_norm": 2.2146077156066895, + "learning_rate": 5.486003395905284e-07, + "loss": 0.5164, + "step": 45800 + }, + { + "epoch": 0.48067357133132965, + "grad_norm": 2.077916145324707, + "learning_rate": 5.46929493016246e-07, + "loss": 0.5436, + "step": 45900 + }, + { + "epoch": 0.481720790440984, + "grad_norm": 2.990812301635742, + "learning_rate": 5.452581175578099e-07, + "loss": 0.4996, + "step": 46000 + }, + { + "epoch": 0.48276800955063826, + "grad_norm": 2.3420207500457764, + "learning_rate": 5.435862320512216e-07, + "loss": 0.4886, + "step": 46100 + }, + { + "epoch": 0.4838152286602926, + "grad_norm": 2.182870864868164, + "learning_rate": 5.419138553382303e-07, + "loss": 0.5081, + "step": 46200 + }, + { + "epoch": 0.4848624477699469, + "grad_norm": 2.5916247367858887, + "learning_rate": 5.402410062661217e-07, + "loss": 0.4863, + "step": 46300 + }, + { + "epoch": 0.48590966687960124, + "grad_norm": 2.3160765171051025, + "learning_rate": 5.38567703687504e-07, + "loss": 0.55, + "step": 46400 + }, + { + "epoch": 0.4869568859892555, + "grad_norm": 3.3683152198791504, + "learning_rate": 5.368939664600971e-07, + "loss": 0.4838, + "step": 46500 + }, + { + "epoch": 0.48800410509890985, + "grad_norm": 1.8857132196426392, + "learning_rate": 5.352198134465188e-07, + "loss": 0.5053, + "step": 46600 + }, + { + "epoch": 0.4890513242085642, + "grad_norm": 2.4393274784088135, + "learning_rate": 5.335452635140728e-07, + "loss": 0.53, + "step": 46700 + }, + { + "epoch": 0.49009854331821845, + "grad_norm": 2.8095269203186035, + "learning_rate": 5.318703355345361e-07, + "loss": 0.4955, + "step": 46800 + }, + { + "epoch": 
0.4911457624278728, + "grad_norm": 3.766524076461792, + "learning_rate": 5.301950483839461e-07, + "loss": 0.5033, + "step": 46900 + }, + { + "epoch": 0.4921929815375271, + "grad_norm": 3.614816665649414, + "learning_rate": 5.285194209423881e-07, + "loss": 0.516, + "step": 47000 + }, + { + "epoch": 0.49324020064718144, + "grad_norm": 2.2229409217834473, + "learning_rate": 5.268434720937823e-07, + "loss": 0.5158, + "step": 47100 + }, + { + "epoch": 0.4942874197568357, + "grad_norm": 2.4111645221710205, + "learning_rate": 5.251672207256708e-07, + "loss": 0.5265, + "step": 47200 + }, + { + "epoch": 0.49533463886649004, + "grad_norm": 1.9818792343139648, + "learning_rate": 5.234906857290057e-07, + "loss": 0.5059, + "step": 47300 + }, + { + "epoch": 0.49638185797614437, + "grad_norm": 1.8921643495559692, + "learning_rate": 5.218138859979349e-07, + "loss": 0.5281, + "step": 47400 + }, + { + "epoch": 0.49742907708579864, + "grad_norm": 2.3685996532440186, + "learning_rate": 5.201368404295899e-07, + "loss": 0.5257, + "step": 47500 + }, + { + "epoch": 0.498476296195453, + "grad_norm": 3.2099828720092773, + "learning_rate": 5.184595679238732e-07, + "loss": 0.4806, + "step": 47600 + }, + { + "epoch": 0.4995235153051073, + "grad_norm": 2.328226089477539, + "learning_rate": 5.167820873832445e-07, + "loss": 0.5496, + "step": 47700 + }, + { + "epoch": 0.5005707344147616, + "grad_norm": 2.010138988494873, + "learning_rate": 5.151044177125077e-07, + "loss": 0.5025, + "step": 47800 + }, + { + "epoch": 0.501617953524416, + "grad_norm": 2.0107200145721436, + "learning_rate": 5.134265778185984e-07, + "loss": 0.4695, + "step": 47900 + }, + { + "epoch": 0.5026651726340702, + "grad_norm": 3.73002552986145, + "learning_rate": 5.117485866103707e-07, + "loss": 0.5489, + "step": 48000 + }, + { + "epoch": 0.5037123917437245, + "grad_norm": 1.203131914138794, + "learning_rate": 5.100704629983842e-07, + "loss": 0.4918, + "step": 48100 + }, + { + "epoch": 0.5047596108533788, + "grad_norm": 2.464951276779175, + "learning_rate": 5.083922258946899e-07, + "loss": 0.526, + "step": 48200 + }, + { + "epoch": 0.5058068299630332, + "grad_norm": 2.5923502445220947, + "learning_rate": 5.067138942126185e-07, + "loss": 0.5094, + "step": 48300 + }, + { + "epoch": 0.5068540490726875, + "grad_norm": 2.553731918334961, + "learning_rate": 5.050354868665663e-07, + "loss": 0.5116, + "step": 48400 + }, + { + "epoch": 0.5079012681823418, + "grad_norm": 2.171161413192749, + "learning_rate": 5.033570227717823e-07, + "loss": 0.5021, + "step": 48500 + }, + { + "epoch": 0.5089484872919962, + "grad_norm": 1.9675207138061523, + "learning_rate": 5.016785208441553e-07, + "loss": 0.4759, + "step": 48600 + }, + { + "epoch": 0.5099957064016504, + "grad_norm": 2.772975206375122, + "learning_rate": 5e-07, + "loss": 0.504, + "step": 48700 + }, + { + "epoch": 0.5110429255113047, + "grad_norm": 1.8081309795379639, + "learning_rate": 4.983214791558449e-07, + "loss": 0.4884, + "step": 48800 + }, + { + "epoch": 0.512090144620959, + "grad_norm": 2.1011574268341064, + "learning_rate": 4.966429772282177e-07, + "loss": 0.5411, + "step": 48900 + }, + { + "epoch": 0.5131373637306134, + "grad_norm": 1.7532665729522705, + "learning_rate": 4.949645131334338e-07, + "loss": 0.5217, + "step": 49000 + }, + { + "epoch": 0.5141845828402677, + "grad_norm": 1.9248243570327759, + "learning_rate": 4.932861057873817e-07, + "loss": 0.5161, + "step": 49100 + }, + { + "epoch": 0.515231801949922, + "grad_norm": 2.180882692337036, + "learning_rate": 4.916077741053101e-07, + "loss": 
0.4977, + "step": 49200 + }, + { + "epoch": 0.5162790210595763, + "grad_norm": 2.663121223449707, + "learning_rate": 4.899295370016159e-07, + "loss": 0.4918, + "step": 49300 + }, + { + "epoch": 0.5173262401692306, + "grad_norm": 1.928085446357727, + "learning_rate": 4.882514133896293e-07, + "loss": 0.4863, + "step": 49400 + }, + { + "epoch": 0.5183734592788849, + "grad_norm": 2.9963412284851074, + "learning_rate": 4.865734221814016e-07, + "loss": 0.5015, + "step": 49500 + }, + { + "epoch": 0.5194206783885392, + "grad_norm": 2.45681095123291, + "learning_rate": 4.848955822874924e-07, + "loss": 0.5285, + "step": 49600 + }, + { + "epoch": 0.5204678974981936, + "grad_norm": 1.8462231159210205, + "learning_rate": 4.832179126167556e-07, + "loss": 0.467, + "step": 49700 + }, + { + "epoch": 0.5215151166078479, + "grad_norm": 2.27242374420166, + "learning_rate": 4.815404320761267e-07, + "loss": 0.4681, + "step": 49800 + }, + { + "epoch": 0.5225623357175022, + "grad_norm": 2.18723201751709, + "learning_rate": 4.7986315957041e-07, + "loss": 0.5005, + "step": 49900 + }, + { + "epoch": 0.5236095548271564, + "grad_norm": 3.0114426612854004, + "learning_rate": 4.781861140020652e-07, + "loss": 0.4861, + "step": 50000 + }, + { + "epoch": 0.5246567739368108, + "grad_norm": 2.07069730758667, + "learning_rate": 4.765093142709943e-07, + "loss": 0.4648, + "step": 50100 + }, + { + "epoch": 0.5257039930464651, + "grad_norm": 2.2993671894073486, + "learning_rate": 4.7483277927432924e-07, + "loss": 0.4835, + "step": 50200 + }, + { + "epoch": 0.5267512121561194, + "grad_norm": 2.224874258041382, + "learning_rate": 4.731565279062179e-07, + "loss": 0.4642, + "step": 50300 + }, + { + "epoch": 0.5277984312657737, + "grad_norm": 1.7376128435134888, + "learning_rate": 4.7148057905761187e-07, + "loss": 0.4883, + "step": 50400 + }, + { + "epoch": 0.5288456503754281, + "grad_norm": 3.3602840900421143, + "learning_rate": 4.698049516160539e-07, + "loss": 0.4762, + "step": 50500 + }, + { + "epoch": 0.5298928694850824, + "grad_norm": 1.7802869081497192, + "learning_rate": 4.681296644654639e-07, + "loss": 0.5264, + "step": 50600 + }, + { + "epoch": 0.5309400885947366, + "grad_norm": 1.8603919744491577, + "learning_rate": 4.6645473648592716e-07, + "loss": 0.4902, + "step": 50700 + }, + { + "epoch": 0.531987307704391, + "grad_norm": 2.204157590866089, + "learning_rate": 4.647801865534813e-07, + "loss": 0.4835, + "step": 50800 + }, + { + "epoch": 0.5330345268140453, + "grad_norm": 1.2694624662399292, + "learning_rate": 4.63106033539903e-07, + "loss": 0.5238, + "step": 50900 + }, + { + "epoch": 0.5340817459236996, + "grad_norm": 2.0624773502349854, + "learning_rate": 4.6143229631249596e-07, + "loss": 0.5033, + "step": 51000 + }, + { + "epoch": 0.5351289650333539, + "grad_norm": 1.9012243747711182, + "learning_rate": 4.597589937338784e-07, + "loss": 0.5076, + "step": 51100 + }, + { + "epoch": 0.5361761841430083, + "grad_norm": 2.1069536209106445, + "learning_rate": 4.580861446617698e-07, + "loss": 0.5171, + "step": 51200 + }, + { + "epoch": 0.5372234032526626, + "grad_norm": 1.5368138551712036, + "learning_rate": 4.564137679487785e-07, + "loss": 0.4803, + "step": 51300 + }, + { + "epoch": 0.5382706223623168, + "grad_norm": 1.5406559705734253, + "learning_rate": 4.5474188244219006e-07, + "loss": 0.4839, + "step": 51400 + }, + { + "epoch": 0.5393178414719711, + "grad_norm": 1.4071673154830933, + "learning_rate": 4.530705069837542e-07, + "loss": 0.4764, + "step": 51500 + }, + { + "epoch": 0.5403650605816255, + "grad_norm": 
2.699596643447876, + "learning_rate": 4.513996604094716e-07, + "loss": 0.5177, + "step": 51600 + }, + { + "epoch": 0.5414122796912798, + "grad_norm": 1.542262315750122, + "learning_rate": 4.497293615493838e-07, + "loss": 0.508, + "step": 51700 + }, + { + "epoch": 0.5424594988009341, + "grad_norm": 3.0482521057128906, + "learning_rate": 4.480596292273592e-07, + "loss": 0.5303, + "step": 51800 + }, + { + "epoch": 0.5435067179105885, + "grad_norm": 2.214055061340332, + "learning_rate": 4.463904822608809e-07, + "loss": 0.4843, + "step": 51900 + }, + { + "epoch": 0.5445539370202428, + "grad_norm": 2.4003210067749023, + "learning_rate": 4.4472193946083634e-07, + "loss": 0.5024, + "step": 52000 + }, + { + "epoch": 0.545601156129897, + "grad_norm": 2.2942888736724854, + "learning_rate": 4.430540196313031e-07, + "loss": 0.5073, + "step": 52100 + }, + { + "epoch": 0.5466483752395513, + "grad_norm": 2.4813528060913086, + "learning_rate": 4.413867415693383e-07, + "loss": 0.5114, + "step": 52200 + }, + { + "epoch": 0.5476955943492057, + "grad_norm": 1.8171602487564087, + "learning_rate": 4.3972012406476715e-07, + "loss": 0.4714, + "step": 52300 + }, + { + "epoch": 0.54874281345886, + "grad_norm": 2.677717924118042, + "learning_rate": 4.3805418589996967e-07, + "loss": 0.5277, + "step": 52400 + }, + { + "epoch": 0.5497900325685143, + "grad_norm": 2.815244674682617, + "learning_rate": 4.363889458496701e-07, + "loss": 0.4969, + "step": 52500 + }, + { + "epoch": 0.5508372516781687, + "grad_norm": 2.719905376434326, + "learning_rate": 4.347244226807257e-07, + "loss": 0.494, + "step": 52600 + }, + { + "epoch": 0.551884470787823, + "grad_norm": 2.277196168899536, + "learning_rate": 4.3306063515191384e-07, + "loss": 0.4989, + "step": 52700 + }, + { + "epoch": 0.5529316898974772, + "grad_norm": 2.747807741165161, + "learning_rate": 4.3139760201372166e-07, + "loss": 0.475, + "step": 52800 + }, + { + "epoch": 0.5539789090071315, + "grad_norm": 2.1879899501800537, + "learning_rate": 4.29735342008135e-07, + "loss": 0.4727, + "step": 52900 + }, + { + "epoch": 0.5550261281167859, + "grad_norm": 1.5891708135604858, + "learning_rate": 4.280738738684259e-07, + "loss": 0.5209, + "step": 53000 + }, + { + "epoch": 0.5560733472264402, + "grad_norm": 2.6258082389831543, + "learning_rate": 4.2641321631894256e-07, + "loss": 0.5146, + "step": 53100 + }, + { + "epoch": 0.5571205663360945, + "grad_norm": 2.106497287750244, + "learning_rate": 4.2475338807489825e-07, + "loss": 0.5072, + "step": 53200 + }, + { + "epoch": 0.5581677854457489, + "grad_norm": 1.3520596027374268, + "learning_rate": 4.2309440784216014e-07, + "loss": 0.5007, + "step": 53300 + }, + { + "epoch": 0.5592150045554031, + "grad_norm": 2.2585766315460205, + "learning_rate": 4.21436294317038e-07, + "loss": 0.5661, + "step": 53400 + }, + { + "epoch": 0.5602622236650574, + "grad_norm": 2.4655063152313232, + "learning_rate": 4.1977906618607473e-07, + "loss": 0.5057, + "step": 53500 + }, + { + "epoch": 0.5613094427747117, + "grad_norm": 1.7120404243469238, + "learning_rate": 4.181227421258344e-07, + "loss": 0.4762, + "step": 53600 + }, + { + "epoch": 0.5623566618843661, + "grad_norm": 2.365668535232544, + "learning_rate": 4.164673408026932e-07, + "loss": 0.5015, + "step": 53700 + }, + { + "epoch": 0.5634038809940204, + "grad_norm": 2.5297205448150635, + "learning_rate": 4.148128808726274e-07, + "loss": 0.4789, + "step": 53800 + }, + { + "epoch": 0.5644511001036747, + "grad_norm": 2.997265577316284, + "learning_rate": 4.131593809810044e-07, + "loss": 0.4841, + "step": 
53900 + }, + { + "epoch": 0.565498319213329, + "grad_norm": 2.2408447265625, + "learning_rate": 4.1150685976237253e-07, + "loss": 0.5194, + "step": 54000 + }, + { + "epoch": 0.5665455383229833, + "grad_norm": 1.8267594575881958, + "learning_rate": 4.098553358402503e-07, + "loss": 0.4978, + "step": 54100 + }, + { + "epoch": 0.5675927574326376, + "grad_norm": 3.2854866981506348, + "learning_rate": 4.0820482782691666e-07, + "loss": 0.499, + "step": 54200 + }, + { + "epoch": 0.5686399765422919, + "grad_norm": 2.401383638381958, + "learning_rate": 4.0655535432320225e-07, + "loss": 0.539, + "step": 54300 + }, + { + "epoch": 0.5696871956519463, + "grad_norm": 2.3308005332946777, + "learning_rate": 4.0490693391827867e-07, + "loss": 0.527, + "step": 54400 + }, + { + "epoch": 0.5707344147616006, + "grad_norm": 2.6808366775512695, + "learning_rate": 4.0325958518944893e-07, + "loss": 0.4965, + "step": 54500 + }, + { + "epoch": 0.5717816338712549, + "grad_norm": 2.82200026512146, + "learning_rate": 4.016133267019394e-07, + "loss": 0.5051, + "step": 54600 + }, + { + "epoch": 0.5728288529809092, + "grad_norm": 3.023541212081909, + "learning_rate": 3.99968177008689e-07, + "loss": 0.4623, + "step": 54700 + }, + { + "epoch": 0.5738760720905635, + "grad_norm": 2.405120372772217, + "learning_rate": 3.983241546501408e-07, + "loss": 0.5096, + "step": 54800 + }, + { + "epoch": 0.5749232912002178, + "grad_norm": 1.9728878736495972, + "learning_rate": 3.9668127815403353e-07, + "loss": 0.5405, + "step": 54900 + }, + { + "epoch": 0.5759705103098721, + "grad_norm": 3.312455415725708, + "learning_rate": 3.950395660351922e-07, + "loss": 0.5245, + "step": 55000 + }, + { + "epoch": 0.5770177294195264, + "grad_norm": 1.9875174760818481, + "learning_rate": 3.93399036795319e-07, + "loss": 0.4863, + "step": 55100 + }, + { + "epoch": 0.5780649485291808, + "grad_norm": 2.295588731765747, + "learning_rate": 3.917597089227863e-07, + "loss": 0.4868, + "step": 55200 + }, + { + "epoch": 0.5791121676388351, + "grad_norm": 2.505709409713745, + "learning_rate": 3.901216008924265e-07, + "loss": 0.4955, + "step": 55300 + }, + { + "epoch": 0.5801593867484894, + "grad_norm": 2.177341938018799, + "learning_rate": 3.88484731165325e-07, + "loss": 0.5103, + "step": 55400 + }, + { + "epoch": 0.5812066058581437, + "grad_norm": 1.426915168762207, + "learning_rate": 3.868491181886122e-07, + "loss": 0.5235, + "step": 55500 + }, + { + "epoch": 0.582253824967798, + "grad_norm": 2.258373498916626, + "learning_rate": 3.852147803952545e-07, + "loss": 0.4983, + "step": 55600 + }, + { + "epoch": 0.5833010440774523, + "grad_norm": 2.660693645477295, + "learning_rate": 3.835817362038477e-07, + "loss": 0.5127, + "step": 55700 + }, + { + "epoch": 0.5843482631871066, + "grad_norm": 2.2097291946411133, + "learning_rate": 3.8195000401840927e-07, + "loss": 0.5034, + "step": 55800 + }, + { + "epoch": 0.585395482296761, + "grad_norm": 2.2298669815063477, + "learning_rate": 3.803196022281701e-07, + "loss": 0.4971, + "step": 55900 + }, + { + "epoch": 0.5864427014064153, + "grad_norm": 2.1946804523468018, + "learning_rate": 3.78690549207368e-07, + "loss": 0.4942, + "step": 56000 + }, + { + "epoch": 0.5874899205160696, + "grad_norm": 3.2329068183898926, + "learning_rate": 3.77062863315041e-07, + "loss": 0.513, + "step": 56100 + }, + { + "epoch": 0.5885371396257238, + "grad_norm": 1.839722752571106, + "learning_rate": 3.7543656289481927e-07, + "loss": 0.5546, + "step": 56200 + }, + { + "epoch": 0.5895843587353782, + "grad_norm": 2.5834665298461914, + "learning_rate": 
3.7381166627471914e-07, + "loss": 0.4821, + "step": 56300 + }, + { + "epoch": 0.5906315778450325, + "grad_norm": 2.00166916847229, + "learning_rate": 3.7218819176693693e-07, + "loss": 0.5187, + "step": 56400 + }, + { + "epoch": 0.5916787969546868, + "grad_norm": 3.0043110847473145, + "learning_rate": 3.7056615766764174e-07, + "loss": 0.5227, + "step": 56500 + }, + { + "epoch": 0.5927260160643412, + "grad_norm": 1.637872576713562, + "learning_rate": 3.6894558225676924e-07, + "loss": 0.4611, + "step": 56600 + }, + { + "epoch": 0.5937732351739955, + "grad_norm": 2.64483904838562, + "learning_rate": 3.6732648379781683e-07, + "loss": 0.4792, + "step": 56700 + }, + { + "epoch": 0.5948204542836497, + "grad_norm": 1.7451013326644897, + "learning_rate": 3.657088805376366e-07, + "loss": 0.5322, + "step": 56800 + }, + { + "epoch": 0.595867673393304, + "grad_norm": 2.465116500854492, + "learning_rate": 3.640927907062297e-07, + "loss": 0.4657, + "step": 56900 + }, + { + "epoch": 0.5969148925029584, + "grad_norm": 3.788491725921631, + "learning_rate": 3.624782325165421e-07, + "loss": 0.4855, + "step": 57000 + }, + { + "epoch": 0.5979621116126127, + "grad_norm": 2.519657850265503, + "learning_rate": 3.6086522416425823e-07, + "loss": 0.5125, + "step": 57100 + }, + { + "epoch": 0.599009330722267, + "grad_norm": 1.8677030801773071, + "learning_rate": 3.5925378382759577e-07, + "loss": 0.498, + "step": 57200 + }, + { + "epoch": 0.6000565498319214, + "grad_norm": 1.9577298164367676, + "learning_rate": 3.57643929667102e-07, + "loss": 0.4792, + "step": 57300 + }, + { + "epoch": 0.6011037689415757, + "grad_norm": 2.364872932434082, + "learning_rate": 3.560356798254477e-07, + "loss": 0.4882, + "step": 57400 + }, + { + "epoch": 0.6021509880512299, + "grad_norm": 2.4925103187561035, + "learning_rate": 3.5442905242722365e-07, + "loss": 0.4825, + "step": 57500 + }, + { + "epoch": 0.6031982071608842, + "grad_norm": 2.7740890979766846, + "learning_rate": 3.5282406557873635e-07, + "loss": 0.5345, + "step": 57600 + }, + { + "epoch": 0.6042454262705386, + "grad_norm": 1.0781739950180054, + "learning_rate": 3.512207373678032e-07, + "loss": 0.4665, + "step": 57700 + }, + { + "epoch": 0.6052926453801929, + "grad_norm": 2.9016547203063965, + "learning_rate": 3.496190858635494e-07, + "loss": 0.4655, + "step": 57800 + }, + { + "epoch": 0.6063398644898472, + "grad_norm": 0.917265772819519, + "learning_rate": 3.480191291162041e-07, + "loss": 0.4707, + "step": 57900 + }, + { + "epoch": 0.6073870835995016, + "grad_norm": 1.5372905731201172, + "learning_rate": 3.4642088515689695e-07, + "loss": 0.4867, + "step": 58000 + }, + { + "epoch": 0.6084343027091559, + "grad_norm": 1.8536443710327148, + "learning_rate": 3.4482437199745463e-07, + "loss": 0.4746, + "step": 58100 + }, + { + "epoch": 0.6094815218188101, + "grad_norm": 2.8087878227233887, + "learning_rate": 3.432296076301986e-07, + "loss": 0.5529, + "step": 58200 + }, + { + "epoch": 0.6105287409284644, + "grad_norm": 1.8362385034561157, + "learning_rate": 3.416366100277414e-07, + "loss": 0.4911, + "step": 58300 + }, + { + "epoch": 0.6115759600381188, + "grad_norm": 1.9666386842727661, + "learning_rate": 3.4004539714278457e-07, + "loss": 0.4902, + "step": 58400 + }, + { + "epoch": 0.6126231791477731, + "grad_norm": 1.745953917503357, + "learning_rate": 3.3845598690791675e-07, + "loss": 0.5204, + "step": 58500 + }, + { + "epoch": 0.6136703982574274, + "grad_norm": 1.9354580640792847, + "learning_rate": 3.368683972354108e-07, + "loss": 0.4763, + "step": 58600 + }, + { + "epoch": 
0.6147176173670817, + "grad_norm": 2.232057809829712, + "learning_rate": 3.3528264601702217e-07, + "loss": 0.5116, + "step": 58700 + }, + { + "epoch": 0.6157648364767361, + "grad_norm": 2.1513118743896484, + "learning_rate": 3.336987511237877e-07, + "loss": 0.539, + "step": 58800 + }, + { + "epoch": 0.6168120555863903, + "grad_norm": 1.7164148092269897, + "learning_rate": 3.321167304058238e-07, + "loss": 0.4912, + "step": 58900 + }, + { + "epoch": 0.6178592746960446, + "grad_norm": 2.390707015991211, + "learning_rate": 3.305366016921249e-07, + "loss": 0.5207, + "step": 59000 + }, + { + "epoch": 0.618906493805699, + "grad_norm": 1.944360613822937, + "learning_rate": 3.289583827903639e-07, + "loss": 0.4786, + "step": 59100 + }, + { + "epoch": 0.6199537129153533, + "grad_norm": 3.611234426498413, + "learning_rate": 3.2738209148668996e-07, + "loss": 0.5597, + "step": 59200 + }, + { + "epoch": 0.6210009320250076, + "grad_norm": 2.125988245010376, + "learning_rate": 3.2580774554552834e-07, + "loss": 0.5064, + "step": 59300 + }, + { + "epoch": 0.6220481511346619, + "grad_norm": 2.2751822471618652, + "learning_rate": 3.242353627093817e-07, + "loss": 0.4839, + "step": 59400 + }, + { + "epoch": 0.6230953702443163, + "grad_norm": 2.4632444381713867, + "learning_rate": 3.226649606986277e-07, + "loss": 0.5085, + "step": 59500 + }, + { + "epoch": 0.6241425893539705, + "grad_norm": 2.596140146255493, + "learning_rate": 3.210965572113211e-07, + "loss": 0.4834, + "step": 59600 + }, + { + "epoch": 0.6251898084636248, + "grad_norm": 3.1402766704559326, + "learning_rate": 3.195301699229943e-07, + "loss": 0.4894, + "step": 59700 + }, + { + "epoch": 0.6262370275732791, + "grad_norm": 1.3100465536117554, + "learning_rate": 3.179658164864567e-07, + "loss": 0.5371, + "step": 59800 + }, + { + "epoch": 0.6272842466829335, + "grad_norm": 2.2746660709381104, + "learning_rate": 3.164035145315971e-07, + "loss": 0.4865, + "step": 59900 + }, + { + "epoch": 0.6283314657925878, + "grad_norm": 2.2843546867370605, + "learning_rate": 3.14843281665185e-07, + "loss": 0.4958, + "step": 60000 + }, + { + "epoch": 0.6293786849022421, + "grad_norm": 2.045327663421631, + "learning_rate": 3.132851354706709e-07, + "loss": 0.4747, + "step": 60100 + }, + { + "epoch": 0.6304259040118964, + "grad_norm": 2.59464430809021, + "learning_rate": 3.117290935079895e-07, + "loss": 0.4927, + "step": 60200 + }, + { + "epoch": 0.6314731231215507, + "grad_norm": 1.8439029455184937, + "learning_rate": 3.1017517331336175e-07, + "loss": 0.4829, + "step": 60300 + }, + { + "epoch": 0.632520342231205, + "grad_norm": 2.155336618423462, + "learning_rate": 3.0862339239909587e-07, + "loss": 0.4764, + "step": 60400 + }, + { + "epoch": 0.6335675613408593, + "grad_norm": 2.2298882007598877, + "learning_rate": 3.070737682533913e-07, + "loss": 0.5267, + "step": 60500 + }, + { + "epoch": 0.6346147804505137, + "grad_norm": 1.9075183868408203, + "learning_rate": 3.0552631834014153e-07, + "loss": 0.5101, + "step": 60600 + }, + { + "epoch": 0.635661999560168, + "grad_norm": 2.1493678092956543, + "learning_rate": 3.039810600987367e-07, + "loss": 0.455, + "step": 60700 + }, + { + "epoch": 0.6367092186698223, + "grad_norm": 1.9552183151245117, + "learning_rate": 3.024380109438669e-07, + "loss": 0.511, + "step": 60800 + }, + { + "epoch": 0.6377564377794765, + "grad_norm": 2.0828135013580322, + "learning_rate": 3.0089718826532727e-07, + "loss": 0.4816, + "step": 60900 + }, + { + "epoch": 0.6388036568891309, + "grad_norm": 1.6887547969818115, + "learning_rate": 
2.9935860942782055e-07, + "loss": 0.4874, + "step": 61000 + }, + { + "epoch": 0.6398508759987852, + "grad_norm": 1.987060785293579, + "learning_rate": 2.978222917707616e-07, + "loss": 0.5237, + "step": 61100 + }, + { + "epoch": 0.6408980951084395, + "grad_norm": 1.8471943140029907, + "learning_rate": 2.9628825260808313e-07, + "loss": 0.4864, + "step": 61200 + }, + { + "epoch": 0.6419453142180939, + "grad_norm": 2.424875497817993, + "learning_rate": 2.9475650922803907e-07, + "loss": 0.4865, + "step": 61300 + }, + { + "epoch": 0.6429925333277482, + "grad_norm": 1.9071121215820312, + "learning_rate": 2.9322707889301066e-07, + "loss": 0.5097, + "step": 61400 + }, + { + "epoch": 0.6440397524374025, + "grad_norm": 1.9200624227523804, + "learning_rate": 2.9169997883931205e-07, + "loss": 0.4865, + "step": 61500 + }, + { + "epoch": 0.6450869715470567, + "grad_norm": 1.8281010389328003, + "learning_rate": 2.90175226276995e-07, + "loss": 0.4923, + "step": 61600 + }, + { + "epoch": 0.6461341906567111, + "grad_norm": 2.7019853591918945, + "learning_rate": 2.886528383896559e-07, + "loss": 0.4702, + "step": 61700 + }, + { + "epoch": 0.6471814097663654, + "grad_norm": 1.542846918106079, + "learning_rate": 2.87132832334242e-07, + "loss": 0.5025, + "step": 61800 + }, + { + "epoch": 0.6482286288760197, + "grad_norm": 3.2872512340545654, + "learning_rate": 2.856152252408578e-07, + "loss": 0.4896, + "step": 61900 + }, + { + "epoch": 0.6492758479856741, + "grad_norm": 3.8048501014709473, + "learning_rate": 2.841000342125719e-07, + "loss": 0.4723, + "step": 62000 + }, + { + "epoch": 0.6503230670953284, + "grad_norm": 2.0907108783721924, + "learning_rate": 2.825872763252245e-07, + "loss": 0.5326, + "step": 62100 + }, + { + "epoch": 0.6513702862049827, + "grad_norm": 2.4722342491149902, + "learning_rate": 2.81076968627235e-07, + "loss": 0.4774, + "step": 62200 + }, + { + "epoch": 0.6524175053146369, + "grad_norm": 2.449239492416382, + "learning_rate": 2.7956912813940947e-07, + "loss": 0.47, + "step": 62300 + }, + { + "epoch": 0.6534647244242913, + "grad_norm": 2.0104002952575684, + "learning_rate": 2.7806377185474953e-07, + "loss": 0.5017, + "step": 62400 + }, + { + "epoch": 0.6545119435339456, + "grad_norm": 2.3968191146850586, + "learning_rate": 2.765609167382602e-07, + "loss": 0.489, + "step": 62500 + }, + { + "epoch": 0.6555591626435999, + "grad_norm": 2.0325634479522705, + "learning_rate": 2.750605797267587e-07, + "loss": 0.5153, + "step": 62600 + }, + { + "epoch": 0.6566063817532543, + "grad_norm": 2.9563980102539062, + "learning_rate": 2.7356277772868427e-07, + "loss": 0.5121, + "step": 62700 + }, + { + "epoch": 0.6576536008629086, + "grad_norm": 1.5260460376739502, + "learning_rate": 2.7206752762390684e-07, + "loss": 0.5009, + "step": 62800 + }, + { + "epoch": 0.6587008199725629, + "grad_norm": 2.651346206665039, + "learning_rate": 2.7057484626353717e-07, + "loss": 0.4819, + "step": 62900 + }, + { + "epoch": 0.6597480390822171, + "grad_norm": 2.392993927001953, + "learning_rate": 2.69084750469737e-07, + "loss": 0.4924, + "step": 63000 + }, + { + "epoch": 0.6607952581918715, + "grad_norm": 2.065648078918457, + "learning_rate": 2.6759725703552916e-07, + "loss": 0.4576, + "step": 63100 + }, + { + "epoch": 0.6618424773015258, + "grad_norm": 1.6166179180145264, + "learning_rate": 2.661123827246088e-07, + "loss": 0.5187, + "step": 63200 + }, + { + "epoch": 0.6628896964111801, + "grad_norm": 2.0667145252227783, + "learning_rate": 2.646301442711538e-07, + "loss": 0.4963, + "step": 63300 + }, + { + "epoch": 
0.6639369155208344, + "grad_norm": 3.5013437271118164, + "learning_rate": 2.6315055837963687e-07, + "loss": 0.5027, + "step": 63400 + }, + { + "epoch": 0.6649841346304888, + "grad_norm": 0.9413002133369446, + "learning_rate": 2.616736417246368e-07, + "loss": 0.4712, + "step": 63500 + }, + { + "epoch": 0.666031353740143, + "grad_norm": 1.4072952270507812, + "learning_rate": 2.601994109506508e-07, + "loss": 0.4731, + "step": 63600 + }, + { + "epoch": 0.6670785728497973, + "grad_norm": 2.4212138652801514, + "learning_rate": 2.587278826719069e-07, + "loss": 0.4828, + "step": 63700 + }, + { + "epoch": 0.6681257919594517, + "grad_norm": 1.7635606527328491, + "learning_rate": 2.5725907347217655e-07, + "loss": 0.4863, + "step": 63800 + }, + { + "epoch": 0.669173011069106, + "grad_norm": 2.0671000480651855, + "learning_rate": 2.5579299990458785e-07, + "loss": 0.4636, + "step": 63900 + }, + { + "epoch": 0.6702202301787603, + "grad_norm": 2.378913402557373, + "learning_rate": 2.5432967849143906e-07, + "loss": 0.4766, + "step": 64000 + }, + { + "epoch": 0.6712674492884146, + "grad_norm": 3.7450199127197266, + "learning_rate": 2.528691257240122e-07, + "loss": 0.5137, + "step": 64100 + }, + { + "epoch": 0.672314668398069, + "grad_norm": 2.676037073135376, + "learning_rate": 2.514113580623873e-07, + "loss": 0.4933, + "step": 64200 + }, + { + "epoch": 0.6733618875077232, + "grad_norm": 1.6275851726531982, + "learning_rate": 2.499563919352572e-07, + "loss": 0.5038, + "step": 64300 + }, + { + "epoch": 0.6744091066173775, + "grad_norm": 2.475569009780884, + "learning_rate": 2.485042437397418e-07, + "loss": 0.4518, + "step": 64400 + }, + { + "epoch": 0.6754563257270318, + "grad_norm": 3.2226366996765137, + "learning_rate": 2.470549298412036e-07, + "loss": 0.4634, + "step": 64500 + }, + { + "epoch": 0.6765035448366862, + "grad_norm": 2.9092655181884766, + "learning_rate": 2.456084665730634e-07, + "loss": 0.4851, + "step": 64600 + }, + { + "epoch": 0.6775507639463405, + "grad_norm": 1.9740290641784668, + "learning_rate": 2.441648702366161e-07, + "loss": 0.489, + "step": 64700 + }, + { + "epoch": 0.6785979830559948, + "grad_norm": 2.2705118656158447, + "learning_rate": 2.42724157100847e-07, + "loss": 0.4918, + "step": 64800 + }, + { + "epoch": 0.6796452021656492, + "grad_norm": 2.0279767513275146, + "learning_rate": 2.4128634340224767e-07, + "loss": 0.5309, + "step": 64900 + }, + { + "epoch": 0.6806924212753034, + "grad_norm": 2.4952125549316406, + "learning_rate": 2.3985144534463507e-07, + "loss": 0.5253, + "step": 65000 + }, + { + "epoch": 0.6817396403849577, + "grad_norm": 1.7526471614837646, + "learning_rate": 2.3841947909896675e-07, + "loss": 0.4919, + "step": 65100 + }, + { + "epoch": 0.682786859494612, + "grad_norm": 2.78068208694458, + "learning_rate": 2.369904608031591e-07, + "loss": 0.4678, + "step": 65200 + }, + { + "epoch": 0.6838340786042664, + "grad_norm": 1.9609248638153076, + "learning_rate": 2.3556440656190675e-07, + "loss": 0.5004, + "step": 65300 + }, + { + "epoch": 0.6848812977139207, + "grad_norm": 1.8966784477233887, + "learning_rate": 2.3414133244649965e-07, + "loss": 0.4609, + "step": 65400 + }, + { + "epoch": 0.685928516823575, + "grad_norm": 1.7883254289627075, + "learning_rate": 2.3272125449464197e-07, + "loss": 0.5053, + "step": 65500 + }, + { + "epoch": 0.6869757359332294, + "grad_norm": 2.0737862586975098, + "learning_rate": 2.3130418871027285e-07, + "loss": 0.5126, + "step": 65600 + }, + { + "epoch": 0.6880229550428836, + "grad_norm": 2.2858548164367676, + "learning_rate": 
2.2989015106338456e-07, + "loss": 0.4954, + "step": 65700 + }, + { + "epoch": 0.6890701741525379, + "grad_norm": 2.121546506881714, + "learning_rate": 2.284791574898423e-07, + "loss": 0.5017, + "step": 65800 + }, + { + "epoch": 0.6901173932621922, + "grad_norm": 1.6191834211349487, + "learning_rate": 2.270712238912067e-07, + "loss": 0.4721, + "step": 65900 + }, + { + "epoch": 0.6911646123718466, + "grad_norm": 2.482290506362915, + "learning_rate": 2.2566636613455185e-07, + "loss": 0.5003, + "step": 66000 + }, + { + "epoch": 0.6922118314815009, + "grad_norm": 2.413865089416504, + "learning_rate": 2.242646000522885e-07, + "loss": 0.4864, + "step": 66100 + }, + { + "epoch": 0.6932590505911552, + "grad_norm": 2.390326738357544, + "learning_rate": 2.228659414419853e-07, + "loss": 0.5155, + "step": 66200 + }, + { + "epoch": 0.6943062697008096, + "grad_norm": 2.158834457397461, + "learning_rate": 2.2147040606618956e-07, + "loss": 0.4972, + "step": 66300 + }, + { + "epoch": 0.6953534888104638, + "grad_norm": 2.767620086669922, + "learning_rate": 2.2007800965225087e-07, + "loss": 0.4651, + "step": 66400 + }, + { + "epoch": 0.6964007079201181, + "grad_norm": 3.050821542739868, + "learning_rate": 2.1868876789214418e-07, + "loss": 0.5146, + "step": 66500 + }, + { + "epoch": 0.6974479270297724, + "grad_norm": 2.7702839374542236, + "learning_rate": 2.1730269644229104e-07, + "loss": 0.5143, + "step": 66600 + }, + { + "epoch": 0.6984951461394268, + "grad_norm": 2.543748140335083, + "learning_rate": 2.159198109233849e-07, + "loss": 0.5028, + "step": 66700 + }, + { + "epoch": 0.6995423652490811, + "grad_norm": 3.739572048187256, + "learning_rate": 2.1454012692021505e-07, + "loss": 0.5471, + "step": 66800 + }, + { + "epoch": 0.7005895843587354, + "grad_norm": 2.372471809387207, + "learning_rate": 2.131636599814896e-07, + "loss": 0.4978, + "step": 66900 + }, + { + "epoch": 0.7016368034683896, + "grad_norm": 2.276508092880249, + "learning_rate": 2.1179042561966154e-07, + "loss": 0.5153, + "step": 67000 + }, + { + "epoch": 0.702684022578044, + "grad_norm": 2.0715689659118652, + "learning_rate": 2.1042043931075342e-07, + "loss": 0.5127, + "step": 67100 + }, + { + "epoch": 0.7037312416876983, + "grad_norm": 1.9307739734649658, + "learning_rate": 2.0905371649418318e-07, + "loss": 0.4746, + "step": 67200 + }, + { + "epoch": 0.7047784607973526, + "grad_norm": 2.039501905441284, + "learning_rate": 2.076902725725897e-07, + "loss": 0.4952, + "step": 67300 + }, + { + "epoch": 0.705825679907007, + "grad_norm": 2.397334575653076, + "learning_rate": 2.063301229116597e-07, + "loss": 0.4728, + "step": 67400 + }, + { + "epoch": 0.7068728990166613, + "grad_norm": 3.5085904598236084, + "learning_rate": 2.0497328283995425e-07, + "loss": 0.5176, + "step": 67500 + }, + { + "epoch": 0.7079201181263156, + "grad_norm": 2.772425651550293, + "learning_rate": 2.0361976764873623e-07, + "loss": 0.5159, + "step": 67600 + }, + { + "epoch": 0.7089673372359698, + "grad_norm": 1.3938500881195068, + "learning_rate": 2.0226959259179794e-07, + "loss": 0.4949, + "step": 67700 + }, + { + "epoch": 0.7100145563456242, + "grad_norm": 2.1697475910186768, + "learning_rate": 2.0092277288528898e-07, + "loss": 0.466, + "step": 67800 + }, + { + "epoch": 0.7110617754552785, + "grad_norm": 1.512786865234375, + "learning_rate": 1.995793237075452e-07, + "loss": 0.5185, + "step": 67900 + }, + { + "epoch": 0.7121089945649328, + "grad_norm": 1.7060164213180542, + "learning_rate": 1.9823926019891724e-07, + "loss": 0.4649, + "step": 68000 + }, + { + "epoch": 
0.7131562136745871, + "grad_norm": 2.2003238201141357, + "learning_rate": 1.9690259746160005e-07, + "loss": 0.4921, + "step": 68100 + }, + { + "epoch": 0.7142034327842415, + "grad_norm": 2.538870096206665, + "learning_rate": 1.9556935055946277e-07, + "loss": 0.5164, + "step": 68200 + }, + { + "epoch": 0.7152506518938958, + "grad_norm": 3.6677184104919434, + "learning_rate": 1.9423953451787888e-07, + "loss": 0.5299, + "step": 68300 + }, + { + "epoch": 0.71629787100355, + "grad_norm": 1.810766339302063, + "learning_rate": 1.929131643235569e-07, + "loss": 0.4917, + "step": 68400 + }, + { + "epoch": 0.7173450901132044, + "grad_norm": 1.973241925239563, + "learning_rate": 1.9159025492437143e-07, + "loss": 0.4827, + "step": 68500 + }, + { + "epoch": 0.7183923092228587, + "grad_norm": 2.1515488624572754, + "learning_rate": 1.9027082122919474e-07, + "loss": 0.4748, + "step": 68600 + }, + { + "epoch": 0.719439528332513, + "grad_norm": 1.521958827972412, + "learning_rate": 1.8895487810772882e-07, + "loss": 0.5087, + "step": 68700 + }, + { + "epoch": 0.7204867474421673, + "grad_norm": 2.1833043098449707, + "learning_rate": 1.876424403903376e-07, + "loss": 0.4784, + "step": 68800 + }, + { + "epoch": 0.7215339665518217, + "grad_norm": 2.8621373176574707, + "learning_rate": 1.8633352286788011e-07, + "loss": 0.5077, + "step": 68900 + }, + { + "epoch": 0.722581185661476, + "grad_norm": 1.9079474210739136, + "learning_rate": 1.8502814029154367e-07, + "loss": 0.5052, + "step": 69000 + }, + { + "epoch": 0.7236284047711302, + "grad_norm": 2.184054374694824, + "learning_rate": 1.837263073726769e-07, + "loss": 0.5109, + "step": 69100 + }, + { + "epoch": 0.7246756238807845, + "grad_norm": 2.0883328914642334, + "learning_rate": 1.824280387826258e-07, + "loss": 0.4888, + "step": 69200 + }, + { + "epoch": 0.7257228429904389, + "grad_norm": 2.368727207183838, + "learning_rate": 1.8113334915256663e-07, + "loss": 0.4963, + "step": 69300 + }, + { + "epoch": 0.7267700621000932, + "grad_norm": 2.7945289611816406, + "learning_rate": 1.7984225307334106e-07, + "loss": 0.4927, + "step": 69400 + }, + { + "epoch": 0.7278172812097475, + "grad_norm": 1.937376856803894, + "learning_rate": 1.7855476509529337e-07, + "loss": 0.4741, + "step": 69500 + }, + { + "epoch": 0.7288645003194019, + "grad_norm": 3.4460761547088623, + "learning_rate": 1.7727089972810505e-07, + "loss": 0.569, + "step": 69600 + }, + { + "epoch": 0.7299117194290562, + "grad_norm": 3.9340882301330566, + "learning_rate": 1.7599067144063086e-07, + "loss": 0.5028, + "step": 69700 + }, + { + "epoch": 0.7309589385387104, + "grad_norm": 3.2756307125091553, + "learning_rate": 1.7471409466073772e-07, + "loss": 0.5238, + "step": 69800 + }, + { + "epoch": 0.7320061576483647, + "grad_norm": 2.0363681316375732, + "learning_rate": 1.7344118377514044e-07, + "loss": 0.5528, + "step": 69900 + }, + { + "epoch": 0.7330533767580191, + "grad_norm": 2.6508500576019287, + "learning_rate": 1.7217195312923944e-07, + "loss": 0.4733, + "step": 70000 + }, + { + "epoch": 0.7341005958676734, + "grad_norm": 1.832088828086853, + "learning_rate": 1.7090641702696102e-07, + "loss": 0.4909, + "step": 70100 + }, + { + "epoch": 0.7351478149773277, + "grad_norm": 2.644780158996582, + "learning_rate": 1.6964458973059358e-07, + "loss": 0.4928, + "step": 70200 + }, + { + "epoch": 0.7361950340869821, + "grad_norm": 2.407883644104004, + "learning_rate": 1.683864854606289e-07, + "loss": 0.4497, + "step": 70300 + }, + { + "epoch": 0.7372422531966363, + "grad_norm": 2.3634557723999023, + "learning_rate": 
1.6713211839560125e-07, + "loss": 0.4738, + "step": 70400 + }, + { + "epoch": 0.7382894723062906, + "grad_norm": 2.401092052459717, + "learning_rate": 1.658815026719269e-07, + "loss": 0.5084, + "step": 70500 + }, + { + "epoch": 0.7393366914159449, + "grad_norm": 2.105447292327881, + "learning_rate": 1.6463465238374568e-07, + "loss": 0.4681, + "step": 70600 + }, + { + "epoch": 0.7403839105255993, + "grad_norm": 2.5298540592193604, + "learning_rate": 1.633915815827623e-07, + "loss": 0.5149, + "step": 70700 + }, + { + "epoch": 0.7414311296352536, + "grad_norm": 2.3362057209014893, + "learning_rate": 1.621523042780868e-07, + "loss": 0.5225, + "step": 70800 + }, + { + "epoch": 0.7424783487449079, + "grad_norm": 3.7627904415130615, + "learning_rate": 1.6091683443607767e-07, + "loss": 0.4967, + "step": 70900 + }, + { + "epoch": 0.7435255678545623, + "grad_norm": 2.4007790088653564, + "learning_rate": 1.5968518598018483e-07, + "loss": 0.4878, + "step": 71000 + }, + { + "epoch": 0.7445727869642165, + "grad_norm": 2.1650781631469727, + "learning_rate": 1.5845737279079118e-07, + "loss": 0.502, + "step": 71100 + }, + { + "epoch": 0.7456200060738708, + "grad_norm": 1.9574668407440186, + "learning_rate": 1.5723340870505753e-07, + "loss": 0.4843, + "step": 71200 + }, + { + "epoch": 0.7466672251835251, + "grad_norm": 2.2389516830444336, + "learning_rate": 1.5601330751676624e-07, + "loss": 0.519, + "step": 71300 + }, + { + "epoch": 0.7477144442931795, + "grad_norm": 1.7965580224990845, + "learning_rate": 1.5479708297616567e-07, + "loss": 0.4676, + "step": 71400 + }, + { + "epoch": 0.7487616634028338, + "grad_norm": 2.057460069656372, + "learning_rate": 1.5358474878981526e-07, + "loss": 0.5106, + "step": 71500 + }, + { + "epoch": 0.7498088825124881, + "grad_norm": 2.1372034549713135, + "learning_rate": 1.5237631862043115e-07, + "loss": 0.4786, + "step": 71600 + }, + { + "epoch": 0.7508561016221424, + "grad_norm": 2.0700478553771973, + "learning_rate": 1.5117180608673203e-07, + "loss": 0.4855, + "step": 71700 + }, + { + "epoch": 0.7519033207317967, + "grad_norm": 1.7832368612289429, + "learning_rate": 1.4997122476328593e-07, + "loss": 0.5188, + "step": 71800 + }, + { + "epoch": 0.752950539841451, + "grad_norm": 3.6390135288238525, + "learning_rate": 1.4877458818035705e-07, + "loss": 0.5304, + "step": 71900 + }, + { + "epoch": 0.7539977589511053, + "grad_norm": 3.022871732711792, + "learning_rate": 1.4758190982375295e-07, + "loss": 0.4648, + "step": 72000 + }, + { + "epoch": 0.7550449780607597, + "grad_norm": 1.6055036783218384, + "learning_rate": 1.463932031346739e-07, + "loss": 0.5118, + "step": 72100 + }, + { + "epoch": 0.756092197170414, + "grad_norm": 4.166171550750732, + "learning_rate": 1.4520848150955912e-07, + "loss": 0.4986, + "step": 72200 + }, + { + "epoch": 0.7571394162800683, + "grad_norm": 3.3419265747070312, + "learning_rate": 1.44027758299938e-07, + "loss": 0.5049, + "step": 72300 + }, + { + "epoch": 0.7581866353897226, + "grad_norm": 3.171034336090088, + "learning_rate": 1.4285104681227854e-07, + "loss": 0.5091, + "step": 72400 + }, + { + "epoch": 0.7592338544993769, + "grad_norm": 2.6404178142547607, + "learning_rate": 1.4167836030783752e-07, + "loss": 0.5208, + "step": 72500 + }, + { + "epoch": 0.7602810736090312, + "grad_norm": 2.8442752361297607, + "learning_rate": 1.4050971200251115e-07, + "loss": 0.475, + "step": 72600 + }, + { + "epoch": 0.7613282927186855, + "grad_norm": 1.9694572687149048, + "learning_rate": 1.3934511506668616e-07, + "loss": 0.4477, + "step": 72700 + }, + { + 
"epoch": 0.7623755118283398, + "grad_norm": 3.6044440269470215, + "learning_rate": 1.3818458262509119e-07, + "loss": 0.4972, + "step": 72800 + }, + { + "epoch": 0.7634227309379942, + "grad_norm": 1.7680317163467407, + "learning_rate": 1.3702812775664917e-07, + "loss": 0.4964, + "step": 72900 + }, + { + "epoch": 0.7644699500476485, + "grad_norm": 1.948326587677002, + "learning_rate": 1.358757634943296e-07, + "loss": 0.4733, + "step": 73000 + }, + { + "epoch": 0.7655171691573028, + "grad_norm": 2.4567108154296875, + "learning_rate": 1.3472750282500195e-07, + "loss": 0.5247, + "step": 73100 + }, + { + "epoch": 0.766564388266957, + "grad_norm": 1.3387149572372437, + "learning_rate": 1.3358335868928906e-07, + "loss": 0.4894, + "step": 73200 + }, + { + "epoch": 0.7676116073766114, + "grad_norm": 1.793434977531433, + "learning_rate": 1.3244334398142154e-07, + "loss": 0.5103, + "step": 73300 + }, + { + "epoch": 0.7686588264862657, + "grad_norm": 2.429433822631836, + "learning_rate": 1.3130747154909227e-07, + "loss": 0.5304, + "step": 73400 + }, + { + "epoch": 0.76970604559592, + "grad_norm": 2.3653488159179688, + "learning_rate": 1.3017575419331173e-07, + "loss": 0.5092, + "step": 73500 + }, + { + "epoch": 0.7707532647055744, + "grad_norm": 3.5659842491149902, + "learning_rate": 1.2904820466826355e-07, + "loss": 0.4835, + "step": 73600 + }, + { + "epoch": 0.7718004838152287, + "grad_norm": 2.952862501144409, + "learning_rate": 1.279248356811611e-07, + "loss": 0.5015, + "step": 73700 + }, + { + "epoch": 0.7728477029248829, + "grad_norm": 2.398303508758545, + "learning_rate": 1.2680565989210385e-07, + "loss": 0.4938, + "step": 73800 + }, + { + "epoch": 0.7738949220345372, + "grad_norm": 2.317095994949341, + "learning_rate": 1.2569068991393523e-07, + "loss": 0.4617, + "step": 73900 + }, + { + "epoch": 0.7749421411441916, + "grad_norm": 2.453432559967041, + "learning_rate": 1.2457993831209989e-07, + "loss": 0.5198, + "step": 74000 + }, + { + "epoch": 0.7759893602538459, + "grad_norm": 1.8672329187393188, + "learning_rate": 1.2347341760450263e-07, + "loss": 0.4742, + "step": 74100 + }, + { + "epoch": 0.7770365793635002, + "grad_norm": 3.076641798019409, + "learning_rate": 1.223711402613669e-07, + "loss": 0.4928, + "step": 74200 + }, + { + "epoch": 0.7780837984731546, + "grad_norm": 2.7013864517211914, + "learning_rate": 1.212731187050946e-07, + "loss": 0.4565, + "step": 74300 + }, + { + "epoch": 0.7791310175828089, + "grad_norm": 3.7489242553710938, + "learning_rate": 1.2017936531012574e-07, + "loss": 0.5017, + "step": 74400 + }, + { + "epoch": 0.7801782366924631, + "grad_norm": 2.7046327590942383, + "learning_rate": 1.1908989240279938e-07, + "loss": 0.4551, + "step": 74500 + }, + { + "epoch": 0.7812254558021174, + "grad_norm": 1.9993566274642944, + "learning_rate": 1.1800471226121456e-07, + "loss": 0.4742, + "step": 74600 + }, + { + "epoch": 0.7822726749117718, + "grad_norm": 2.9598634243011475, + "learning_rate": 1.1692383711509129e-07, + "loss": 0.5121, + "step": 74700 + }, + { + "epoch": 0.7833198940214261, + "grad_norm": 3.2795605659484863, + "learning_rate": 1.158472791456342e-07, + "loss": 0.5344, + "step": 74800 + }, + { + "epoch": 0.7843671131310804, + "grad_norm": 1.8576877117156982, + "learning_rate": 1.1477505048539387e-07, + "loss": 0.4924, + "step": 74900 + }, + { + "epoch": 0.7854143322407348, + "grad_norm": 1.8820946216583252, + "learning_rate": 1.1370716321813029e-07, + "loss": 0.4794, + "step": 75000 + }, + { + "epoch": 0.7864615513503891, + "grad_norm": 3.3854475021362305, + 
"learning_rate": 1.1264362937867784e-07, + "loss": 0.4841, + "step": 75100 + }, + { + "epoch": 0.7875087704600433, + "grad_norm": 3.2768609523773193, + "learning_rate": 1.1158446095280821e-07, + "loss": 0.4802, + "step": 75200 + }, + { + "epoch": 0.7885559895696976, + "grad_norm": 2.02317476272583, + "learning_rate": 1.1052966987709572e-07, + "loss": 0.4762, + "step": 75300 + }, + { + "epoch": 0.789603208679352, + "grad_norm": 2.08528208732605, + "learning_rate": 1.0947926803878366e-07, + "loss": 0.5083, + "step": 75400 + }, + { + "epoch": 0.7906504277890063, + "grad_norm": 2.0258214473724365, + "learning_rate": 1.0843326727564945e-07, + "loss": 0.4927, + "step": 75500 + }, + { + "epoch": 0.7916976468986606, + "grad_norm": 3.184265375137329, + "learning_rate": 1.0739167937587079e-07, + "loss": 0.5066, + "step": 75600 + }, + { + "epoch": 0.792744866008315, + "grad_norm": 2.808084011077881, + "learning_rate": 1.0635451607789469e-07, + "loss": 0.5172, + "step": 75700 + }, + { + "epoch": 0.7937920851179693, + "grad_norm": 2.172506332397461, + "learning_rate": 1.0532178907030275e-07, + "loss": 0.4797, + "step": 75800 + }, + { + "epoch": 0.7948393042276235, + "grad_norm": 1.9276924133300781, + "learning_rate": 1.0429350999168119e-07, + "loss": 0.5057, + "step": 75900 + }, + { + "epoch": 0.7958865233372778, + "grad_norm": 2.1610867977142334, + "learning_rate": 1.0326969043048955e-07, + "loss": 0.4964, + "step": 76000 + }, + { + "epoch": 0.7969337424469322, + "grad_norm": 2.5907599925994873, + "learning_rate": 1.0225034192492876e-07, + "loss": 0.4886, + "step": 76100 + }, + { + "epoch": 0.7979809615565865, + "grad_norm": 1.8623499870300293, + "learning_rate": 1.0123547596281257e-07, + "loss": 0.5151, + "step": 76200 + }, + { + "epoch": 0.7990281806662408, + "grad_norm": 1.7319766283035278, + "learning_rate": 1.0022510398143785e-07, + "loss": 0.4983, + "step": 76300 + }, + { + "epoch": 0.8000753997758951, + "grad_norm": 3.9193685054779053, + "learning_rate": 9.921923736745452e-08, + "loss": 0.5011, + "step": 76400 + }, + { + "epoch": 0.8011226188855495, + "grad_norm": 1.8976281881332397, + "learning_rate": 9.821788745673864e-08, + "loss": 0.5036, + "step": 76500 + }, + { + "epoch": 0.8021698379952037, + "grad_norm": 2.426635980606079, + "learning_rate": 9.722106553426446e-08, + "loss": 0.4993, + "step": 76600 + }, + { + "epoch": 0.803217057104858, + "grad_norm": 1.929158329963684, + "learning_rate": 9.622878283397596e-08, + "loss": 0.515, + "step": 76700 + }, + { + "epoch": 0.8042642762145124, + "grad_norm": 3.309342622756958, + "learning_rate": 9.524105053866182e-08, + "loss": 0.5395, + "step": 76800 + }, + { + "epoch": 0.8053114953241667, + "grad_norm": 1.8991940021514893, + "learning_rate": 9.425787977982869e-08, + "loss": 0.5079, + "step": 76900 + }, + { + "epoch": 0.806358714433821, + "grad_norm": 2.271533250808716, + "learning_rate": 9.32792816375756e-08, + "loss": 0.4579, + "step": 77000 + }, + { + "epoch": 0.8074059335434753, + "grad_norm": 2.1554083824157715, + "learning_rate": 9.230526714046944e-08, + "loss": 0.4556, + "step": 77100 + }, + { + "epoch": 0.8084531526531297, + "grad_norm": 1.8269262313842773, + "learning_rate": 9.133584726542037e-08, + "loss": 0.4883, + "step": 77200 + }, + { + "epoch": 0.8095003717627839, + "grad_norm": 2.5304064750671387, + "learning_rate": 9.037103293755849e-08, + "loss": 0.4977, + "step": 77300 + }, + { + "epoch": 0.8105475908724382, + "grad_norm": 2.8901185989379883, + "learning_rate": 8.941083503011021e-08, + "loss": 0.5063, + "step": 77400 + }, + { + 
"epoch": 0.8115948099820925, + "grad_norm": 2.2524912357330322, + "learning_rate": 8.845526436427625e-08, + "loss": 0.5144, + "step": 77500 + }, + { + "epoch": 0.8126420290917469, + "grad_norm": 2.046915292739868, + "learning_rate": 8.750433170910915e-08, + "loss": 0.4933, + "step": 77600 + }, + { + "epoch": 0.8136892482014012, + "grad_norm": 2.644960641860962, + "learning_rate": 8.655804778139247e-08, + "loss": 0.4962, + "step": 77700 + }, + { + "epoch": 0.8147364673110555, + "grad_norm": 2.299511432647705, + "learning_rate": 8.561642324551954e-08, + "loss": 0.4546, + "step": 77800 + }, + { + "epoch": 0.8157836864207098, + "grad_norm": 2.5044310092926025, + "learning_rate": 8.467946871337344e-08, + "loss": 0.4768, + "step": 77900 + }, + { + "epoch": 0.8168309055303641, + "grad_norm": 1.8609235286712646, + "learning_rate": 8.374719474420749e-08, + "loss": 0.4724, + "step": 78000 + }, + { + "epoch": 0.8178781246400184, + "grad_norm": 1.9416966438293457, + "learning_rate": 8.281961184452629e-08, + "loss": 0.4956, + "step": 78100 + }, + { + "epoch": 0.8189253437496727, + "grad_norm": 2.851625919342041, + "learning_rate": 8.189673046796702e-08, + "loss": 0.5068, + "step": 78200 + }, + { + "epoch": 0.8199725628593271, + "grad_norm": 2.262005567550659, + "learning_rate": 8.097856101518186e-08, + "loss": 0.4846, + "step": 78300 + }, + { + "epoch": 0.8210197819689814, + "grad_norm": 2.1528186798095703, + "learning_rate": 8.00651138337209e-08, + "loss": 0.4776, + "step": 78400 + }, + { + "epoch": 0.8220670010786357, + "grad_norm": 2.505295991897583, + "learning_rate": 7.915639921791511e-08, + "loss": 0.5012, + "step": 78500 + }, + { + "epoch": 0.82311422018829, + "grad_norm": 2.5964581966400146, + "learning_rate": 7.825242740876081e-08, + "loss": 0.5111, + "step": 78600 + }, + { + "epoch": 0.8241614392979443, + "grad_norm": 2.3113765716552734, + "learning_rate": 7.735320859380384e-08, + "loss": 0.5262, + "step": 78700 + }, + { + "epoch": 0.8252086584075986, + "grad_norm": 1.8016088008880615, + "learning_rate": 7.645875290702519e-08, + "loss": 0.4794, + "step": 78800 + }, + { + "epoch": 0.8262558775172529, + "grad_norm": 2.7183265686035156, + "learning_rate": 7.556907042872601e-08, + "loss": 0.5013, + "step": 78900 + }, + { + "epoch": 0.8273030966269073, + "grad_norm": 1.6194109916687012, + "learning_rate": 7.46841711854152e-08, + "loss": 0.4662, + "step": 79000 + }, + { + "epoch": 0.8283503157365616, + "grad_norm": 1.8583705425262451, + "learning_rate": 7.38040651496955e-08, + "loss": 0.4602, + "step": 79100 + }, + { + "epoch": 0.8293975348462159, + "grad_norm": 2.0989129543304443, + "learning_rate": 7.292876224015082e-08, + "loss": 0.4922, + "step": 79200 + }, + { + "epoch": 0.8304447539558701, + "grad_norm": 2.0418784618377686, + "learning_rate": 7.205827232123585e-08, + "loss": 0.5032, + "step": 79300 + }, + { + "epoch": 0.8314919730655245, + "grad_norm": 2.34555983543396, + "learning_rate": 7.119260520316368e-08, + "loss": 0.4912, + "step": 79400 + }, + { + "epoch": 0.8325391921751788, + "grad_norm": 2.5016937255859375, + "learning_rate": 7.033177064179507e-08, + "loss": 0.4792, + "step": 79500 + }, + { + "epoch": 0.8335864112848331, + "grad_norm": 2.4543182849884033, + "learning_rate": 6.947577833852991e-08, + "loss": 0.4713, + "step": 79600 + }, + { + "epoch": 0.8346336303944875, + "grad_norm": 2.092000961303711, + "learning_rate": 6.862463794019657e-08, + "loss": 0.4607, + "step": 79700 + }, + { + "epoch": 0.8356808495041418, + "grad_norm": 2.430490255355835, + "learning_rate": 
6.777835903894324e-08, + "loss": 0.5018, + "step": 79800 + }, + { + "epoch": 0.8367280686137961, + "grad_norm": 1.815276026725769, + "learning_rate": 6.69369511721311e-08, + "loss": 0.4967, + "step": 79900 + }, + { + "epoch": 0.8377752877234503, + "grad_norm": 2.1097006797790527, + "learning_rate": 6.610042382222497e-08, + "loss": 0.4601, + "step": 80000 + }, + { + "epoch": 0.8388225068331047, + "grad_norm": 3.367506504058838, + "learning_rate": 6.526878641668798e-08, + "loss": 0.4913, + "step": 80100 + }, + { + "epoch": 0.839869725942759, + "grad_norm": 1.4861557483673096, + "learning_rate": 6.444204832787486e-08, + "loss": 0.485, + "step": 80200 + }, + { + "epoch": 0.8409169450524133, + "grad_norm": 2.3718228340148926, + "learning_rate": 6.362021887292578e-08, + "loss": 0.4941, + "step": 80300 + }, + { + "epoch": 0.8419641641620677, + "grad_norm": 2.2200145721435547, + "learning_rate": 6.28033073136619e-08, + "loss": 0.4928, + "step": 80400 + }, + { + "epoch": 0.843011383271722, + "grad_norm": 2.4420855045318604, + "learning_rate": 6.199132285648129e-08, + "loss": 0.515, + "step": 80500 + }, + { + "epoch": 0.8440586023813763, + "grad_norm": 2.225245714187622, + "learning_rate": 6.118427465225418e-08, + "loss": 0.5029, + "step": 80600 + }, + { + "epoch": 0.8451058214910305, + "grad_norm": 2.7253527641296387, + "learning_rate": 6.038217179622057e-08, + "loss": 0.4898, + "step": 80700 + }, + { + "epoch": 0.8461530406006849, + "grad_norm": 1.8062297105789185, + "learning_rate": 5.958502332788806e-08, + "loss": 0.5089, + "step": 80800 + }, + { + "epoch": 0.8472002597103392, + "grad_norm": 3.0290756225585938, + "learning_rate": 5.8792838230928734e-08, + "loss": 0.4988, + "step": 80900 + }, + { + "epoch": 0.8482474788199935, + "grad_norm": 2.042731523513794, + "learning_rate": 5.800562543307913e-08, + "loss": 0.493, + "step": 81000 + }, + { + "epoch": 0.8492946979296478, + "grad_norm": 2.5578713417053223, + "learning_rate": 5.722339380603908e-08, + "loss": 0.475, + "step": 81100 + }, + { + "epoch": 0.8503419170393022, + "grad_norm": 3.2866199016571045, + "learning_rate": 5.6446152165371685e-08, + "loss": 0.5102, + "step": 81200 + }, + { + "epoch": 0.8513891361489564, + "grad_norm": 2.475862979888916, + "learning_rate": 5.5673909270404495e-08, + "loss": 0.4896, + "step": 81300 + }, + { + "epoch": 0.8524363552586107, + "grad_norm": 4.128602027893066, + "learning_rate": 5.490667382412978e-08, + "loss": 0.4781, + "step": 81400 + }, + { + "epoch": 0.853483574368265, + "grad_norm": 2.8154897689819336, + "learning_rate": 5.414445447310745e-08, + "loss": 0.5034, + "step": 81500 + }, + { + "epoch": 0.8545307934779194, + "grad_norm": 2.5624399185180664, + "learning_rate": 5.338725980736736e-08, + "loss": 0.4997, + "step": 81600 + }, + { + "epoch": 0.8555780125875737, + "grad_norm": 2.6771199703216553, + "learning_rate": 5.263509836031193e-08, + "loss": 0.5214, + "step": 81700 + }, + { + "epoch": 0.856625231697228, + "grad_norm": 2.225013494491577, + "learning_rate": 5.1887978608620596e-08, + "loss": 0.4838, + "step": 81800 + }, + { + "epoch": 0.8576724508068824, + "grad_norm": 2.8142294883728027, + "learning_rate": 5.114590897215448e-08, + "loss": 0.5037, + "step": 81900 + }, + { + "epoch": 0.8587196699165366, + "grad_norm": 2.071779727935791, + "learning_rate": 5.040889781386043e-08, + "loss": 0.4689, + "step": 82000 + }, + { + "epoch": 0.8597668890261909, + "grad_norm": 2.6963651180267334, + "learning_rate": 4.9676953439677925e-08, + "loss": 0.489, + "step": 82100 + }, + { + "epoch": 
0.8608141081358452, + "grad_norm": 2.4148457050323486, + "learning_rate": 4.895008409844481e-08, + "loss": 0.4816, + "step": 82200 + }, + { + "epoch": 0.8618613272454996, + "grad_norm": 2.611649513244629, + "learning_rate": 4.822829798180467e-08, + "loss": 0.5531, + "step": 82300 + }, + { + "epoch": 0.8629085463551539, + "grad_norm": 1.8031556606292725, + "learning_rate": 4.751160322411418e-08, + "loss": 0.454, + "step": 82400 + }, + { + "epoch": 0.8639557654648082, + "grad_norm": 2.0377116203308105, + "learning_rate": 4.680000790235178e-08, + "loss": 0.5212, + "step": 82500 + }, + { + "epoch": 0.8650029845744626, + "grad_norm": 1.7090651988983154, + "learning_rate": 4.609352003602646e-08, + "loss": 0.4721, + "step": 82600 + }, + { + "epoch": 0.8660502036841168, + "grad_norm": 0.9355291724205017, + "learning_rate": 4.5392147587087315e-08, + "loss": 0.4535, + "step": 82700 + }, + { + "epoch": 0.8670974227937711, + "grad_norm": 2.991403579711914, + "learning_rate": 4.4695898459834016e-08, + "loss": 0.5108, + "step": 82800 + }, + { + "epoch": 0.8681446419034254, + "grad_norm": 2.0942938327789307, + "learning_rate": 4.400478050082751e-08, + "loss": 0.4919, + "step": 82900 + }, + { + "epoch": 0.8691918610130798, + "grad_norm": 1.971248745918274, + "learning_rate": 4.331880149880179e-08, + "loss": 0.4981, + "step": 83000 + }, + { + "epoch": 0.8702390801227341, + "grad_norm": 2.0472984313964844, + "learning_rate": 4.263796918457613e-08, + "loss": 0.4663, + "step": 83100 + }, + { + "epoch": 0.8712862992323884, + "grad_norm": 2.9207637310028076, + "learning_rate": 4.196229123096762e-08, + "loss": 0.4723, + "step": 83200 + }, + { + "epoch": 0.8723335183420428, + "grad_norm": 2.6545724868774414, + "learning_rate": 4.129177525270511e-08, + "loss": 0.5042, + "step": 83300 + }, + { + "epoch": 0.873380737451697, + "grad_norm": 2.008007526397705, + "learning_rate": 4.0626428806343205e-08, + "loss": 0.4904, + "step": 83400 + }, + { + "epoch": 0.8744279565613513, + "grad_norm": 1.2464555501937866, + "learning_rate": 3.996625939017711e-08, + "loss": 0.5248, + "step": 83500 + }, + { + "epoch": 0.8754751756710056, + "grad_norm": 3.1436216831207275, + "learning_rate": 3.9311274444158106e-08, + "loss": 0.4924, + "step": 83600 + }, + { + "epoch": 0.87652239478066, + "grad_norm": 3.0234928131103516, + "learning_rate": 3.8661481349809786e-08, + "loss": 0.493, + "step": 83700 + }, + { + "epoch": 0.8775696138903143, + "grad_norm": 2.1175239086151123, + "learning_rate": 3.8016887430144754e-08, + "loss": 0.4933, + "step": 83800 + }, + { + "epoch": 0.8786168329999686, + "grad_norm": 2.497673749923706, + "learning_rate": 3.737749994958228e-08, + "loss": 0.5146, + "step": 83900 + }, + { + "epoch": 0.879664052109623, + "grad_norm": 1.5378285646438599, + "learning_rate": 3.674332611386616e-08, + "loss": 0.4628, + "step": 84000 + }, + { + "epoch": 0.8807112712192772, + "grad_norm": 3.481321334838867, + "learning_rate": 3.6114373069983885e-08, + "loss": 0.513, + "step": 84100 + }, + { + "epoch": 0.8817584903289315, + "grad_norm": 3.8998842239379883, + "learning_rate": 3.549064790608536e-08, + "loss": 0.5157, + "step": 84200 + }, + { + "epoch": 0.8828057094385858, + "grad_norm": 4.254595756530762, + "learning_rate": 3.487215765140422e-08, + "loss": 0.503, + "step": 84300 + }, + { + "epoch": 0.8838529285482402, + "grad_norm": 1.633023977279663, + "learning_rate": 3.4258909276177584e-08, + "loss": 0.4763, + "step": 84400 + }, + { + "epoch": 0.8849001476578945, + "grad_norm": 2.1271402835845947, + "learning_rate": 
3.365090969156764e-08, + "loss": 0.514, + "step": 84500 + }, + { + "epoch": 0.8859473667675488, + "grad_norm": 2.325639009475708, + "learning_rate": 3.304816574958441e-08, + "loss": 0.5295, + "step": 84600 + }, + { + "epoch": 0.886994585877203, + "grad_norm": 3.336534261703491, + "learning_rate": 3.2450684243007786e-08, + "loss": 0.498, + "step": 84700 + }, + { + "epoch": 0.8880418049868574, + "grad_norm": 2.818937301635742, + "learning_rate": 3.185847190531121e-08, + "loss": 0.4621, + "step": 84800 + }, + { + "epoch": 0.8890890240965117, + "grad_norm": 2.3609235286712646, + "learning_rate": 3.1271535410586136e-08, + "loss": 0.4536, + "step": 84900 + }, + { + "epoch": 0.890136243206166, + "grad_norm": 2.134856939315796, + "learning_rate": 3.06898813734664e-08, + "loss": 0.4955, + "step": 85000 + }, + { + "epoch": 0.8911834623158204, + "grad_norm": 2.349867105484009, + "learning_rate": 3.011351634905357e-08, + "loss": 0.5, + "step": 85100 + }, + { + "epoch": 0.8922306814254747, + "grad_norm": 2.3223259449005127, + "learning_rate": 2.9542446832843793e-08, + "loss": 0.5176, + "step": 85200 + }, + { + "epoch": 0.893277900535129, + "grad_norm": 2.8934836387634277, + "learning_rate": 2.8976679260653613e-08, + "loss": 0.5069, + "step": 85300 + }, + { + "epoch": 0.8943251196447832, + "grad_norm": 2.5627784729003906, + "learning_rate": 2.8416220008548152e-08, + "loss": 0.5019, + "step": 85400 + }, + { + "epoch": 0.8953723387544376, + "grad_norm": 4.0183796882629395, + "learning_rate": 2.7861075392769275e-08, + "loss": 0.4907, + "step": 85500 + }, + { + "epoch": 0.8964195578640919, + "grad_norm": 2.2696878910064697, + "learning_rate": 2.7311251669663692e-08, + "loss": 0.4785, + "step": 85600 + }, + { + "epoch": 0.8974667769737462, + "grad_norm": 2.5743296146392822, + "learning_rate": 2.6766755035613155e-08, + "loss": 0.4707, + "step": 85700 + }, + { + "epoch": 0.8985139960834005, + "grad_norm": 2.059088945388794, + "learning_rate": 2.622759162696464e-08, + "loss": 0.5246, + "step": 85800 + }, + { + "epoch": 0.8995612151930549, + "grad_norm": 1.2305697202682495, + "learning_rate": 2.5693767519960496e-08, + "loss": 0.4841, + "step": 85900 + }, + { + "epoch": 0.9006084343027092, + "grad_norm": 3.181995153427124, + "learning_rate": 2.5165288730670585e-08, + "loss": 0.4882, + "step": 86000 + }, + { + "epoch": 0.9016556534123634, + "grad_norm": 2.311540365219116, + "learning_rate": 2.464216121492463e-08, + "loss": 0.4918, + "step": 86100 + }, + { + "epoch": 0.9027028725220178, + "grad_norm": 1.5216143131256104, + "learning_rate": 2.412439086824436e-08, + "loss": 0.4877, + "step": 86200 + }, + { + "epoch": 0.9037500916316721, + "grad_norm": 1.816412091255188, + "learning_rate": 2.361198352577759e-08, + "loss": 0.495, + "step": 86300 + }, + { + "epoch": 0.9047973107413264, + "grad_norm": 1.8467931747436523, + "learning_rate": 2.310494496223253e-08, + "loss": 0.517, + "step": 86400 + }, + { + "epoch": 0.9058445298509807, + "grad_norm": 1.95524001121521, + "learning_rate": 2.260328089181246e-08, + "loss": 0.4702, + "step": 86500 + }, + { + "epoch": 0.9068917489606351, + "grad_norm": 2.4727303981781006, + "learning_rate": 2.210699696815127e-08, + "loss": 0.498, + "step": 86600 + }, + { + "epoch": 0.9079389680702894, + "grad_norm": 3.1941773891448975, + "learning_rate": 2.1616098784250082e-08, + "loss": 0.4655, + "step": 86700 + }, + { + "epoch": 0.9089861871799436, + "grad_norm": 3.8430733680725098, + "learning_rate": 2.1130591872413837e-08, + "loss": 0.5178, + "step": 86800 + }, + { + "epoch": 
0.910033406289598, + "grad_norm": 1.787541151046753, + "learning_rate": 2.0650481704189315e-08, + "loss": 0.4858, + "step": 86900 + }, + { + "epoch": 0.9110806253992523, + "grad_norm": 1.8147176504135132, + "learning_rate": 2.017577369030321e-08, + "loss": 0.4997, + "step": 87000 + }, + { + "epoch": 0.9121278445089066, + "grad_norm": 2.207904100418091, + "learning_rate": 1.9706473180601145e-08, + "loss": 0.4998, + "step": 87100 + }, + { + "epoch": 0.9131750636185609, + "grad_norm": 2.220478057861328, + "learning_rate": 1.9242585463987548e-08, + "loss": 0.4939, + "step": 87200 + }, + { + "epoch": 0.9142222827282153, + "grad_norm": 2.459459066390991, + "learning_rate": 1.878411576836597e-08, + "loss": 0.5106, + "step": 87300 + }, + { + "epoch": 0.9152695018378696, + "grad_norm": 1.8161354064941406, + "learning_rate": 1.8331069260580147e-08, + "loss": 0.4519, + "step": 87400 + }, + { + "epoch": 0.9163167209475238, + "grad_norm": 2.2104363441467285, + "learning_rate": 1.78834510463558e-08, + "loss": 0.4841, + "step": 87500 + }, + { + "epoch": 0.9173639400571781, + "grad_norm": 3.3614344596862793, + "learning_rate": 1.744126617024305e-08, + "loss": 0.4699, + "step": 87600 + }, + { + "epoch": 0.9184111591668325, + "grad_norm": 1.9489402770996094, + "learning_rate": 1.70045196155596e-08, + "loss": 0.4884, + "step": 87700 + }, + { + "epoch": 0.9194583782764868, + "grad_norm": 2.2660348415374756, + "learning_rate": 1.6573216304334615e-08, + "loss": 0.4971, + "step": 87800 + }, + { + "epoch": 0.9205055973861411, + "grad_norm": 1.9117883443832397, + "learning_rate": 1.6147361097253122e-08, + "loss": 0.5133, + "step": 87900 + }, + { + "epoch": 0.9215528164957955, + "grad_norm": 2.3087127208709717, + "learning_rate": 1.5726958793601476e-08, + "loss": 0.481, + "step": 88000 + }, + { + "epoch": 0.9226000356054497, + "grad_norm": 2.1353018283843994, + "learning_rate": 1.5312014131212914e-08, + "loss": 0.4618, + "step": 88100 + }, + { + "epoch": 0.923647254715104, + "grad_norm": 2.694920778274536, + "learning_rate": 1.4902531786414542e-08, + "loss": 0.4633, + "step": 88200 + }, + { + "epoch": 0.9246944738247583, + "grad_norm": 2.070590019226074, + "learning_rate": 1.4498516373974312e-08, + "loss": 0.5069, + "step": 88300 + }, + { + "epoch": 0.9257416929344127, + "grad_norm": 1.7129287719726562, + "learning_rate": 1.4099972447049246e-08, + "loss": 0.479, + "step": 88400 + }, + { + "epoch": 0.926788912044067, + "grad_norm": 2.0258448123931885, + "learning_rate": 1.3706904497133964e-08, + "loss": 0.5026, + "step": 88500 + }, + { + "epoch": 0.9278361311537213, + "grad_norm": 2.2771730422973633, + "learning_rate": 1.331931695401034e-08, + "loss": 0.4739, + "step": 88600 + }, + { + "epoch": 0.9288833502633757, + "grad_norm": 2.1517481803894043, + "learning_rate": 1.2937214185696988e-08, + "loss": 0.5027, + "step": 88700 + }, + { + "epoch": 0.9299305693730299, + "grad_norm": 2.0524544715881348, + "learning_rate": 1.2560600498400852e-08, + "loss": 0.459, + "step": 88800 + }, + { + "epoch": 0.9309777884826842, + "grad_norm": 2.0591094493865967, + "learning_rate": 1.2189480136467978e-08, + "loss": 0.512, + "step": 88900 + }, + { + "epoch": 0.9320250075923385, + "grad_norm": 1.7868990898132324, + "learning_rate": 1.1823857282335869e-08, + "loss": 0.4755, + "step": 89000 + }, + { + "epoch": 0.9330722267019929, + "grad_norm": 2.4516055583953857, + "learning_rate": 1.146373605648676e-08, + "loss": 0.5004, + "step": 89100 + }, + { + "epoch": 0.9341194458116472, + "grad_norm": 2.602165699005127, + "learning_rate": 
1.1109120517400704e-08, + "loss": 0.5163, + "step": 89200 + }, + { + "epoch": 0.9351666649213015, + "grad_norm": 4.763970851898193, + "learning_rate": 1.076001466150972e-08, + "loss": 0.5095, + "step": 89300 + }, + { + "epoch": 0.9362138840309558, + "grad_norm": 2.463984966278076, + "learning_rate": 1.0416422423153547e-08, + "loss": 0.5034, + "step": 89400 + }, + { + "epoch": 0.9372611031406101, + "grad_norm": 2.4041192531585693, + "learning_rate": 1.0078347674534194e-08, + "loss": 0.4741, + "step": 89500 + }, + { + "epoch": 0.9383083222502644, + "grad_norm": 3.2481226921081543, + "learning_rate": 9.745794225673288e-09, + "loss": 0.5558, + "step": 89600 + }, + { + "epoch": 0.9393555413599187, + "grad_norm": 2.0538644790649414, + "learning_rate": 9.418765824368625e-09, + "loss": 0.5126, + "step": 89700 + }, + { + "epoch": 0.940402760469573, + "grad_norm": 3.1280417442321777, + "learning_rate": 9.097266156151972e-09, + "loss": 0.4813, + "step": 89800 + }, + { + "epoch": 0.9414499795792274, + "grad_norm": 2.6181859970092773, + "learning_rate": 8.781298844247608e-09, + "loss": 0.4985, + "step": 89900 + }, + { + "epoch": 0.9424971986888817, + "grad_norm": 2.8424460887908936, + "learning_rate": 8.470867449531627e-09, + "loss": 0.5032, + "step": 90000 + }, + { + "epoch": 0.943544417798536, + "grad_norm": 1.8021912574768066, + "learning_rate": 8.165975470491416e-09, + "loss": 0.5082, + "step": 90100 + }, + { + "epoch": 0.9445916369081903, + "grad_norm": 2.1348044872283936, + "learning_rate": 7.866626343186577e-09, + "loss": 0.4811, + "step": 90200 + }, + { + "epoch": 0.9456388560178446, + "grad_norm": 1.665382981300354, + "learning_rate": 7.572823441210353e-09, + "loss": 0.5137, + "step": 90300 + }, + { + "epoch": 0.9466860751274989, + "grad_norm": 1.782528281211853, + "learning_rate": 7.284570075650864e-09, + "loss": 0.4861, + "step": 90400 + }, + { + "epoch": 0.9477332942371532, + "grad_norm": 2.0802054405212402, + "learning_rate": 7.001869495054713e-09, + "loss": 0.5201, + "step": 90500 + }, + { + "epoch": 0.9487805133468076, + "grad_norm": 2.515943765640259, + "learning_rate": 6.724724885389721e-09, + "loss": 0.4863, + "step": 90600 + }, + { + "epoch": 0.9498277324564619, + "grad_norm": 1.7922004461288452, + "learning_rate": 6.4531393700092415e-09, + "loss": 0.4858, + "step": 90700 + }, + { + "epoch": 0.9508749515661162, + "grad_norm": 1.5402792692184448, + "learning_rate": 6.187116009617188e-09, + "loss": 0.5174, + "step": 90800 + }, + { + "epoch": 0.9519221706757705, + "grad_norm": 2.370882987976074, + "learning_rate": 5.926657802233004e-09, + "loss": 0.5299, + "step": 90900 + }, + { + "epoch": 0.9529693897854248, + "grad_norm": 2.1812610626220703, + "learning_rate": 5.671767683158357e-09, + "loss": 0.5078, + "step": 91000 + }, + { + "epoch": 0.9540166088950791, + "grad_norm": 1.9076416492462158, + "learning_rate": 5.422448524944057e-09, + "loss": 0.4871, + "step": 91100 + }, + { + "epoch": 0.9550638280047334, + "grad_norm": 2.5718798637390137, + "learning_rate": 5.1787031373571326e-09, + "loss": 0.5, + "step": 91200 + }, + { + "epoch": 0.9561110471143878, + "grad_norm": 1.7200427055358887, + "learning_rate": 4.940534267349861e-09, + "loss": 0.4824, + "step": 91300 + }, + { + "epoch": 0.9571582662240421, + "grad_norm": 2.0528995990753174, + "learning_rate": 4.7079445990284015e-09, + "loss": 0.4893, + "step": 91400 + }, + { + "epoch": 0.9582054853336963, + "grad_norm": 2.170036554336548, + "learning_rate": 4.4809367536226e-09, + "loss": 0.5468, + "step": 91500 + }, + { + "epoch": 
0.9592527044433506, + "grad_norm": 2.4191830158233643, + "learning_rate": 4.2595132894565625e-09, + "loss": 0.496, + "step": 91600 + }, + { + "epoch": 0.960299923553005, + "grad_norm": 3.8748281002044678, + "learning_rate": 4.043676701919741e-09, + "loss": 0.52, + "step": 91700 + }, + { + "epoch": 0.9613471426626593, + "grad_norm": 2.9865217208862305, + "learning_rate": 3.833429423438838e-09, + "loss": 0.4729, + "step": 91800 + }, + { + "epoch": 0.9623943617723136, + "grad_norm": 3.5876505374908447, + "learning_rate": 3.628773823450337e-09, + "loss": 0.4557, + "step": 91900 + }, + { + "epoch": 0.963441580881968, + "grad_norm": 2.007694959640503, + "learning_rate": 3.429712208373847e-09, + "loss": 0.5197, + "step": 92000 + }, + { + "epoch": 0.9644887999916223, + "grad_norm": 1.564520239830017, + "learning_rate": 3.2362468215861306e-09, + "loss": 0.4519, + "step": 92100 + }, + { + "epoch": 0.9655360191012765, + "grad_norm": 2.6633753776550293, + "learning_rate": 3.0483798433957876e-09, + "loss": 0.5247, + "step": 92200 + }, + { + "epoch": 0.9665832382109308, + "grad_norm": 2.7909083366394043, + "learning_rate": 2.8661133910187206e-09, + "loss": 0.4981, + "step": 92300 + }, + { + "epoch": 0.9676304573205852, + "grad_norm": 2.7965500354766846, + "learning_rate": 2.68944951855421e-09, + "loss": 0.4982, + "step": 92400 + }, + { + "epoch": 0.9686776764302395, + "grad_norm": 2.164356231689453, + "learning_rate": 2.5183902169618187e-09, + "loss": 0.4926, + "step": 92500 + }, + { + "epoch": 0.9697248955398938, + "grad_norm": 2.378080368041992, + "learning_rate": 2.352937414038969e-09, + "loss": 0.4796, + "step": 92600 + }, + { + "epoch": 0.9707721146495482, + "grad_norm": 2.3100953102111816, + "learning_rate": 2.1930929743990136e-09, + "loss": 0.511, + "step": 92700 + }, + { + "epoch": 0.9718193337592025, + "grad_norm": 1.154026985168457, + "learning_rate": 2.0388586994506964e-09, + "loss": 0.5297, + "step": 92800 + }, + { + "epoch": 0.9728665528688567, + "grad_norm": 2.432117462158203, + "learning_rate": 1.8902363273772815e-09, + "loss": 0.4869, + "step": 92900 + }, + { + "epoch": 0.973913771978511, + "grad_norm": 2.1382997035980225, + "learning_rate": 1.7472275331173459e-09, + "loss": 0.5253, + "step": 93000 + }, + { + "epoch": 0.9749609910881654, + "grad_norm": 2.517921209335327, + "learning_rate": 1.609833928345794e-09, + "loss": 0.4989, + "step": 93100 + }, + { + "epoch": 0.9760082101978197, + "grad_norm": 2.1486592292785645, + "learning_rate": 1.4780570614556508e-09, + "loss": 0.5392, + "step": 93200 + }, + { + "epoch": 0.977055429307474, + "grad_norm": 2.8666563034057617, + "learning_rate": 1.3518984175406312e-09, + "loss": 0.4899, + "step": 93300 + }, + { + "epoch": 0.9781026484171284, + "grad_norm": 2.0608692169189453, + "learning_rate": 1.231359418378486e-09, + "loss": 0.5013, + "step": 93400 + }, + { + "epoch": 0.9791498675267827, + "grad_norm": 2.5256223678588867, + "learning_rate": 1.1164414224149598e-09, + "loss": 0.506, + "step": 93500 + }, + { + "epoch": 0.9801970866364369, + "grad_norm": 1.9714406728744507, + "learning_rate": 1.0071457247482485e-09, + "loss": 0.5306, + "step": 93600 + }, + { + "epoch": 0.9812443057460912, + "grad_norm": 2.5823991298675537, + "learning_rate": 9.034735571147312e-10, + "loss": 0.4887, + "step": 93700 + }, + { + "epoch": 0.9822915248557456, + "grad_norm": 2.48111891746521, + "learning_rate": 8.054260878749275e-10, + "loss": 0.5309, + "step": 93800 + }, + { + "epoch": 0.9833387439653999, + "grad_norm": 3.824676752090454, + "learning_rate": 
7.130044220003962e-10, + "loss": 0.4919, + "step": 93900 + }, + { + "epoch": 0.9843859630750542, + "grad_norm": 2.073537588119507, + "learning_rate": 6.26209601061134e-10, + "loss": 0.4679, + "step": 94000 + }, + { + "epoch": 0.9854331821847085, + "grad_norm": 2.32852840423584, + "learning_rate": 5.450426032140298e-10, + "loss": 0.4893, + "step": 94100 + }, + { + "epoch": 0.9864804012943629, + "grad_norm": 3.0331838130950928, + "learning_rate": 4.695043431917068e-10, + "loss": 0.4837, + "step": 94200 + }, + { + "epoch": 0.9875276204040171, + "grad_norm": 2.3463919162750244, + "learning_rate": 3.995956722922522e-10, + "loss": 0.4748, + "step": 94300 + }, + { + "epoch": 0.9885748395136714, + "grad_norm": 3.0472140312194824, + "learning_rate": 3.3531737836967054e-10, + "loss": 0.5212, + "step": 94400 + }, + { + "epoch": 0.9896220586233258, + "grad_norm": 1.4455373287200928, + "learning_rate": 2.766701858250009e-10, + "loss": 0.4858, + "step": 94500 + }, + { + "epoch": 0.9906692777329801, + "grad_norm": 2.5533838272094727, + "learning_rate": 2.2365475559799064e-10, + "loss": 0.5016, + "step": 94600 + }, + { + "epoch": 0.9917164968426344, + "grad_norm": 2.4406557083129883, + "learning_rate": 1.762716851599344e-10, + "loss": 0.4551, + "step": 94700 + }, + { + "epoch": 0.9927637159522887, + "grad_norm": 2.5848546028137207, + "learning_rate": 1.3452150850656872e-10, + "loss": 0.4797, + "step": 94800 + }, + { + "epoch": 0.993810935061943, + "grad_norm": 2.0372912883758545, + "learning_rate": 9.84046961525209e-11, + "loss": 0.4646, + "step": 94900 + }, + { + "epoch": 0.9948581541715973, + "grad_norm": 2.8523876667022705, + "learning_rate": 6.792165512553571e-11, + "loss": 0.4876, + "step": 95000 + }, + { + "epoch": 0.9959053732812516, + "grad_norm": 2.202986001968384, + "learning_rate": 4.3072728962256774e-11, + "loss": 0.5156, + "step": 95100 + }, + { + "epoch": 0.996952592390906, + "grad_norm": 2.1548354625701904, + "learning_rate": 2.3858197704063055e-11, + "loss": 0.5241, + "step": 95200 + }, + { + "epoch": 0.9979998115005603, + "grad_norm": 1.8615128993988037, + "learning_rate": 1.0278277894182342e-11, + "loss": 0.4658, + "step": 95300 + }, + { + "epoch": 0.9990470306102146, + "grad_norm": 2.989764928817749, + "learning_rate": 2.3331225750267137e-12, + "loss": 0.5486, + "step": 95400 + } + ], + "logging_steps": 100, + "max_steps": 95491, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 6.299285301826683e+17, + "train_batch_size": 3, + "trial_name": null, + "trial_params": null +}