{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.99576743149922, "eval_steps": 500, "global_step": 1050, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.028514145689463134, "grad_norm": 59.78074645996094, "learning_rate": 9.374999999999999e-05, "loss": 0.7842, "step": 10 }, { "epoch": 0.05702829137892627, "grad_norm": 34.404449462890625, "learning_rate": 0.00018749999999999998, "loss": 0.7352, "step": 20 }, { "epoch": 0.08554243706838939, "grad_norm": 32.21347427368164, "learning_rate": 0.00028125, "loss": 0.7214, "step": 30 }, { "epoch": 0.11405658275785253, "grad_norm": 58.45460891723633, "learning_rate": 0.00029995428872157097, "loss": 0.7276, "step": 40 }, { "epoch": 0.14257072844731566, "grad_norm": 34.381004333496094, "learning_rate": 0.00029976863440074164, "loss": 0.7381, "step": 50 }, { "epoch": 0.17108487413677878, "grad_norm": 23.00215721130371, "learning_rate": 0.0002994403567435186, "loss": 0.696, "step": 60 }, { "epoch": 0.19959901982624192, "grad_norm": 28.15259552001953, "learning_rate": 0.0002989697683657906, "loss": 0.6695, "step": 70 }, { "epoch": 0.22811316551570507, "grad_norm": 23.132097244262695, "learning_rate": 0.0002983573174046776, "loss": 0.6745, "step": 80 }, { "epoch": 0.2566273112051682, "grad_norm": 26.72660255432129, "learning_rate": 0.00029760358709177425, "loss": 0.6742, "step": 90 }, { "epoch": 0.28514145689463133, "grad_norm": 21.0267391204834, "learning_rate": 0.00029670929519774324, "loss": 0.6797, "step": 100 }, { "epoch": 0.3136556025840945, "grad_norm": 24.07339096069336, "learning_rate": 0.0002956752933487888, "loss": 0.6582, "step": 110 }, { "epoch": 0.34216974827355756, "grad_norm": 32.077362060546875, "learning_rate": 0.00029450256621566076, "loss": 0.6531, "step": 120 }, { "epoch": 0.3706838939630207, "grad_norm": 23.832252502441406, "learning_rate": 0.0002931922305759614, "loss": 0.6584, "step": 130 }, { "epoch": 0.39919803965248385, "grad_norm": 21.751239776611328, "learning_rate": 0.00029174553425064773, "loss": 0.6557, "step": 140 }, { "epoch": 0.427712185341947, "grad_norm": 24.70648956298828, "learning_rate": 0.00029016385491574314, "loss": 0.6376, "step": 150 }, { "epoch": 0.45622633103141014, "grad_norm": 23.676149368286133, "learning_rate": 0.00028844869879038863, "loss": 0.6424, "step": 160 }, { "epoch": 0.4847404767208732, "grad_norm": 26.275875091552734, "learning_rate": 0.0002866016992024837, "loss": 0.633, "step": 170 }, { "epoch": 0.5132546224103364, "grad_norm": 23.403223037719727, "learning_rate": 0.0002846246150332827, "loss": 0.6203, "step": 180 }, { "epoch": 0.5417687680997995, "grad_norm": 23.481271743774414, "learning_rate": 0.000282519329042428, "loss": 0.6211, "step": 190 }, { "epoch": 0.5702829137892627, "grad_norm": 20.71142578125, "learning_rate": 0.00028028784607501473, "loss": 0.6232, "step": 200 }, { "epoch": 0.5987970594787257, "grad_norm": 20.313270568847656, "learning_rate": 0.00027793229115239456, "loss": 0.6035, "step": 210 }, { "epoch": 0.627311205168189, "grad_norm": 15.384033203125, "learning_rate": 0.0002754549074485369, "loss": 0.6082, "step": 220 }, { "epoch": 0.655825350857652, "grad_norm": 20.352094650268555, "learning_rate": 0.0002728580541538743, "loss": 0.6045, "step": 230 }, { "epoch": 0.6843394965471151, "grad_norm": 20.499040603637695, "learning_rate": 0.0002701442042286665, "loss": 0.6077, "step": 240 }, { "epoch": 0.7128536422365783, "grad_norm": 19.507705688476562, "learning_rate": 0.000267315942048022, "loss": 0.6052, "step": 250 }, { "epoch": 0.7413677879260414, "grad_norm": 23.443056106567383, "learning_rate": 0.0002643759609408212, "loss": 0.5917, "step": 260 }, { "epoch": 0.7698819336155046, "grad_norm": 22.090147018432617, "learning_rate": 0.00026132706062488294, "loss": 0.596, "step": 270 }, { "epoch": 0.7983960793049677, "grad_norm": 22.015439987182617, "learning_rate": 0.0002581721445408184, "loss": 0.5923, "step": 280 }, { "epoch": 0.8269102249944308, "grad_norm": 19.542490005493164, "learning_rate": 0.0002549142170871103, "loss": 0.5955, "step": 290 }, { "epoch": 0.855424370683894, "grad_norm": 17.32285499572754, "learning_rate": 0.00025155638075905097, "loss": 0.566, "step": 300 }, { "epoch": 0.8839385163733571, "grad_norm": 18.898284912109375, "learning_rate": 0.00024810183319426394, "loss": 0.5677, "step": 310 }, { "epoch": 0.9124526620628203, "grad_norm": 16.297840118408203, "learning_rate": 0.00024455386412762184, "loss": 0.577, "step": 320 }, { "epoch": 0.9409668077522834, "grad_norm": 22.982707977294922, "learning_rate": 0.00024091585225846125, "loss": 0.57, "step": 330 }, { "epoch": 0.9694809534417465, "grad_norm": 20.184415817260742, "learning_rate": 0.00023719126203307778, "loss": 0.5743, "step": 340 }, { "epoch": 0.9979950991312097, "grad_norm": 16.89832878112793, "learning_rate": 0.00023338364034556413, "loss": 0.5663, "step": 350 }, { "epoch": 1.0274003118734685, "grad_norm": 16.95356559753418, "learning_rate": 0.00022949661316013482, "loss": 0.4709, "step": 360 }, { "epoch": 1.0559144575629316, "grad_norm": 20.939350128173828, "learning_rate": 0.0002255338820581528, "loss": 0.4702, "step": 370 }, { "epoch": 1.0844286032523947, "grad_norm": 19.86914825439453, "learning_rate": 0.0002214992207131462, "loss": 0.4728, "step": 380 }, { "epoch": 1.112942748941858, "grad_norm": 19.765581130981445, "learning_rate": 0.0002173964712971729, "loss": 0.4664, "step": 390 }, { "epoch": 1.141456894631321, "grad_norm": 16.14029884338379, "learning_rate": 0.00021322954082195433, "loss": 0.4696, "step": 400 }, { "epoch": 1.1699710403207841, "grad_norm": 17.055089950561523, "learning_rate": 0.00020900239741826278, "loss": 0.4717, "step": 410 }, { "epoch": 1.1984851860102472, "grad_norm": 14.829668045043945, "learning_rate": 0.00020471906655710603, "loss": 0.4716, "step": 420 }, { "epoch": 1.2269993316997103, "grad_norm": 15.102470397949219, "learning_rate": 0.00020038362721630696, "loss": 0.4597, "step": 430 }, { "epoch": 1.2555134773891736, "grad_norm": 19.483240127563477, "learning_rate": 0.00019600020799612964, "loss": 0.4582, "step": 440 }, { "epoch": 1.2840276230786367, "grad_norm": 20.06715202331543, "learning_rate": 0.00019157298318764958, "loss": 0.4564, "step": 450 }, { "epoch": 1.3125417687680998, "grad_norm": 16.547321319580078, "learning_rate": 0.00018710616879761405, "loss": 0.4572, "step": 460 }, { "epoch": 1.341055914457563, "grad_norm": 15.825061798095703, "learning_rate": 0.0001826040185335761, "loss": 0.468, "step": 470 }, { "epoch": 1.369570060147026, "grad_norm": 15.663127899169922, "learning_rate": 0.0001780708197531268, "loss": 0.4525, "step": 480 }, { "epoch": 1.398084205836489, "grad_norm": 14.849474906921387, "learning_rate": 0.00017351088938108276, "loss": 0.4561, "step": 490 }, { "epoch": 1.4265983515259524, "grad_norm": 15.581180572509766, "learning_rate": 0.00016892856979851725, "loss": 0.4603, "step": 500 }, { "epoch": 1.4551124972154155, "grad_norm": 14.99488353729248, "learning_rate": 0.00016432822470754922, "loss": 0.446, "step": 510 }, { "epoch": 1.4836266429048786, "grad_norm": 16.289323806762695, "learning_rate": 0.00015971423497582873, "loss": 0.4534, "step": 520 }, { "epoch": 1.5121407885943419, "grad_norm": 15.169504165649414, "learning_rate": 0.00015509099446467557, "loss": 0.4502, "step": 530 }, { "epoch": 1.540654934283805, "grad_norm": 13.88201904296875, "learning_rate": 0.00015046290584484455, "loss": 0.4563, "step": 540 }, { "epoch": 1.569169079973268, "grad_norm": 14.23528003692627, "learning_rate": 0.00014583437640390112, "loss": 0.4303, "step": 550 }, { "epoch": 1.5976832256627311, "grad_norm": 13.917679786682129, "learning_rate": 0.00014120981384920065, "loss": 0.4396, "step": 560 }, { "epoch": 1.6261973713521942, "grad_norm": 15.415020942687988, "learning_rate": 0.0001365936221104682, "loss": 0.4486, "step": 570 }, { "epoch": 1.6547115170416573, "grad_norm": 13.339295387268066, "learning_rate": 0.00013199019714597526, "loss": 0.4303, "step": 580 }, { "epoch": 1.6832256627311204, "grad_norm": 15.275891304016113, "learning_rate": 0.00012740392275630802, "loss": 0.4434, "step": 590 }, { "epoch": 1.7117398084205835, "grad_norm": 15.294014930725098, "learning_rate": 0.00012283916640971304, "loss": 0.4352, "step": 600 }, { "epoch": 1.7402539541100468, "grad_norm": 13.472885131835938, "learning_rate": 0.00011830027508299607, "loss": 0.4311, "step": 610 }, { "epoch": 1.7687680997995099, "grad_norm": 13.640401840209961, "learning_rate": 0.00011379157112193487, "loss": 0.4192, "step": 620 }, { "epoch": 1.7972822454889732, "grad_norm": 14.599030494689941, "learning_rate": 0.00010931734812514786, "loss": 0.4349, "step": 630 }, { "epoch": 1.8257963911784363, "grad_norm": 13.727120399475098, "learning_rate": 0.00010488186685533828, "loss": 0.4348, "step": 640 }, { "epoch": 1.8543105368678994, "grad_norm": 14.04608154296875, "learning_rate": 0.00010048935118180787, "loss": 0.428, "step": 650 }, { "epoch": 1.8828246825573625, "grad_norm": 14.668761253356934, "learning_rate": 9.614398405810378e-05, "loss": 0.4271, "step": 660 }, { "epoch": 1.9113388282468255, "grad_norm": 15.308387756347656, "learning_rate": 9.18499035386292e-05, "loss": 0.4153, "step": 670 }, { "epoch": 1.9398529739362886, "grad_norm": 14.18338680267334, "learning_rate": 8.761119883801097e-05, "loss": 0.4237, "step": 680 }, { "epoch": 1.9683671196257517, "grad_norm": 15.16002368927002, "learning_rate": 8.343190643697685e-05, "loss": 0.4097, "step": 690 }, { "epoch": 1.9968812653152148, "grad_norm": 12.727019309997559, "learning_rate": 7.931600623845105e-05, "loss": 0.4035, "step": 700 }, { "epoch": 2.026286478057474, "grad_norm": 16.409337997436523, "learning_rate": 7.526741777752797e-05, "loss": 0.2883, "step": 710 }, { "epoch": 2.054800623746937, "grad_norm": 14.678768157958984, "learning_rate": 7.128999648893393e-05, "loss": 0.2779, "step": 720 }, { "epoch": 2.0833147694364, "grad_norm": 15.050424575805664, "learning_rate": 6.738753003553106e-05, "loss": 0.2766, "step": 730 }, { "epoch": 2.111828915125863, "grad_norm": 13.391814231872559, "learning_rate": 6.356373470135943e-05, "loss": 0.2557, "step": 740 }, { "epoch": 2.140343060815326, "grad_norm": 12.664278984069824, "learning_rate": 5.982225185265335e-05, "loss": 0.2649, "step": 750 }, { "epoch": 2.1688572065047893, "grad_norm": 12.713603019714355, "learning_rate": 5.61666444702003e-05, "loss": 0.2607, "step": 760 }, { "epoch": 2.197371352194253, "grad_norm": 14.50146770477295, "learning_rate": 5.260039375634626e-05, "loss": 0.2569, "step": 770 }, { "epoch": 2.225885497883716, "grad_norm": 14.117573738098145, "learning_rate": 4.91268958198777e-05, "loss": 0.2528, "step": 780 }, { "epoch": 2.254399643573179, "grad_norm": 13.352962493896484, "learning_rate": 4.5749458441937426e-05, "loss": 0.2594, "step": 790 }, { "epoch": 2.282913789262642, "grad_norm": 14.24978256225586, "learning_rate": 4.24712979260541e-05, "loss": 0.246, "step": 800 }, { "epoch": 2.311427934952105, "grad_norm": 14.509572982788086, "learning_rate": 3.9295536035284975e-05, "loss": 0.2456, "step": 810 }, { "epoch": 2.3399420806415683, "grad_norm": 13.541816711425781, "learning_rate": 3.622519701938879e-05, "loss": 0.2596, "step": 820 }, { "epoch": 2.3684562263310314, "grad_norm": 13.303231239318848, "learning_rate": 3.326320473485965e-05, "loss": 0.2478, "step": 830 }, { "epoch": 2.3969703720204945, "grad_norm": 13.914246559143066, "learning_rate": 3.0412379860564546e-05, "loss": 0.2471, "step": 840 }, { "epoch": 2.4254845177099575, "grad_norm": 13.636366844177246, "learning_rate": 2.7675437211635994e-05, "loss": 0.244, "step": 850 }, { "epoch": 2.4539986633994206, "grad_norm": 13.738758087158203, "learning_rate": 2.505498315417775e-05, "loss": 0.25, "step": 860 }, { "epoch": 2.4825128090888837, "grad_norm": 12.734477043151855, "learning_rate": 2.2553513123245593e-05, "loss": 0.2514, "step": 870 }, { "epoch": 2.5110269547783473, "grad_norm": 13.232802391052246, "learning_rate": 2.017340924646676e-05, "loss": 0.2517, "step": 880 }, { "epoch": 2.5395411004678103, "grad_norm": 12.584091186523438, "learning_rate": 1.791693807556106e-05, "loss": 0.2423, "step": 890 }, { "epoch": 2.5680552461572734, "grad_norm": 13.133004188537598, "learning_rate": 1.5786248427923765e-05, "loss": 0.2397, "step": 900 }, { "epoch": 2.5965693918467365, "grad_norm": 12.750874519348145, "learning_rate": 1.3783369340326038e-05, "loss": 0.2402, "step": 910 }, { "epoch": 2.6250835375361996, "grad_norm": 14.99782657623291, "learning_rate": 1.191020813668126e-05, "loss": 0.2325, "step": 920 }, { "epoch": 2.6535976832256627, "grad_norm": 13.00296401977539, "learning_rate": 1.0168548611717453e-05, "loss": 0.2447, "step": 930 }, { "epoch": 2.682111828915126, "grad_norm": 12.952726364135742, "learning_rate": 8.560049332285445e-06, "loss": 0.2324, "step": 940 }, { "epoch": 2.710625974604589, "grad_norm": 12.093839645385742, "learning_rate": 7.086242057920466e-06, "loss": 0.2374, "step": 950 }, { "epoch": 2.739140120294052, "grad_norm": 13.643256187438965, "learning_rate": 5.748530282161151e-06, "loss": 0.2375, "step": 960 }, { "epoch": 2.7676542659835155, "grad_norm": 12.18138599395752, "learning_rate": 4.548187896015132e-06, "loss": 0.2398, "step": 970 }, { "epoch": 2.796168411672978, "grad_norm": 13.081153869628906, "learning_rate": 3.4863579748440395e-06, "loss": 0.2414, "step": 980 }, { "epoch": 2.8246825573624417, "grad_norm": 12.803534507751465, "learning_rate": 2.5640516898229824e-06, "loss": 0.2353, "step": 990 }, { "epoch": 2.8531967030519048, "grad_norm": 14.616987228393555, "learning_rate": 1.7821473450112257e-06, "loss": 0.2387, "step": 1000 }, { "epoch": 2.881710848741368, "grad_norm": 13.588176727294922, "learning_rate": 1.1413895409510932e-06, "loss": 0.2393, "step": 1010 }, { "epoch": 2.910224994430831, "grad_norm": 13.362626075744629, "learning_rate": 6.423884655915035e-07, "loss": 0.2452, "step": 1020 }, { "epoch": 2.938739140120294, "grad_norm": 13.089447021484375, "learning_rate": 2.85619313211366e-07, "loss": 0.2442, "step": 1030 }, { "epoch": 2.967253285809757, "grad_norm": 14.710633277893066, "learning_rate": 7.142183189641215e-08, "loss": 0.2395, "step": 1040 }, { "epoch": 2.99576743149922, "grad_norm": 13.993697166442871, "learning_rate": 0.0, "loss": 0.2403, "step": 1050 }, { "epoch": 2.99576743149922, "step": 1050, "total_flos": 1.2082385668508221e+18, "train_loss": 0.4443624080930437, "train_runtime": 23649.7754, "train_samples_per_second": 5.694, "train_steps_per_second": 0.044 } ], "logging_steps": 10, "max_steps": 1050, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2082385668508221e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }