{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.99576743149922,
"eval_steps": 500,
"global_step": 1050,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.028514145689463134,
"grad_norm": 59.78074645996094,
"learning_rate": 9.374999999999999e-05,
"loss": 0.7842,
"step": 10
},
{
"epoch": 0.05702829137892627,
"grad_norm": 34.404449462890625,
"learning_rate": 0.00018749999999999998,
"loss": 0.7352,
"step": 20
},
{
"epoch": 0.08554243706838939,
"grad_norm": 32.21347427368164,
"learning_rate": 0.00028125,
"loss": 0.7214,
"step": 30
},
{
"epoch": 0.11405658275785253,
"grad_norm": 58.45460891723633,
"learning_rate": 0.00029995428872157097,
"loss": 0.7276,
"step": 40
},
{
"epoch": 0.14257072844731566,
"grad_norm": 34.381004333496094,
"learning_rate": 0.00029976863440074164,
"loss": 0.7381,
"step": 50
},
{
"epoch": 0.17108487413677878,
"grad_norm": 23.00215721130371,
"learning_rate": 0.0002994403567435186,
"loss": 0.696,
"step": 60
},
{
"epoch": 0.19959901982624192,
"grad_norm": 28.15259552001953,
"learning_rate": 0.0002989697683657906,
"loss": 0.6695,
"step": 70
},
{
"epoch": 0.22811316551570507,
"grad_norm": 23.132097244262695,
"learning_rate": 0.0002983573174046776,
"loss": 0.6745,
"step": 80
},
{
"epoch": 0.2566273112051682,
"grad_norm": 26.72660255432129,
"learning_rate": 0.00029760358709177425,
"loss": 0.6742,
"step": 90
},
{
"epoch": 0.28514145689463133,
"grad_norm": 21.0267391204834,
"learning_rate": 0.00029670929519774324,
"loss": 0.6797,
"step": 100
},
{
"epoch": 0.3136556025840945,
"grad_norm": 24.07339096069336,
"learning_rate": 0.0002956752933487888,
"loss": 0.6582,
"step": 110
},
{
"epoch": 0.34216974827355756,
"grad_norm": 32.077362060546875,
"learning_rate": 0.00029450256621566076,
"loss": 0.6531,
"step": 120
},
{
"epoch": 0.3706838939630207,
"grad_norm": 23.832252502441406,
"learning_rate": 0.0002931922305759614,
"loss": 0.6584,
"step": 130
},
{
"epoch": 0.39919803965248385,
"grad_norm": 21.751239776611328,
"learning_rate": 0.00029174553425064773,
"loss": 0.6557,
"step": 140
},
{
"epoch": 0.427712185341947,
"grad_norm": 24.70648956298828,
"learning_rate": 0.00029016385491574314,
"loss": 0.6376,
"step": 150
},
{
"epoch": 0.45622633103141014,
"grad_norm": 23.676149368286133,
"learning_rate": 0.00028844869879038863,
"loss": 0.6424,
"step": 160
},
{
"epoch": 0.4847404767208732,
"grad_norm": 26.275875091552734,
"learning_rate": 0.0002866016992024837,
"loss": 0.633,
"step": 170
},
{
"epoch": 0.5132546224103364,
"grad_norm": 23.403223037719727,
"learning_rate": 0.0002846246150332827,
"loss": 0.6203,
"step": 180
},
{
"epoch": 0.5417687680997995,
"grad_norm": 23.481271743774414,
"learning_rate": 0.000282519329042428,
"loss": 0.6211,
"step": 190
},
{
"epoch": 0.5702829137892627,
"grad_norm": 20.71142578125,
"learning_rate": 0.00028028784607501473,
"loss": 0.6232,
"step": 200
},
{
"epoch": 0.5987970594787257,
"grad_norm": 20.313270568847656,
"learning_rate": 0.00027793229115239456,
"loss": 0.6035,
"step": 210
},
{
"epoch": 0.627311205168189,
"grad_norm": 15.384033203125,
"learning_rate": 0.0002754549074485369,
"loss": 0.6082,
"step": 220
},
{
"epoch": 0.655825350857652,
"grad_norm": 20.352094650268555,
"learning_rate": 0.0002728580541538743,
"loss": 0.6045,
"step": 230
},
{
"epoch": 0.6843394965471151,
"grad_norm": 20.499040603637695,
"learning_rate": 0.0002701442042286665,
"loss": 0.6077,
"step": 240
},
{
"epoch": 0.7128536422365783,
"grad_norm": 19.507705688476562,
"learning_rate": 0.000267315942048022,
"loss": 0.6052,
"step": 250
},
{
"epoch": 0.7413677879260414,
"grad_norm": 23.443056106567383,
"learning_rate": 0.0002643759609408212,
"loss": 0.5917,
"step": 260
},
{
"epoch": 0.7698819336155046,
"grad_norm": 22.090147018432617,
"learning_rate": 0.00026132706062488294,
"loss": 0.596,
"step": 270
},
{
"epoch": 0.7983960793049677,
"grad_norm": 22.015439987182617,
"learning_rate": 0.0002581721445408184,
"loss": 0.5923,
"step": 280
},
{
"epoch": 0.8269102249944308,
"grad_norm": 19.542490005493164,
"learning_rate": 0.0002549142170871103,
"loss": 0.5955,
"step": 290
},
{
"epoch": 0.855424370683894,
"grad_norm": 17.32285499572754,
"learning_rate": 0.00025155638075905097,
"loss": 0.566,
"step": 300
},
{
"epoch": 0.8839385163733571,
"grad_norm": 18.898284912109375,
"learning_rate": 0.00024810183319426394,
"loss": 0.5677,
"step": 310
},
{
"epoch": 0.9124526620628203,
"grad_norm": 16.297840118408203,
"learning_rate": 0.00024455386412762184,
"loss": 0.577,
"step": 320
},
{
"epoch": 0.9409668077522834,
"grad_norm": 22.982707977294922,
"learning_rate": 0.00024091585225846125,
"loss": 0.57,
"step": 330
},
{
"epoch": 0.9694809534417465,
"grad_norm": 20.184415817260742,
"learning_rate": 0.00023719126203307778,
"loss": 0.5743,
"step": 340
},
{
"epoch": 0.9979950991312097,
"grad_norm": 16.89832878112793,
"learning_rate": 0.00023338364034556413,
"loss": 0.5663,
"step": 350
},
{
"epoch": 1.0274003118734685,
"grad_norm": 16.95356559753418,
"learning_rate": 0.00022949661316013482,
"loss": 0.4709,
"step": 360
},
{
"epoch": 1.0559144575629316,
"grad_norm": 20.939350128173828,
"learning_rate": 0.0002255338820581528,
"loss": 0.4702,
"step": 370
},
{
"epoch": 1.0844286032523947,
"grad_norm": 19.86914825439453,
"learning_rate": 0.0002214992207131462,
"loss": 0.4728,
"step": 380
},
{
"epoch": 1.112942748941858,
"grad_norm": 19.765581130981445,
"learning_rate": 0.0002173964712971729,
"loss": 0.4664,
"step": 390
},
{
"epoch": 1.141456894631321,
"grad_norm": 16.14029884338379,
"learning_rate": 0.00021322954082195433,
"loss": 0.4696,
"step": 400
},
{
"epoch": 1.1699710403207841,
"grad_norm": 17.055089950561523,
"learning_rate": 0.00020900239741826278,
"loss": 0.4717,
"step": 410
},
{
"epoch": 1.1984851860102472,
"grad_norm": 14.829668045043945,
"learning_rate": 0.00020471906655710603,
"loss": 0.4716,
"step": 420
},
{
"epoch": 1.2269993316997103,
"grad_norm": 15.102470397949219,
"learning_rate": 0.00020038362721630696,
"loss": 0.4597,
"step": 430
},
{
"epoch": 1.2555134773891736,
"grad_norm": 19.483240127563477,
"learning_rate": 0.00019600020799612964,
"loss": 0.4582,
"step": 440
},
{
"epoch": 1.2840276230786367,
"grad_norm": 20.06715202331543,
"learning_rate": 0.00019157298318764958,
"loss": 0.4564,
"step": 450
},
{
"epoch": 1.3125417687680998,
"grad_norm": 16.547321319580078,
"learning_rate": 0.00018710616879761405,
"loss": 0.4572,
"step": 460
},
{
"epoch": 1.341055914457563,
"grad_norm": 15.825061798095703,
"learning_rate": 0.0001826040185335761,
"loss": 0.468,
"step": 470
},
{
"epoch": 1.369570060147026,
"grad_norm": 15.663127899169922,
"learning_rate": 0.0001780708197531268,
"loss": 0.4525,
"step": 480
},
{
"epoch": 1.398084205836489,
"grad_norm": 14.849474906921387,
"learning_rate": 0.00017351088938108276,
"loss": 0.4561,
"step": 490
},
{
"epoch": 1.4265983515259524,
"grad_norm": 15.581180572509766,
"learning_rate": 0.00016892856979851725,
"loss": 0.4603,
"step": 500
},
{
"epoch": 1.4551124972154155,
"grad_norm": 14.99488353729248,
"learning_rate": 0.00016432822470754922,
"loss": 0.446,
"step": 510
},
{
"epoch": 1.4836266429048786,
"grad_norm": 16.289323806762695,
"learning_rate": 0.00015971423497582873,
"loss": 0.4534,
"step": 520
},
{
"epoch": 1.5121407885943419,
"grad_norm": 15.169504165649414,
"learning_rate": 0.00015509099446467557,
"loss": 0.4502,
"step": 530
},
{
"epoch": 1.540654934283805,
"grad_norm": 13.88201904296875,
"learning_rate": 0.00015046290584484455,
"loss": 0.4563,
"step": 540
},
{
"epoch": 1.569169079973268,
"grad_norm": 14.23528003692627,
"learning_rate": 0.00014583437640390112,
"loss": 0.4303,
"step": 550
},
{
"epoch": 1.5976832256627311,
"grad_norm": 13.917679786682129,
"learning_rate": 0.00014120981384920065,
"loss": 0.4396,
"step": 560
},
{
"epoch": 1.6261973713521942,
"grad_norm": 15.415020942687988,
"learning_rate": 0.0001365936221104682,
"loss": 0.4486,
"step": 570
},
{
"epoch": 1.6547115170416573,
"grad_norm": 13.339295387268066,
"learning_rate": 0.00013199019714597526,
"loss": 0.4303,
"step": 580
},
{
"epoch": 1.6832256627311204,
"grad_norm": 15.275891304016113,
"learning_rate": 0.00012740392275630802,
"loss": 0.4434,
"step": 590
},
{
"epoch": 1.7117398084205835,
"grad_norm": 15.294014930725098,
"learning_rate": 0.00012283916640971304,
"loss": 0.4352,
"step": 600
},
{
"epoch": 1.7402539541100468,
"grad_norm": 13.472885131835938,
"learning_rate": 0.00011830027508299607,
"loss": 0.4311,
"step": 610
},
{
"epoch": 1.7687680997995099,
"grad_norm": 13.640401840209961,
"learning_rate": 0.00011379157112193487,
"loss": 0.4192,
"step": 620
},
{
"epoch": 1.7972822454889732,
"grad_norm": 14.599030494689941,
"learning_rate": 0.00010931734812514786,
"loss": 0.4349,
"step": 630
},
{
"epoch": 1.8257963911784363,
"grad_norm": 13.727120399475098,
"learning_rate": 0.00010488186685533828,
"loss": 0.4348,
"step": 640
},
{
"epoch": 1.8543105368678994,
"grad_norm": 14.04608154296875,
"learning_rate": 0.00010048935118180787,
"loss": 0.428,
"step": 650
},
{
"epoch": 1.8828246825573625,
"grad_norm": 14.668761253356934,
"learning_rate": 9.614398405810378e-05,
"loss": 0.4271,
"step": 660
},
{
"epoch": 1.9113388282468255,
"grad_norm": 15.308387756347656,
"learning_rate": 9.18499035386292e-05,
"loss": 0.4153,
"step": 670
},
{
"epoch": 1.9398529739362886,
"grad_norm": 14.18338680267334,
"learning_rate": 8.761119883801097e-05,
"loss": 0.4237,
"step": 680
},
{
"epoch": 1.9683671196257517,
"grad_norm": 15.16002368927002,
"learning_rate": 8.343190643697685e-05,
"loss": 0.4097,
"step": 690
},
{
"epoch": 1.9968812653152148,
"grad_norm": 12.727019309997559,
"learning_rate": 7.931600623845105e-05,
"loss": 0.4035,
"step": 700
},
{
"epoch": 2.026286478057474,
"grad_norm": 16.409337997436523,
"learning_rate": 7.526741777752797e-05,
"loss": 0.2883,
"step": 710
},
{
"epoch": 2.054800623746937,
"grad_norm": 14.678768157958984,
"learning_rate": 7.128999648893393e-05,
"loss": 0.2779,
"step": 720
},
{
"epoch": 2.0833147694364,
"grad_norm": 15.050424575805664,
"learning_rate": 6.738753003553106e-05,
"loss": 0.2766,
"step": 730
},
{
"epoch": 2.111828915125863,
"grad_norm": 13.391814231872559,
"learning_rate": 6.356373470135943e-05,
"loss": 0.2557,
"step": 740
},
{
"epoch": 2.140343060815326,
"grad_norm": 12.664278984069824,
"learning_rate": 5.982225185265335e-05,
"loss": 0.2649,
"step": 750
},
{
"epoch": 2.1688572065047893,
"grad_norm": 12.713603019714355,
"learning_rate": 5.61666444702003e-05,
"loss": 0.2607,
"step": 760
},
{
"epoch": 2.197371352194253,
"grad_norm": 14.50146770477295,
"learning_rate": 5.260039375634626e-05,
"loss": 0.2569,
"step": 770
},
{
"epoch": 2.225885497883716,
"grad_norm": 14.117573738098145,
"learning_rate": 4.91268958198777e-05,
"loss": 0.2528,
"step": 780
},
{
"epoch": 2.254399643573179,
"grad_norm": 13.352962493896484,
"learning_rate": 4.5749458441937426e-05,
"loss": 0.2594,
"step": 790
},
{
"epoch": 2.282913789262642,
"grad_norm": 14.24978256225586,
"learning_rate": 4.24712979260541e-05,
"loss": 0.246,
"step": 800
},
{
"epoch": 2.311427934952105,
"grad_norm": 14.509572982788086,
"learning_rate": 3.9295536035284975e-05,
"loss": 0.2456,
"step": 810
},
{
"epoch": 2.3399420806415683,
"grad_norm": 13.541816711425781,
"learning_rate": 3.622519701938879e-05,
"loss": 0.2596,
"step": 820
},
{
"epoch": 2.3684562263310314,
"grad_norm": 13.303231239318848,
"learning_rate": 3.326320473485965e-05,
"loss": 0.2478,
"step": 830
},
{
"epoch": 2.3969703720204945,
"grad_norm": 13.914246559143066,
"learning_rate": 3.0412379860564546e-05,
"loss": 0.2471,
"step": 840
},
{
"epoch": 2.4254845177099575,
"grad_norm": 13.636366844177246,
"learning_rate": 2.7675437211635994e-05,
"loss": 0.244,
"step": 850
},
{
"epoch": 2.4539986633994206,
"grad_norm": 13.738758087158203,
"learning_rate": 2.505498315417775e-05,
"loss": 0.25,
"step": 860
},
{
"epoch": 2.4825128090888837,
"grad_norm": 12.734477043151855,
"learning_rate": 2.2553513123245593e-05,
"loss": 0.2514,
"step": 870
},
{
"epoch": 2.5110269547783473,
"grad_norm": 13.232802391052246,
"learning_rate": 2.017340924646676e-05,
"loss": 0.2517,
"step": 880
},
{
"epoch": 2.5395411004678103,
"grad_norm": 12.584091186523438,
"learning_rate": 1.791693807556106e-05,
"loss": 0.2423,
"step": 890
},
{
"epoch": 2.5680552461572734,
"grad_norm": 13.133004188537598,
"learning_rate": 1.5786248427923765e-05,
"loss": 0.2397,
"step": 900
},
{
"epoch": 2.5965693918467365,
"grad_norm": 12.750874519348145,
"learning_rate": 1.3783369340326038e-05,
"loss": 0.2402,
"step": 910
},
{
"epoch": 2.6250835375361996,
"grad_norm": 14.99782657623291,
"learning_rate": 1.191020813668126e-05,
"loss": 0.2325,
"step": 920
},
{
"epoch": 2.6535976832256627,
"grad_norm": 13.00296401977539,
"learning_rate": 1.0168548611717453e-05,
"loss": 0.2447,
"step": 930
},
{
"epoch": 2.682111828915126,
"grad_norm": 12.952726364135742,
"learning_rate": 8.560049332285445e-06,
"loss": 0.2324,
"step": 940
},
{
"epoch": 2.710625974604589,
"grad_norm": 12.093839645385742,
"learning_rate": 7.086242057920466e-06,
"loss": 0.2374,
"step": 950
},
{
"epoch": 2.739140120294052,
"grad_norm": 13.643256187438965,
"learning_rate": 5.748530282161151e-06,
"loss": 0.2375,
"step": 960
},
{
"epoch": 2.7676542659835155,
"grad_norm": 12.18138599395752,
"learning_rate": 4.548187896015132e-06,
"loss": 0.2398,
"step": 970
},
{
"epoch": 2.796168411672978,
"grad_norm": 13.081153869628906,
"learning_rate": 3.4863579748440395e-06,
"loss": 0.2414,
"step": 980
},
{
"epoch": 2.8246825573624417,
"grad_norm": 12.803534507751465,
"learning_rate": 2.5640516898229824e-06,
"loss": 0.2353,
"step": 990
},
{
"epoch": 2.8531967030519048,
"grad_norm": 14.616987228393555,
"learning_rate": 1.7821473450112257e-06,
"loss": 0.2387,
"step": 1000
},
{
"epoch": 2.881710848741368,
"grad_norm": 13.588176727294922,
"learning_rate": 1.1413895409510932e-06,
"loss": 0.2393,
"step": 1010
},
{
"epoch": 2.910224994430831,
"grad_norm": 13.362626075744629,
"learning_rate": 6.423884655915035e-07,
"loss": 0.2452,
"step": 1020
},
{
"epoch": 2.938739140120294,
"grad_norm": 13.089447021484375,
"learning_rate": 2.85619313211366e-07,
"loss": 0.2442,
"step": 1030
},
{
"epoch": 2.967253285809757,
"grad_norm": 14.710633277893066,
"learning_rate": 7.142183189641215e-08,
"loss": 0.2395,
"step": 1040
},
{
"epoch": 2.99576743149922,
"grad_norm": 13.993697166442871,
"learning_rate": 0.0,
"loss": 0.2403,
"step": 1050
}
],
"logging_steps": 10,
"max_steps": 1050,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.2082385668508221e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}