diff --git "a/checkpoint-1000/trainer_state.json" "b/checkpoint-1000/trainer_state.json" deleted file mode 100644--- "a/checkpoint-1000/trainer_state.json" +++ /dev/null @@ -1,7021 +0,0 @@ -{ - "best_metric": null, - "best_model_checkpoint": null, - "epoch": 0.45714285714285713, - "eval_steps": 500, - "global_step": 1000, - "is_hyper_param_search": false, - "is_local_process_zero": true, - "is_world_process_zero": true, - "log_history": [ - { - "epoch": 0.0, - "grad_norm": 2.563270330429077, - "learning_rate": 4e-05, - "loss": 2.4127, - "step": 1 - }, - { - "epoch": 0.0, - "grad_norm": 2.333299160003662, - "learning_rate": 8e-05, - "loss": 2.3102, - "step": 2 - }, - { - "epoch": 0.0, - "grad_norm": 1.8613317012786865, - "learning_rate": 0.00012, - "loss": 2.1983, - "step": 3 - }, - { - "epoch": 0.0, - "grad_norm": 1.620126485824585, - "learning_rate": 0.00016, - "loss": 2.0917, - "step": 4 - }, - { - "epoch": 0.0, - "grad_norm": 1.0352119207382202, - "learning_rate": 0.0002, - "loss": 1.9613, - "step": 5 - }, - { - "epoch": 0.0, - "grad_norm": 0.8601917624473572, - "learning_rate": 0.00019979899497487438, - "loss": 1.6927, - "step": 6 - }, - { - "epoch": 0.0, - "grad_norm": 1.273868203163147, - "learning_rate": 0.00019959798994974876, - "loss": 1.6828, - "step": 7 - }, - { - "epoch": 0.0, - "grad_norm": 1.0873847007751465, - "learning_rate": 0.00019939698492462313, - "loss": 1.5088, - "step": 8 - }, - { - "epoch": 0.0, - "grad_norm": 0.5800795555114746, - "learning_rate": 0.0001991959798994975, - "loss": 1.3702, - "step": 9 - }, - { - "epoch": 0.0, - "grad_norm": 0.693160355091095, - "learning_rate": 0.00019899497487437187, - "loss": 1.3718, - "step": 10 - }, - { - "epoch": 0.01, - "grad_norm": 0.49409618973731995, - "learning_rate": 0.00019879396984924622, - "loss": 1.3583, - "step": 11 - }, - { - "epoch": 0.01, - "grad_norm": 0.47029319405555725, - "learning_rate": 0.00019859296482412062, - "loss": 1.2791, - "step": 12 - }, - { - "epoch": 0.01, - "grad_norm": 0.43618088960647583, - "learning_rate": 0.000198391959798995, - "loss": 1.3161, - "step": 13 - }, - { - "epoch": 0.01, - "grad_norm": 0.3907912075519562, - "learning_rate": 0.00019819095477386937, - "loss": 1.2954, - "step": 14 - }, - { - "epoch": 0.01, - "grad_norm": 0.6292415857315063, - "learning_rate": 0.0001979899497487437, - "loss": 1.3397, - "step": 15 - }, - { - "epoch": 0.01, - "grad_norm": 0.37423521280288696, - "learning_rate": 0.0001977889447236181, - "loss": 1.3983, - "step": 16 - }, - { - "epoch": 0.01, - "grad_norm": 0.3845643699169159, - "learning_rate": 0.00019758793969849249, - "loss": 1.3349, - "step": 17 - }, - { - "epoch": 0.01, - "grad_norm": 0.3657298982143402, - "learning_rate": 0.00019738693467336683, - "loss": 1.2767, - "step": 18 - }, - { - "epoch": 0.01, - "grad_norm": 0.3727971315383911, - "learning_rate": 0.0001971859296482412, - "loss": 1.3672, - "step": 19 - }, - { - "epoch": 0.01, - "grad_norm": 0.35123032331466675, - "learning_rate": 0.0001969849246231156, - "loss": 1.3692, - "step": 20 - }, - { - "epoch": 0.01, - "grad_norm": 0.4003850221633911, - "learning_rate": 0.00019678391959798995, - "loss": 1.3412, - "step": 21 - }, - { - "epoch": 0.01, - "grad_norm": 0.3638221323490143, - "learning_rate": 0.00019658291457286432, - "loss": 1.2813, - "step": 22 - }, - { - "epoch": 0.01, - "grad_norm": 0.391216903924942, - "learning_rate": 0.0001963819095477387, - "loss": 1.2853, - "step": 23 - }, - { - "epoch": 0.01, - "grad_norm": 0.4370620846748352, - "learning_rate": 0.0001961809045226131, - "loss": 1.2524, - "step": 24 - }, - { - "epoch": 0.01, - "grad_norm": 0.3566085696220398, - "learning_rate": 0.00019597989949748744, - "loss": 1.3192, - "step": 25 - }, - { - "epoch": 0.01, - "grad_norm": 0.35438084602355957, - "learning_rate": 0.00019577889447236181, - "loss": 1.2858, - "step": 26 - }, - { - "epoch": 0.01, - "grad_norm": 0.3968108296394348, - "learning_rate": 0.0001955778894472362, - "loss": 1.3112, - "step": 27 - }, - { - "epoch": 0.01, - "grad_norm": 0.36512017250061035, - "learning_rate": 0.00019537688442211056, - "loss": 1.278, - "step": 28 - }, - { - "epoch": 0.01, - "grad_norm": 0.3982504606246948, - "learning_rate": 0.00019517587939698493, - "loss": 1.2392, - "step": 29 - }, - { - "epoch": 0.01, - "grad_norm": 0.38377949595451355, - "learning_rate": 0.0001949748743718593, - "loss": 1.2843, - "step": 30 - }, - { - "epoch": 0.01, - "grad_norm": 0.3582867980003357, - "learning_rate": 0.00019477386934673368, - "loss": 1.3008, - "step": 31 - }, - { - "epoch": 0.01, - "grad_norm": 0.3572194576263428, - "learning_rate": 0.00019457286432160805, - "loss": 1.294, - "step": 32 - }, - { - "epoch": 0.02, - "grad_norm": 0.35502907633781433, - "learning_rate": 0.00019437185929648243, - "loss": 1.3877, - "step": 33 - }, - { - "epoch": 0.02, - "grad_norm": 0.3649040460586548, - "learning_rate": 0.0001941708542713568, - "loss": 1.2966, - "step": 34 - }, - { - "epoch": 0.02, - "grad_norm": 0.3649256229400635, - "learning_rate": 0.00019396984924623117, - "loss": 1.2354, - "step": 35 - }, - { - "epoch": 0.02, - "grad_norm": 0.36085084080696106, - "learning_rate": 0.00019376884422110552, - "loss": 1.2409, - "step": 36 - }, - { - "epoch": 0.02, - "grad_norm": 0.35929059982299805, - "learning_rate": 0.00019356783919597992, - "loss": 1.243, - "step": 37 - }, - { - "epoch": 0.02, - "grad_norm": 0.3897881805896759, - "learning_rate": 0.0001933668341708543, - "loss": 1.3945, - "step": 38 - }, - { - "epoch": 0.02, - "grad_norm": 0.35484543442726135, - "learning_rate": 0.00019316582914572864, - "loss": 1.3433, - "step": 39 - }, - { - "epoch": 0.02, - "grad_norm": 0.35691192746162415, - "learning_rate": 0.000192964824120603, - "loss": 1.3243, - "step": 40 - }, - { - "epoch": 0.02, - "grad_norm": 0.3804129958152771, - "learning_rate": 0.0001927638190954774, - "loss": 1.2509, - "step": 41 - }, - { - "epoch": 0.02, - "grad_norm": 0.3623339831829071, - "learning_rate": 0.00019256281407035178, - "loss": 1.1799, - "step": 42 - }, - { - "epoch": 0.02, - "grad_norm": 0.3411855697631836, - "learning_rate": 0.00019236180904522613, - "loss": 1.2372, - "step": 43 - }, - { - "epoch": 0.02, - "grad_norm": 0.36590930819511414, - "learning_rate": 0.0001921608040201005, - "loss": 1.2585, - "step": 44 - }, - { - "epoch": 0.02, - "grad_norm": 0.30974116921424866, - "learning_rate": 0.0001919597989949749, - "loss": 1.2974, - "step": 45 - }, - { - "epoch": 0.02, - "grad_norm": 0.32794803380966187, - "learning_rate": 0.00019175879396984925, - "loss": 1.2696, - "step": 46 - }, - { - "epoch": 0.02, - "grad_norm": 0.33263906836509705, - "learning_rate": 0.00019155778894472362, - "loss": 1.3209, - "step": 47 - }, - { - "epoch": 0.02, - "grad_norm": 0.31748828291893005, - "learning_rate": 0.000191356783919598, - "loss": 1.278, - "step": 48 - }, - { - "epoch": 0.02, - "grad_norm": 0.34738561511039734, - "learning_rate": 0.0001911557788944724, - "loss": 1.2105, - "step": 49 - }, - { - "epoch": 0.02, - "grad_norm": 0.3313944339752197, - "learning_rate": 0.00019095477386934674, - "loss": 1.2527, - "step": 50 - }, - { - "epoch": 0.02, - "grad_norm": 0.33137476444244385, - "learning_rate": 0.0001907537688442211, - "loss": 1.2984, - "step": 51 - }, - { - "epoch": 0.02, - "grad_norm": 0.31752490997314453, - "learning_rate": 0.00019055276381909548, - "loss": 1.307, - "step": 52 - }, - { - "epoch": 0.02, - "grad_norm": 0.3111082911491394, - "learning_rate": 0.00019035175879396986, - "loss": 1.2769, - "step": 53 - }, - { - "epoch": 0.02, - "grad_norm": 0.31065696477890015, - "learning_rate": 0.00019015075376884423, - "loss": 1.3082, - "step": 54 - }, - { - "epoch": 0.03, - "grad_norm": 0.3382773697376251, - "learning_rate": 0.0001899497487437186, - "loss": 1.2744, - "step": 55 - }, - { - "epoch": 0.03, - "grad_norm": 0.34320947527885437, - "learning_rate": 0.00018974874371859298, - "loss": 1.3013, - "step": 56 - }, - { - "epoch": 0.03, - "grad_norm": 0.33131280541419983, - "learning_rate": 0.00018954773869346732, - "loss": 1.4066, - "step": 57 - }, - { - "epoch": 0.03, - "grad_norm": 0.3357389569282532, - "learning_rate": 0.00018934673366834172, - "loss": 1.2841, - "step": 58 - }, - { - "epoch": 0.03, - "grad_norm": 0.3200838267803192, - "learning_rate": 0.0001891457286432161, - "loss": 1.2654, - "step": 59 - }, - { - "epoch": 0.03, - "grad_norm": 0.3336584270000458, - "learning_rate": 0.00018894472361809047, - "loss": 1.1716, - "step": 60 - }, - { - "epoch": 0.03, - "grad_norm": 0.3128441274166107, - "learning_rate": 0.00018874371859296481, - "loss": 1.3009, - "step": 61 - }, - { - "epoch": 0.03, - "grad_norm": 0.30249112844467163, - "learning_rate": 0.00018854271356783921, - "loss": 1.2311, - "step": 62 - }, - { - "epoch": 0.03, - "grad_norm": 0.3263241946697235, - "learning_rate": 0.0001883417085427136, - "loss": 1.2344, - "step": 63 - }, - { - "epoch": 0.03, - "grad_norm": 0.32348358631134033, - "learning_rate": 0.00018814070351758793, - "loss": 1.3023, - "step": 64 - }, - { - "epoch": 0.03, - "grad_norm": 0.6508419513702393, - "learning_rate": 0.0001879396984924623, - "loss": 1.2028, - "step": 65 - }, - { - "epoch": 0.03, - "grad_norm": 0.34560996294021606, - "learning_rate": 0.0001877386934673367, - "loss": 1.389, - "step": 66 - }, - { - "epoch": 0.03, - "grad_norm": 0.36555996537208557, - "learning_rate": 0.00018753768844221108, - "loss": 1.3653, - "step": 67 - }, - { - "epoch": 0.03, - "grad_norm": 0.3195466697216034, - "learning_rate": 0.00018733668341708543, - "loss": 1.2412, - "step": 68 - }, - { - "epoch": 0.03, - "grad_norm": 0.30555933713912964, - "learning_rate": 0.0001871356783919598, - "loss": 1.2357, - "step": 69 - }, - { - "epoch": 0.03, - "grad_norm": 0.30776411294937134, - "learning_rate": 0.0001869346733668342, - "loss": 1.3112, - "step": 70 - }, - { - "epoch": 0.03, - "grad_norm": 0.31933915615081787, - "learning_rate": 0.00018673366834170854, - "loss": 1.1951, - "step": 71 - }, - { - "epoch": 0.03, - "grad_norm": 0.3241545259952545, - "learning_rate": 0.00018653266331658292, - "loss": 1.2717, - "step": 72 - }, - { - "epoch": 0.03, - "grad_norm": 0.3117482364177704, - "learning_rate": 0.0001863316582914573, - "loss": 1.3031, - "step": 73 - }, - { - "epoch": 0.03, - "grad_norm": 0.33056551218032837, - "learning_rate": 0.0001861306532663317, - "loss": 1.2098, - "step": 74 - }, - { - "epoch": 0.03, - "grad_norm": 0.32441195845603943, - "learning_rate": 0.00018592964824120604, - "loss": 1.2135, - "step": 75 - }, - { - "epoch": 0.03, - "grad_norm": 0.34216779470443726, - "learning_rate": 0.0001857286432160804, - "loss": 1.2531, - "step": 76 - }, - { - "epoch": 0.04, - "grad_norm": 0.32885247468948364, - "learning_rate": 0.00018552763819095478, - "loss": 1.3054, - "step": 77 - }, - { - "epoch": 0.04, - "grad_norm": 0.34541794657707214, - "learning_rate": 0.00018532663316582915, - "loss": 1.3207, - "step": 78 - }, - { - "epoch": 0.04, - "grad_norm": 0.30329057574272156, - "learning_rate": 0.00018512562814070353, - "loss": 1.2652, - "step": 79 - }, - { - "epoch": 0.04, - "grad_norm": 0.31469491124153137, - "learning_rate": 0.0001849246231155779, - "loss": 1.1961, - "step": 80 - }, - { - "epoch": 0.04, - "grad_norm": 0.3181230127811432, - "learning_rate": 0.00018472361809045227, - "loss": 1.3111, - "step": 81 - }, - { - "epoch": 0.04, - "grad_norm": 0.3181725740432739, - "learning_rate": 0.00018452261306532662, - "loss": 1.3353, - "step": 82 - }, - { - "epoch": 0.04, - "grad_norm": 0.3154084384441376, - "learning_rate": 0.00018432160804020102, - "loss": 1.2418, - "step": 83 - }, - { - "epoch": 0.04, - "grad_norm": 0.35061103105545044, - "learning_rate": 0.0001841206030150754, - "loss": 1.2332, - "step": 84 - }, - { - "epoch": 0.04, - "grad_norm": 0.3259966969490051, - "learning_rate": 0.00018391959798994977, - "loss": 1.3633, - "step": 85 - }, - { - "epoch": 0.04, - "grad_norm": 0.31192857027053833, - "learning_rate": 0.0001837185929648241, - "loss": 1.1886, - "step": 86 - }, - { - "epoch": 0.04, - "grad_norm": 0.32024237513542175, - "learning_rate": 0.0001835175879396985, - "loss": 1.2141, - "step": 87 - }, - { - "epoch": 0.04, - "grad_norm": 0.302498459815979, - "learning_rate": 0.00018331658291457288, - "loss": 1.237, - "step": 88 - }, - { - "epoch": 0.04, - "grad_norm": 0.3569789230823517, - "learning_rate": 0.00018311557788944723, - "loss": 1.3015, - "step": 89 - }, - { - "epoch": 0.04, - "grad_norm": 0.3121156692504883, - "learning_rate": 0.0001829145728643216, - "loss": 1.281, - "step": 90 - }, - { - "epoch": 0.04, - "grad_norm": 0.31279826164245605, - "learning_rate": 0.000182713567839196, - "loss": 1.2924, - "step": 91 - }, - { - "epoch": 0.04, - "grad_norm": 0.3210877478122711, - "learning_rate": 0.00018251256281407038, - "loss": 1.3082, - "step": 92 - }, - { - "epoch": 0.04, - "grad_norm": 0.331406831741333, - "learning_rate": 0.00018231155778894472, - "loss": 1.2434, - "step": 93 - }, - { - "epoch": 0.04, - "grad_norm": 0.3135213255882263, - "learning_rate": 0.0001821105527638191, - "loss": 1.2188, - "step": 94 - }, - { - "epoch": 0.04, - "grad_norm": 0.31146401166915894, - "learning_rate": 0.0001819095477386935, - "loss": 1.2484, - "step": 95 - }, - { - "epoch": 0.04, - "grad_norm": 0.32071712613105774, - "learning_rate": 0.00018170854271356784, - "loss": 1.1927, - "step": 96 - }, - { - "epoch": 0.04, - "grad_norm": 0.3343571722507477, - "learning_rate": 0.00018150753768844221, - "loss": 1.3443, - "step": 97 - }, - { - "epoch": 0.04, - "grad_norm": 0.3510550558567047, - "learning_rate": 0.0001813065326633166, - "loss": 1.2832, - "step": 98 - }, - { - "epoch": 0.05, - "grad_norm": 0.33436939120292664, - "learning_rate": 0.00018110552763819096, - "loss": 1.252, - "step": 99 - }, - { - "epoch": 0.05, - "grad_norm": 0.3175451159477234, - "learning_rate": 0.00018090452261306533, - "loss": 1.251, - "step": 100 - }, - { - "epoch": 0.05, - "grad_norm": 0.32603979110717773, - "learning_rate": 0.0001807035175879397, - "loss": 1.228, - "step": 101 - }, - { - "epoch": 0.05, - "grad_norm": 0.3073003590106964, - "learning_rate": 0.00018050251256281408, - "loss": 1.2659, - "step": 102 - }, - { - "epoch": 0.05, - "grad_norm": 0.3285619616508484, - "learning_rate": 0.00018030150753768845, - "loss": 1.2826, - "step": 103 - }, - { - "epoch": 0.05, - "grad_norm": 0.3038572072982788, - "learning_rate": 0.00018010050251256282, - "loss": 1.217, - "step": 104 - }, - { - "epoch": 0.05, - "grad_norm": 0.35778746008872986, - "learning_rate": 0.0001798994974874372, - "loss": 1.2901, - "step": 105 - }, - { - "epoch": 0.05, - "grad_norm": 0.2900612950325012, - "learning_rate": 0.00017969849246231157, - "loss": 1.2651, - "step": 106 - }, - { - "epoch": 0.05, - "grad_norm": 0.32928743958473206, - "learning_rate": 0.00017949748743718592, - "loss": 1.3143, - "step": 107 - }, - { - "epoch": 0.05, - "grad_norm": 0.32471874356269836, - "learning_rate": 0.00017929648241206032, - "loss": 1.1834, - "step": 108 - }, - { - "epoch": 0.05, - "grad_norm": 0.30989256501197815, - "learning_rate": 0.0001790954773869347, - "loss": 1.2216, - "step": 109 - }, - { - "epoch": 0.05, - "grad_norm": 0.3371771275997162, - "learning_rate": 0.00017889447236180906, - "loss": 1.197, - "step": 110 - }, - { - "epoch": 0.05, - "grad_norm": 0.31041428446769714, - "learning_rate": 0.0001786934673366834, - "loss": 1.27, - "step": 111 - }, - { - "epoch": 0.05, - "grad_norm": 0.3152185082435608, - "learning_rate": 0.0001784924623115578, - "loss": 1.2436, - "step": 112 - }, - { - "epoch": 0.05, - "grad_norm": 0.3227459490299225, - "learning_rate": 0.00017829145728643218, - "loss": 1.2401, - "step": 113 - }, - { - "epoch": 0.05, - "grad_norm": 0.3246959149837494, - "learning_rate": 0.00017809045226130653, - "loss": 1.2703, - "step": 114 - }, - { - "epoch": 0.05, - "grad_norm": 0.38032859563827515, - "learning_rate": 0.0001778894472361809, - "loss": 1.3266, - "step": 115 - }, - { - "epoch": 0.05, - "grad_norm": 0.33325478434562683, - "learning_rate": 0.0001776884422110553, - "loss": 1.2954, - "step": 116 - }, - { - "epoch": 0.05, - "grad_norm": 0.3178690969944, - "learning_rate": 0.00017748743718592967, - "loss": 1.1793, - "step": 117 - }, - { - "epoch": 0.05, - "grad_norm": 0.31393784284591675, - "learning_rate": 0.00017728643216080402, - "loss": 1.277, - "step": 118 - }, - { - "epoch": 0.05, - "grad_norm": 0.3150279223918915, - "learning_rate": 0.0001770854271356784, - "loss": 1.293, - "step": 119 - }, - { - "epoch": 0.05, - "grad_norm": 0.32476913928985596, - "learning_rate": 0.0001768844221105528, - "loss": 1.2569, - "step": 120 - }, - { - "epoch": 0.06, - "grad_norm": 0.36075925827026367, - "learning_rate": 0.00017668341708542714, - "loss": 1.205, - "step": 121 - }, - { - "epoch": 0.06, - "grad_norm": 0.33134496212005615, - "learning_rate": 0.0001764824120603015, - "loss": 1.2299, - "step": 122 - }, - { - "epoch": 0.06, - "grad_norm": 0.30507662892341614, - "learning_rate": 0.00017628140703517588, - "loss": 1.2883, - "step": 123 - }, - { - "epoch": 0.06, - "grad_norm": 0.34049952030181885, - "learning_rate": 0.00017608040201005026, - "loss": 1.214, - "step": 124 - }, - { - "epoch": 0.06, - "grad_norm": 0.3405919373035431, - "learning_rate": 0.00017587939698492463, - "loss": 1.2738, - "step": 125 - }, - { - "epoch": 0.06, - "grad_norm": 0.3306083679199219, - "learning_rate": 0.000175678391959799, - "loss": 1.2415, - "step": 126 - }, - { - "epoch": 0.06, - "grad_norm": 0.33770737051963806, - "learning_rate": 0.00017547738693467338, - "loss": 1.3233, - "step": 127 - }, - { - "epoch": 0.06, - "grad_norm": 0.3261878788471222, - "learning_rate": 0.00017527638190954775, - "loss": 1.2695, - "step": 128 - }, - { - "epoch": 0.06, - "grad_norm": 0.3433193266391754, - "learning_rate": 0.00017507537688442212, - "loss": 1.2052, - "step": 129 - }, - { - "epoch": 0.06, - "grad_norm": 0.3111405670642853, - "learning_rate": 0.0001748743718592965, - "loss": 1.2798, - "step": 130 - }, - { - "epoch": 0.06, - "grad_norm": 0.3630310297012329, - "learning_rate": 0.00017467336683417087, - "loss": 1.2567, - "step": 131 - }, - { - "epoch": 0.06, - "grad_norm": 0.31963038444519043, - "learning_rate": 0.00017447236180904521, - "loss": 1.2455, - "step": 132 - }, - { - "epoch": 0.06, - "grad_norm": 0.299695760011673, - "learning_rate": 0.00017427135678391961, - "loss": 1.207, - "step": 133 - }, - { - "epoch": 0.06, - "grad_norm": 0.3167514503002167, - "learning_rate": 0.000174070351758794, - "loss": 1.2378, - "step": 134 - }, - { - "epoch": 0.06, - "grad_norm": 0.31375688314437866, - "learning_rate": 0.00017386934673366836, - "loss": 1.2658, - "step": 135 - }, - { - "epoch": 0.06, - "grad_norm": 0.34311383962631226, - "learning_rate": 0.0001736683417085427, - "loss": 1.2004, - "step": 136 - }, - { - "epoch": 0.06, - "grad_norm": 0.31706517934799194, - "learning_rate": 0.0001734673366834171, - "loss": 1.1879, - "step": 137 - }, - { - "epoch": 0.06, - "grad_norm": 0.31296172738075256, - "learning_rate": 0.00017326633165829148, - "loss": 1.1866, - "step": 138 - }, - { - "epoch": 0.06, - "grad_norm": 0.3254072368144989, - "learning_rate": 0.00017306532663316582, - "loss": 1.1952, - "step": 139 - }, - { - "epoch": 0.06, - "grad_norm": 0.3165453374385834, - "learning_rate": 0.0001728643216080402, - "loss": 1.3459, - "step": 140 - }, - { - "epoch": 0.06, - "grad_norm": 0.35455992817878723, - "learning_rate": 0.0001726633165829146, - "loss": 1.2494, - "step": 141 - }, - { - "epoch": 0.06, - "grad_norm": 0.3116908073425293, - "learning_rate": 0.00017246231155778897, - "loss": 1.2225, - "step": 142 - }, - { - "epoch": 0.07, - "grad_norm": 0.3141638934612274, - "learning_rate": 0.00017226130653266332, - "loss": 1.3385, - "step": 143 - }, - { - "epoch": 0.07, - "grad_norm": 0.3096507787704468, - "learning_rate": 0.0001720603015075377, - "loss": 1.3257, - "step": 144 - }, - { - "epoch": 0.07, - "grad_norm": 0.3160630464553833, - "learning_rate": 0.00017185929648241206, - "loss": 1.2683, - "step": 145 - }, - { - "epoch": 0.07, - "grad_norm": 0.3342824876308441, - "learning_rate": 0.00017165829145728644, - "loss": 1.2551, - "step": 146 - }, - { - "epoch": 0.07, - "grad_norm": 0.3086145222187042, - "learning_rate": 0.0001714572864321608, - "loss": 1.2456, - "step": 147 - }, - { - "epoch": 0.07, - "grad_norm": 0.3001709282398224, - "learning_rate": 0.00017125628140703518, - "loss": 1.2287, - "step": 148 - }, - { - "epoch": 0.07, - "grad_norm": 0.3277103304862976, - "learning_rate": 0.00017105527638190955, - "loss": 1.3302, - "step": 149 - }, - { - "epoch": 0.07, - "grad_norm": 0.33616161346435547, - "learning_rate": 0.00017085427135678393, - "loss": 1.2604, - "step": 150 - }, - { - "epoch": 0.07, - "grad_norm": 0.3231915533542633, - "learning_rate": 0.0001706532663316583, - "loss": 1.2367, - "step": 151 - }, - { - "epoch": 0.07, - "grad_norm": 0.3305569291114807, - "learning_rate": 0.00017045226130653267, - "loss": 1.2387, - "step": 152 - }, - { - "epoch": 0.07, - "grad_norm": 0.35031118988990784, - "learning_rate": 0.00017025125628140705, - "loss": 1.2464, - "step": 153 - }, - { - "epoch": 0.07, - "grad_norm": 0.3142334222793579, - "learning_rate": 0.00017005025125628142, - "loss": 1.3614, - "step": 154 - }, - { - "epoch": 0.07, - "grad_norm": 0.31159430742263794, - "learning_rate": 0.0001698492462311558, - "loss": 1.2556, - "step": 155 - }, - { - "epoch": 0.07, - "grad_norm": 0.3273050785064697, - "learning_rate": 0.00016964824120603016, - "loss": 1.3519, - "step": 156 - }, - { - "epoch": 0.07, - "grad_norm": 0.3299296796321869, - "learning_rate": 0.0001694472361809045, - "loss": 1.1763, - "step": 157 - }, - { - "epoch": 0.07, - "grad_norm": 0.33138513565063477, - "learning_rate": 0.0001692462311557789, - "loss": 1.17, - "step": 158 - }, - { - "epoch": 0.07, - "grad_norm": 0.30424776673316956, - "learning_rate": 0.00016904522613065328, - "loss": 1.123, - "step": 159 - }, - { - "epoch": 0.07, - "grad_norm": 0.3452983498573303, - "learning_rate": 0.00016884422110552766, - "loss": 1.2999, - "step": 160 - }, - { - "epoch": 0.07, - "grad_norm": 0.33614206314086914, - "learning_rate": 0.000168643216080402, - "loss": 1.262, - "step": 161 - }, - { - "epoch": 0.07, - "grad_norm": 0.32416558265686035, - "learning_rate": 0.0001684422110552764, - "loss": 1.2514, - "step": 162 - }, - { - "epoch": 0.07, - "grad_norm": 0.29827457666397095, - "learning_rate": 0.00016824120603015078, - "loss": 1.2461, - "step": 163 - }, - { - "epoch": 0.07, - "grad_norm": 0.32572871446609497, - "learning_rate": 0.00016804020100502512, - "loss": 1.2393, - "step": 164 - }, - { - "epoch": 0.08, - "grad_norm": 0.32171282172203064, - "learning_rate": 0.0001678391959798995, - "loss": 1.3045, - "step": 165 - }, - { - "epoch": 0.08, - "grad_norm": 0.34592801332473755, - "learning_rate": 0.0001676381909547739, - "loss": 1.2669, - "step": 166 - }, - { - "epoch": 0.08, - "grad_norm": 0.33795440196990967, - "learning_rate": 0.00016743718592964827, - "loss": 1.1404, - "step": 167 - }, - { - "epoch": 0.08, - "grad_norm": 0.32598641514778137, - "learning_rate": 0.0001672361809045226, - "loss": 1.2495, - "step": 168 - }, - { - "epoch": 0.08, - "grad_norm": 0.31816181540489197, - "learning_rate": 0.00016703517587939699, - "loss": 1.3003, - "step": 169 - }, - { - "epoch": 0.08, - "grad_norm": 0.3340943157672882, - "learning_rate": 0.00016683417085427136, - "loss": 1.2615, - "step": 170 - }, - { - "epoch": 0.08, - "grad_norm": 0.3242477476596832, - "learning_rate": 0.00016663316582914573, - "loss": 1.2527, - "step": 171 - }, - { - "epoch": 0.08, - "grad_norm": 0.308652400970459, - "learning_rate": 0.0001664321608040201, - "loss": 1.3241, - "step": 172 - }, - { - "epoch": 0.08, - "grad_norm": 0.31818273663520813, - "learning_rate": 0.00016623115577889448, - "loss": 1.3712, - "step": 173 - }, - { - "epoch": 0.08, - "grad_norm": 0.32885751128196716, - "learning_rate": 0.00016603015075376885, - "loss": 1.2583, - "step": 174 - }, - { - "epoch": 0.08, - "grad_norm": 0.32561740279197693, - "learning_rate": 0.00016582914572864322, - "loss": 1.2458, - "step": 175 - }, - { - "epoch": 0.08, - "grad_norm": 0.3278496563434601, - "learning_rate": 0.0001656281407035176, - "loss": 1.2205, - "step": 176 - }, - { - "epoch": 0.08, - "grad_norm": 0.32530438899993896, - "learning_rate": 0.00016542713567839197, - "loss": 1.2235, - "step": 177 - }, - { - "epoch": 0.08, - "grad_norm": 0.31232836842536926, - "learning_rate": 0.00016522613065326634, - "loss": 1.199, - "step": 178 - }, - { - "epoch": 0.08, - "grad_norm": 0.3209743797779083, - "learning_rate": 0.00016502512562814072, - "loss": 1.2717, - "step": 179 - }, - { - "epoch": 0.08, - "grad_norm": 0.329940527677536, - "learning_rate": 0.0001648241206030151, - "loss": 1.2425, - "step": 180 - }, - { - "epoch": 0.08, - "grad_norm": 0.3144824802875519, - "learning_rate": 0.00016462311557788946, - "loss": 1.2444, - "step": 181 - }, - { - "epoch": 0.08, - "grad_norm": 0.3218553066253662, - "learning_rate": 0.0001644221105527638, - "loss": 1.2815, - "step": 182 - }, - { - "epoch": 0.08, - "grad_norm": 0.33460506796836853, - "learning_rate": 0.0001642211055276382, - "loss": 1.3774, - "step": 183 - }, - { - "epoch": 0.08, - "grad_norm": 0.3300727605819702, - "learning_rate": 0.00016402010050251258, - "loss": 1.3436, - "step": 184 - }, - { - "epoch": 0.08, - "grad_norm": 0.3530360460281372, - "learning_rate": 0.00016381909547738695, - "loss": 1.2605, - "step": 185 - }, - { - "epoch": 0.09, - "grad_norm": 0.3326485753059387, - "learning_rate": 0.0001636180904522613, - "loss": 1.2202, - "step": 186 - }, - { - "epoch": 0.09, - "grad_norm": 0.31355732679367065, - "learning_rate": 0.0001634170854271357, - "loss": 1.2798, - "step": 187 - }, - { - "epoch": 0.09, - "grad_norm": 0.3162304759025574, - "learning_rate": 0.00016321608040201007, - "loss": 1.2118, - "step": 188 - }, - { - "epoch": 0.09, - "grad_norm": 0.32264095544815063, - "learning_rate": 0.00016301507537688442, - "loss": 1.2775, - "step": 189 - }, - { - "epoch": 0.09, - "grad_norm": 0.30425918102264404, - "learning_rate": 0.0001628140703517588, - "loss": 1.1438, - "step": 190 - }, - { - "epoch": 0.09, - "grad_norm": 0.33907556533813477, - "learning_rate": 0.00016261306532663316, - "loss": 1.4077, - "step": 191 - }, - { - "epoch": 0.09, - "grad_norm": 0.32334232330322266, - "learning_rate": 0.00016241206030150756, - "loss": 1.2673, - "step": 192 - }, - { - "epoch": 0.09, - "grad_norm": 0.32999834418296814, - "learning_rate": 0.0001622110552763819, - "loss": 1.257, - "step": 193 - }, - { - "epoch": 0.09, - "grad_norm": 0.3223746120929718, - "learning_rate": 0.00016201005025125628, - "loss": 1.2125, - "step": 194 - }, - { - "epoch": 0.09, - "grad_norm": 0.3236989378929138, - "learning_rate": 0.00016180904522613066, - "loss": 1.3645, - "step": 195 - }, - { - "epoch": 0.09, - "grad_norm": 0.3303336203098297, - "learning_rate": 0.00016160804020100503, - "loss": 1.2786, - "step": 196 - }, - { - "epoch": 0.09, - "grad_norm": 0.3135005831718445, - "learning_rate": 0.0001614070351758794, - "loss": 1.2775, - "step": 197 - }, - { - "epoch": 0.09, - "grad_norm": 0.3185466527938843, - "learning_rate": 0.00016120603015075378, - "loss": 1.2128, - "step": 198 - }, - { - "epoch": 0.09, - "grad_norm": 0.3355714976787567, - "learning_rate": 0.00016100502512562815, - "loss": 1.307, - "step": 199 - }, - { - "epoch": 0.09, - "grad_norm": 0.339216023683548, - "learning_rate": 0.00016080402010050252, - "loss": 1.2845, - "step": 200 - }, - { - "epoch": 0.09, - "grad_norm": 0.335781455039978, - "learning_rate": 0.0001606030150753769, - "loss": 1.3783, - "step": 201 - }, - { - "epoch": 0.09, - "grad_norm": 0.3324490189552307, - "learning_rate": 0.00016040201005025127, - "loss": 1.3027, - "step": 202 - }, - { - "epoch": 0.09, - "grad_norm": 0.3381625711917877, - "learning_rate": 0.00016020100502512564, - "loss": 1.1912, - "step": 203 - }, - { - "epoch": 0.09, - "grad_norm": 0.3119088113307953, - "learning_rate": 0.00016, - "loss": 1.2789, - "step": 204 - }, - { - "epoch": 0.09, - "grad_norm": 0.32320040464401245, - "learning_rate": 0.00015979899497487439, - "loss": 1.2454, - "step": 205 - }, - { - "epoch": 0.09, - "grad_norm": 0.32925647497177124, - "learning_rate": 0.00015959798994974876, - "loss": 1.2602, - "step": 206 - }, - { - "epoch": 0.09, - "grad_norm": 0.32569676637649536, - "learning_rate": 0.0001593969849246231, - "loss": 1.2034, - "step": 207 - }, - { - "epoch": 0.1, - "grad_norm": 0.3359280824661255, - "learning_rate": 0.0001591959798994975, - "loss": 1.2624, - "step": 208 - }, - { - "epoch": 0.1, - "grad_norm": 0.3207138180732727, - "learning_rate": 0.00015899497487437188, - "loss": 1.2814, - "step": 209 - }, - { - "epoch": 0.1, - "grad_norm": 0.32691851258277893, - "learning_rate": 0.00015879396984924625, - "loss": 1.2213, - "step": 210 - }, - { - "epoch": 0.1, - "grad_norm": 0.33548569679260254, - "learning_rate": 0.0001585929648241206, - "loss": 1.133, - "step": 211 - }, - { - "epoch": 0.1, - "grad_norm": 0.3024287819862366, - "learning_rate": 0.000158391959798995, - "loss": 1.1925, - "step": 212 - }, - { - "epoch": 0.1, - "grad_norm": 0.3605235517024994, - "learning_rate": 0.00015819095477386937, - "loss": 1.3439, - "step": 213 - }, - { - "epoch": 0.1, - "grad_norm": 0.33820798993110657, - "learning_rate": 0.00015798994974874372, - "loss": 1.1816, - "step": 214 - }, - { - "epoch": 0.1, - "grad_norm": 0.32633164525032043, - "learning_rate": 0.0001577889447236181, - "loss": 1.2521, - "step": 215 - }, - { - "epoch": 0.1, - "grad_norm": 0.34430956840515137, - "learning_rate": 0.00015758793969849246, - "loss": 1.3119, - "step": 216 - }, - { - "epoch": 0.1, - "grad_norm": 0.32302767038345337, - "learning_rate": 0.00015738693467336686, - "loss": 1.2437, - "step": 217 - }, - { - "epoch": 0.1, - "grad_norm": 0.3343082070350647, - "learning_rate": 0.0001571859296482412, - "loss": 1.298, - "step": 218 - }, - { - "epoch": 0.1, - "grad_norm": 0.34785017371177673, - "learning_rate": 0.00015698492462311558, - "loss": 1.2371, - "step": 219 - }, - { - "epoch": 0.1, - "grad_norm": 0.3445768356323242, - "learning_rate": 0.00015678391959798995, - "loss": 1.2617, - "step": 220 - }, - { - "epoch": 0.1, - "grad_norm": 0.3664666414260864, - "learning_rate": 0.00015658291457286433, - "loss": 1.1819, - "step": 221 - }, - { - "epoch": 0.1, - "grad_norm": 0.3051821291446686, - "learning_rate": 0.0001563819095477387, - "loss": 1.3238, - "step": 222 - }, - { - "epoch": 0.1, - "grad_norm": 0.31191888451576233, - "learning_rate": 0.00015618090452261307, - "loss": 1.2955, - "step": 223 - }, - { - "epoch": 0.1, - "grad_norm": 0.33101990818977356, - "learning_rate": 0.00015597989949748745, - "loss": 1.2489, - "step": 224 - }, - { - "epoch": 0.1, - "grad_norm": 0.3157234489917755, - "learning_rate": 0.00015577889447236182, - "loss": 1.3097, - "step": 225 - }, - { - "epoch": 0.1, - "grad_norm": 0.29264160990715027, - "learning_rate": 0.0001555778894472362, - "loss": 1.2254, - "step": 226 - }, - { - "epoch": 0.1, - "grad_norm": 0.34887459874153137, - "learning_rate": 0.00015537688442211056, - "loss": 1.2773, - "step": 227 - }, - { - "epoch": 0.1, - "grad_norm": 0.3264656960964203, - "learning_rate": 0.00015517587939698494, - "loss": 1.2045, - "step": 228 - }, - { - "epoch": 0.1, - "grad_norm": 0.3183201849460602, - "learning_rate": 0.0001549748743718593, - "loss": 1.3232, - "step": 229 - }, - { - "epoch": 0.11, - "grad_norm": 0.3423653542995453, - "learning_rate": 0.00015477386934673368, - "loss": 1.2111, - "step": 230 - }, - { - "epoch": 0.11, - "grad_norm": 0.32361966371536255, - "learning_rate": 0.00015457286432160806, - "loss": 1.2766, - "step": 231 - }, - { - "epoch": 0.11, - "grad_norm": 0.35348379611968994, - "learning_rate": 0.0001543718592964824, - "loss": 1.2533, - "step": 232 - }, - { - "epoch": 0.11, - "grad_norm": 0.348850280046463, - "learning_rate": 0.0001541708542713568, - "loss": 1.2907, - "step": 233 - }, - { - "epoch": 0.11, - "grad_norm": 0.31669101119041443, - "learning_rate": 0.00015396984924623117, - "loss": 1.1785, - "step": 234 - }, - { - "epoch": 0.11, - "grad_norm": 0.3331408202648163, - "learning_rate": 0.00015376884422110555, - "loss": 1.2429, - "step": 235 - }, - { - "epoch": 0.11, - "grad_norm": 0.3349299728870392, - "learning_rate": 0.0001535678391959799, - "loss": 1.1955, - "step": 236 - }, - { - "epoch": 0.11, - "grad_norm": 0.3368314802646637, - "learning_rate": 0.00015336683417085427, - "loss": 1.3476, - "step": 237 - }, - { - "epoch": 0.11, - "grad_norm": 0.3111830949783325, - "learning_rate": 0.00015316582914572867, - "loss": 1.2382, - "step": 238 - }, - { - "epoch": 0.11, - "grad_norm": 0.32492902874946594, - "learning_rate": 0.000152964824120603, - "loss": 1.2453, - "step": 239 - }, - { - "epoch": 0.11, - "grad_norm": 0.3176097869873047, - "learning_rate": 0.00015276381909547739, - "loss": 1.2639, - "step": 240 - }, - { - "epoch": 0.11, - "grad_norm": 0.3306467831134796, - "learning_rate": 0.00015256281407035176, - "loss": 1.2979, - "step": 241 - }, - { - "epoch": 0.11, - "grad_norm": 0.33482393622398376, - "learning_rate": 0.00015236180904522613, - "loss": 1.2855, - "step": 242 - }, - { - "epoch": 0.11, - "grad_norm": 0.317231684923172, - "learning_rate": 0.0001521608040201005, - "loss": 1.336, - "step": 243 - }, - { - "epoch": 0.11, - "grad_norm": 0.3562380075454712, - "learning_rate": 0.00015195979899497488, - "loss": 1.2538, - "step": 244 - }, - { - "epoch": 0.11, - "grad_norm": 0.3557191491127014, - "learning_rate": 0.00015175879396984925, - "loss": 1.1941, - "step": 245 - }, - { - "epoch": 0.11, - "grad_norm": 0.33679717779159546, - "learning_rate": 0.00015155778894472362, - "loss": 1.2929, - "step": 246 - }, - { - "epoch": 0.11, - "grad_norm": 0.32738903164863586, - "learning_rate": 0.000151356783919598, - "loss": 1.1497, - "step": 247 - }, - { - "epoch": 0.11, - "grad_norm": 0.3145580589771271, - "learning_rate": 0.00015115577889447237, - "loss": 1.2982, - "step": 248 - }, - { - "epoch": 0.11, - "grad_norm": 0.3444727659225464, - "learning_rate": 0.00015095477386934674, - "loss": 1.3321, - "step": 249 - }, - { - "epoch": 0.11, - "grad_norm": 0.3220258355140686, - "learning_rate": 0.00015075376884422112, - "loss": 1.1777, - "step": 250 - }, - { - "epoch": 0.11, - "grad_norm": 0.32968461513519287, - "learning_rate": 0.0001505527638190955, - "loss": 1.2707, - "step": 251 - }, - { - "epoch": 0.12, - "grad_norm": 0.3543086647987366, - "learning_rate": 0.00015035175879396986, - "loss": 1.2997, - "step": 252 - }, - { - "epoch": 0.12, - "grad_norm": 0.32566267251968384, - "learning_rate": 0.00015015075376884423, - "loss": 1.2313, - "step": 253 - }, - { - "epoch": 0.12, - "grad_norm": 0.31076309084892273, - "learning_rate": 0.0001499497487437186, - "loss": 1.3003, - "step": 254 - }, - { - "epoch": 0.12, - "grad_norm": 0.33521464467048645, - "learning_rate": 0.00014974874371859298, - "loss": 1.2593, - "step": 255 - }, - { - "epoch": 0.12, - "grad_norm": 0.3666183054447174, - "learning_rate": 0.00014954773869346735, - "loss": 1.3473, - "step": 256 - }, - { - "epoch": 0.12, - "grad_norm": 0.36615288257598877, - "learning_rate": 0.0001493467336683417, - "loss": 1.2521, - "step": 257 - }, - { - "epoch": 0.12, - "grad_norm": 0.3385326564311981, - "learning_rate": 0.0001491457286432161, - "loss": 1.3117, - "step": 258 - }, - { - "epoch": 0.12, - "grad_norm": 0.32243263721466064, - "learning_rate": 0.00014894472361809047, - "loss": 1.1798, - "step": 259 - }, - { - "epoch": 0.12, - "grad_norm": 0.3227294087409973, - "learning_rate": 0.00014874371859296482, - "loss": 1.2362, - "step": 260 - }, - { - "epoch": 0.12, - "grad_norm": 0.3345654606819153, - "learning_rate": 0.0001485427135678392, - "loss": 1.2136, - "step": 261 - }, - { - "epoch": 0.12, - "grad_norm": 0.31397944688796997, - "learning_rate": 0.00014834170854271356, - "loss": 1.1987, - "step": 262 - }, - { - "epoch": 0.12, - "grad_norm": 0.3394251763820648, - "learning_rate": 0.00014814070351758796, - "loss": 1.2022, - "step": 263 - }, - { - "epoch": 0.12, - "grad_norm": 0.33496084809303284, - "learning_rate": 0.0001479396984924623, - "loss": 1.302, - "step": 264 - }, - { - "epoch": 0.12, - "grad_norm": 0.35757845640182495, - "learning_rate": 0.00014773869346733668, - "loss": 1.2574, - "step": 265 - }, - { - "epoch": 0.12, - "grad_norm": 0.332405686378479, - "learning_rate": 0.00014753768844221106, - "loss": 1.2773, - "step": 266 - }, - { - "epoch": 0.12, - "grad_norm": 0.32756730914115906, - "learning_rate": 0.00014733668341708543, - "loss": 1.2189, - "step": 267 - }, - { - "epoch": 0.12, - "grad_norm": 0.3382038176059723, - "learning_rate": 0.0001471356783919598, - "loss": 1.2143, - "step": 268 - }, - { - "epoch": 0.12, - "grad_norm": 0.35607925057411194, - "learning_rate": 0.00014693467336683417, - "loss": 1.3101, - "step": 269 - }, - { - "epoch": 0.12, - "grad_norm": 0.3490254282951355, - "learning_rate": 0.00014673366834170855, - "loss": 1.2813, - "step": 270 - }, - { - "epoch": 0.12, - "grad_norm": 0.34010350704193115, - "learning_rate": 0.00014653266331658292, - "loss": 1.1883, - "step": 271 - }, - { - "epoch": 0.12, - "grad_norm": 0.33997610211372375, - "learning_rate": 0.0001463316582914573, - "loss": 1.2936, - "step": 272 - }, - { - "epoch": 0.12, - "grad_norm": 0.3269011378288269, - "learning_rate": 0.00014613065326633167, - "loss": 1.2682, - "step": 273 - }, - { - "epoch": 0.13, - "grad_norm": 0.34441789984703064, - "learning_rate": 0.00014592964824120604, - "loss": 1.2556, - "step": 274 - }, - { - "epoch": 0.13, - "grad_norm": 0.3339982330799103, - "learning_rate": 0.0001457286432160804, - "loss": 1.3609, - "step": 275 - }, - { - "epoch": 0.13, - "grad_norm": 0.3199785649776459, - "learning_rate": 0.00014552763819095479, - "loss": 1.276, - "step": 276 - }, - { - "epoch": 0.13, - "grad_norm": 0.33970314264297485, - "learning_rate": 0.00014532663316582916, - "loss": 1.2971, - "step": 277 - }, - { - "epoch": 0.13, - "grad_norm": 0.3045497536659241, - "learning_rate": 0.00014512562814070353, - "loss": 1.2067, - "step": 278 - }, - { - "epoch": 0.13, - "grad_norm": 0.334547221660614, - "learning_rate": 0.0001449246231155779, - "loss": 1.2418, - "step": 279 - }, - { - "epoch": 0.13, - "grad_norm": 0.32451215386390686, - "learning_rate": 0.00014472361809045228, - "loss": 1.3319, - "step": 280 - }, - { - "epoch": 0.13, - "grad_norm": 0.33704888820648193, - "learning_rate": 0.00014452261306532665, - "loss": 1.3065, - "step": 281 - }, - { - "epoch": 0.13, - "grad_norm": 0.3138297200202942, - "learning_rate": 0.000144321608040201, - "loss": 1.1864, - "step": 282 - }, - { - "epoch": 0.13, - "grad_norm": 0.31389278173446655, - "learning_rate": 0.00014412060301507537, - "loss": 1.2409, - "step": 283 - }, - { - "epoch": 0.13, - "grad_norm": 0.33956632018089294, - "learning_rate": 0.00014391959798994977, - "loss": 1.2591, - "step": 284 - }, - { - "epoch": 0.13, - "grad_norm": 0.3188384473323822, - "learning_rate": 0.00014371859296482411, - "loss": 1.232, - "step": 285 - }, - { - "epoch": 0.13, - "grad_norm": 0.36883220076560974, - "learning_rate": 0.0001435175879396985, - "loss": 1.2569, - "step": 286 - }, - { - "epoch": 0.13, - "grad_norm": 0.3178212344646454, - "learning_rate": 0.00014331658291457286, - "loss": 1.2337, - "step": 287 - }, - { - "epoch": 0.13, - "grad_norm": 0.33480581641197205, - "learning_rate": 0.00014311557788944726, - "loss": 1.2784, - "step": 288 - }, - { - "epoch": 0.13, - "grad_norm": 0.3228307366371155, - "learning_rate": 0.0001429145728643216, - "loss": 1.2186, - "step": 289 - }, - { - "epoch": 0.13, - "grad_norm": 0.3270285427570343, - "learning_rate": 0.00014271356783919598, - "loss": 1.252, - "step": 290 - }, - { - "epoch": 0.13, - "grad_norm": 0.32656142115592957, - "learning_rate": 0.00014251256281407035, - "loss": 1.2598, - "step": 291 - }, - { - "epoch": 0.13, - "grad_norm": 0.3287805914878845, - "learning_rate": 0.00014231155778894473, - "loss": 1.2528, - "step": 292 - }, - { - "epoch": 0.13, - "grad_norm": 0.351793110370636, - "learning_rate": 0.0001421105527638191, - "loss": 1.1774, - "step": 293 - }, - { - "epoch": 0.13, - "grad_norm": 0.334957093000412, - "learning_rate": 0.00014190954773869347, - "loss": 1.2204, - "step": 294 - }, - { - "epoch": 0.13, - "grad_norm": 0.3303321897983551, - "learning_rate": 0.00014170854271356784, - "loss": 1.2792, - "step": 295 - }, - { - "epoch": 0.14, - "grad_norm": 0.325514018535614, - "learning_rate": 0.00014150753768844222, - "loss": 1.2095, - "step": 296 - }, - { - "epoch": 0.14, - "grad_norm": 0.32064923644065857, - "learning_rate": 0.0001413065326633166, - "loss": 1.2369, - "step": 297 - }, - { - "epoch": 0.14, - "grad_norm": 0.3173045516014099, - "learning_rate": 0.00014110552763819096, - "loss": 1.2092, - "step": 298 - }, - { - "epoch": 0.14, - "grad_norm": 0.3447834551334381, - "learning_rate": 0.00014090452261306534, - "loss": 1.3058, - "step": 299 - }, - { - "epoch": 0.14, - "grad_norm": 0.355277419090271, - "learning_rate": 0.0001407035175879397, - "loss": 1.2147, - "step": 300 - }, - { - "epoch": 0.14, - "grad_norm": 0.321415513753891, - "learning_rate": 0.00014050251256281408, - "loss": 1.3128, - "step": 301 - }, - { - "epoch": 0.14, - "grad_norm": 0.316572904586792, - "learning_rate": 0.00014030150753768846, - "loss": 1.2563, - "step": 302 - }, - { - "epoch": 0.14, - "grad_norm": 0.35804814100265503, - "learning_rate": 0.0001401005025125628, - "loss": 1.3382, - "step": 303 - }, - { - "epoch": 0.14, - "grad_norm": 0.32747843861579895, - "learning_rate": 0.0001398994974874372, - "loss": 1.198, - "step": 304 - }, - { - "epoch": 0.14, - "grad_norm": 0.35342931747436523, - "learning_rate": 0.00013969849246231157, - "loss": 1.2672, - "step": 305 - }, - { - "epoch": 0.14, - "grad_norm": 0.32692718505859375, - "learning_rate": 0.00013949748743718595, - "loss": 1.2641, - "step": 306 - }, - { - "epoch": 0.14, - "grad_norm": 0.31664589047431946, - "learning_rate": 0.0001392964824120603, - "loss": 1.2382, - "step": 307 - }, - { - "epoch": 0.14, - "grad_norm": 0.33936336636543274, - "learning_rate": 0.00013909547738693467, - "loss": 1.2823, - "step": 308 - }, - { - "epoch": 0.14, - "grad_norm": 0.342006117105484, - "learning_rate": 0.00013889447236180907, - "loss": 1.2712, - "step": 309 - }, - { - "epoch": 0.14, - "grad_norm": 0.31698647141456604, - "learning_rate": 0.0001386934673366834, - "loss": 1.2551, - "step": 310 - }, - { - "epoch": 0.14, - "grad_norm": 0.32440513372421265, - "learning_rate": 0.00013849246231155778, - "loss": 1.2512, - "step": 311 - }, - { - "epoch": 0.14, - "grad_norm": 0.3394576907157898, - "learning_rate": 0.00013829145728643216, - "loss": 1.3612, - "step": 312 - }, - { - "epoch": 0.14, - "grad_norm": 0.3272732198238373, - "learning_rate": 0.00013809045226130656, - "loss": 1.1537, - "step": 313 - }, - { - "epoch": 0.14, - "grad_norm": 0.33435794711112976, - "learning_rate": 0.0001378894472361809, - "loss": 1.2832, - "step": 314 - }, - { - "epoch": 0.14, - "grad_norm": 0.3546105921268463, - "learning_rate": 0.00013768844221105528, - "loss": 1.2398, - "step": 315 - }, - { - "epoch": 0.14, - "grad_norm": 0.3501565754413605, - "learning_rate": 0.00013748743718592965, - "loss": 1.2111, - "step": 316 - }, - { - "epoch": 0.14, - "grad_norm": 0.35097482800483704, - "learning_rate": 0.00013728643216080402, - "loss": 1.3033, - "step": 317 - }, - { - "epoch": 0.15, - "grad_norm": 0.3313996195793152, - "learning_rate": 0.0001370854271356784, - "loss": 1.2819, - "step": 318 - }, - { - "epoch": 0.15, - "grad_norm": 0.32861942052841187, - "learning_rate": 0.00013688442211055277, - "loss": 1.2566, - "step": 319 - }, - { - "epoch": 0.15, - "grad_norm": 0.33757033944129944, - "learning_rate": 0.00013668341708542714, - "loss": 1.2951, - "step": 320 - }, - { - "epoch": 0.15, - "grad_norm": 0.33206436038017273, - "learning_rate": 0.00013648241206030151, - "loss": 1.1828, - "step": 321 - }, - { - "epoch": 0.15, - "grad_norm": 0.34780171513557434, - "learning_rate": 0.0001362814070351759, - "loss": 1.2766, - "step": 322 - }, - { - "epoch": 0.15, - "grad_norm": 0.3237352669239044, - "learning_rate": 0.00013608040201005026, - "loss": 1.2885, - "step": 323 - }, - { - "epoch": 0.15, - "grad_norm": 0.32298171520233154, - "learning_rate": 0.00013587939698492463, - "loss": 1.1687, - "step": 324 - }, - { - "epoch": 0.15, - "grad_norm": 0.3167840540409088, - "learning_rate": 0.000135678391959799, - "loss": 1.2617, - "step": 325 - }, - { - "epoch": 0.15, - "grad_norm": 0.3790431618690491, - "learning_rate": 0.00013547738693467338, - "loss": 1.238, - "step": 326 - }, - { - "epoch": 0.15, - "grad_norm": 0.34648290276527405, - "learning_rate": 0.00013527638190954775, - "loss": 1.2802, - "step": 327 - }, - { - "epoch": 0.15, - "grad_norm": 0.32841557264328003, - "learning_rate": 0.0001350753768844221, - "loss": 1.2493, - "step": 328 - }, - { - "epoch": 0.15, - "grad_norm": 0.3556242287158966, - "learning_rate": 0.00013487437185929647, - "loss": 1.2269, - "step": 329 - }, - { - "epoch": 0.15, - "grad_norm": 0.38643625378608704, - "learning_rate": 0.00013467336683417087, - "loss": 1.3036, - "step": 330 - }, - { - "epoch": 0.15, - "grad_norm": 0.33449631929397583, - "learning_rate": 0.00013447236180904524, - "loss": 1.2595, - "step": 331 - }, - { - "epoch": 0.15, - "grad_norm": 0.33076462149620056, - "learning_rate": 0.0001342713567839196, - "loss": 1.2376, - "step": 332 - }, - { - "epoch": 0.15, - "grad_norm": 0.3205597698688507, - "learning_rate": 0.00013407035175879396, - "loss": 1.2323, - "step": 333 - }, - { - "epoch": 0.15, - "grad_norm": 0.31788671016693115, - "learning_rate": 0.00013386934673366836, - "loss": 1.2318, - "step": 334 - }, - { - "epoch": 0.15, - "grad_norm": 0.3543769419193268, - "learning_rate": 0.0001336683417085427, - "loss": 1.2916, - "step": 335 - }, - { - "epoch": 0.15, - "grad_norm": 0.31310132145881653, - "learning_rate": 0.00013346733668341708, - "loss": 1.1925, - "step": 336 - }, - { - "epoch": 0.15, - "grad_norm": 0.3153649866580963, - "learning_rate": 0.00013326633165829146, - "loss": 1.282, - "step": 337 - }, - { - "epoch": 0.15, - "grad_norm": 0.33127865195274353, - "learning_rate": 0.00013306532663316586, - "loss": 1.2753, - "step": 338 - }, - { - "epoch": 0.15, - "grad_norm": 0.36392638087272644, - "learning_rate": 0.0001328643216080402, - "loss": 1.2895, - "step": 339 - }, - { - "epoch": 0.16, - "grad_norm": 0.33100610971450806, - "learning_rate": 0.00013266331658291457, - "loss": 1.2334, - "step": 340 - }, - { - "epoch": 0.16, - "grad_norm": 0.3381032347679138, - "learning_rate": 0.00013246231155778895, - "loss": 1.2819, - "step": 341 - }, - { - "epoch": 0.16, - "grad_norm": 0.31431353092193604, - "learning_rate": 0.00013226130653266332, - "loss": 1.2468, - "step": 342 - }, - { - "epoch": 0.16, - "grad_norm": 0.31695157289505005, - "learning_rate": 0.0001320603015075377, - "loss": 1.2349, - "step": 343 - }, - { - "epoch": 0.16, - "grad_norm": 0.3341975808143616, - "learning_rate": 0.00013185929648241207, - "loss": 1.266, - "step": 344 - }, - { - "epoch": 0.16, - "grad_norm": 0.32825469970703125, - "learning_rate": 0.00013165829145728644, - "loss": 1.28, - "step": 345 - }, - { - "epoch": 0.16, - "grad_norm": 0.3523416817188263, - "learning_rate": 0.0001314572864321608, - "loss": 1.1085, - "step": 346 - }, - { - "epoch": 0.16, - "grad_norm": 0.3559586703777313, - "learning_rate": 0.00013125628140703518, - "loss": 1.3506, - "step": 347 - }, - { - "epoch": 0.16, - "grad_norm": 0.33866238594055176, - "learning_rate": 0.00013105527638190956, - "loss": 1.1631, - "step": 348 - }, - { - "epoch": 0.16, - "grad_norm": 0.3517313301563263, - "learning_rate": 0.00013085427135678393, - "loss": 1.2348, - "step": 349 - }, - { - "epoch": 0.16, - "grad_norm": 0.3219257891178131, - "learning_rate": 0.0001306532663316583, - "loss": 1.1565, - "step": 350 - }, - { - "epoch": 0.16, - "grad_norm": 0.3182775676250458, - "learning_rate": 0.00013045226130653268, - "loss": 1.3332, - "step": 351 - }, - { - "epoch": 0.16, - "grad_norm": 0.33609387278556824, - "learning_rate": 0.00013025125628140705, - "loss": 1.2197, - "step": 352 - }, - { - "epoch": 0.16, - "grad_norm": 0.3392961621284485, - "learning_rate": 0.0001300502512562814, - "loss": 1.3303, - "step": 353 - }, - { - "epoch": 0.16, - "grad_norm": 0.3233344256877899, - "learning_rate": 0.00012984924623115577, - "loss": 1.1869, - "step": 354 - }, - { - "epoch": 0.16, - "grad_norm": 0.3254396617412567, - "learning_rate": 0.00012964824120603017, - "loss": 1.2693, - "step": 355 - }, - { - "epoch": 0.16, - "grad_norm": 0.3262885510921478, - "learning_rate": 0.00012944723618090454, - "loss": 1.2837, - "step": 356 - }, - { - "epoch": 0.16, - "grad_norm": 0.3186768889427185, - "learning_rate": 0.0001292462311557789, - "loss": 1.2705, - "step": 357 - }, - { - "epoch": 0.16, - "grad_norm": 0.3470524251461029, - "learning_rate": 0.00012904522613065326, - "loss": 1.2626, - "step": 358 - }, - { - "epoch": 0.16, - "grad_norm": 0.31644207239151, - "learning_rate": 0.00012884422110552766, - "loss": 1.2217, - "step": 359 - }, - { - "epoch": 0.16, - "grad_norm": 0.3402186930179596, - "learning_rate": 0.000128643216080402, - "loss": 1.2456, - "step": 360 - }, - { - "epoch": 0.17, - "grad_norm": 0.32568660378456116, - "learning_rate": 0.00012844221105527638, - "loss": 1.3024, - "step": 361 - }, - { - "epoch": 0.17, - "grad_norm": 0.31550562381744385, - "learning_rate": 0.00012824120603015075, - "loss": 1.2512, - "step": 362 - }, - { - "epoch": 0.17, - "grad_norm": 0.3515610098838806, - "learning_rate": 0.00012804020100502515, - "loss": 1.2323, - "step": 363 - }, - { - "epoch": 0.17, - "grad_norm": 0.3370158076286316, - "learning_rate": 0.0001278391959798995, - "loss": 1.3072, - "step": 364 - }, - { - "epoch": 0.17, - "grad_norm": 0.3440285623073578, - "learning_rate": 0.00012763819095477387, - "loss": 1.2268, - "step": 365 - }, - { - "epoch": 0.17, - "grad_norm": 0.37662672996520996, - "learning_rate": 0.00012743718592964824, - "loss": 1.2495, - "step": 366 - }, - { - "epoch": 0.17, - "grad_norm": 0.32195794582366943, - "learning_rate": 0.00012723618090452262, - "loss": 1.2977, - "step": 367 - }, - { - "epoch": 0.17, - "grad_norm": 0.3311251103878021, - "learning_rate": 0.000127035175879397, - "loss": 1.3372, - "step": 368 - }, - { - "epoch": 0.17, - "grad_norm": 0.3319653272628784, - "learning_rate": 0.00012683417085427136, - "loss": 1.2813, - "step": 369 - }, - { - "epoch": 0.17, - "grad_norm": 0.3444850742816925, - "learning_rate": 0.00012663316582914574, - "loss": 1.2877, - "step": 370 - }, - { - "epoch": 0.17, - "grad_norm": 0.3433425724506378, - "learning_rate": 0.0001264321608040201, - "loss": 1.2696, - "step": 371 - }, - { - "epoch": 0.17, - "grad_norm": 0.35098111629486084, - "learning_rate": 0.00012623115577889448, - "loss": 1.28, - "step": 372 - }, - { - "epoch": 0.17, - "grad_norm": 0.3203146457672119, - "learning_rate": 0.00012603015075376885, - "loss": 1.2533, - "step": 373 - }, - { - "epoch": 0.17, - "grad_norm": 0.30470582842826843, - "learning_rate": 0.00012582914572864323, - "loss": 1.2678, - "step": 374 - }, - { - "epoch": 0.17, - "grad_norm": 0.324220210313797, - "learning_rate": 0.0001256281407035176, - "loss": 1.1909, - "step": 375 - }, - { - "epoch": 0.17, - "grad_norm": 0.3313479423522949, - "learning_rate": 0.00012542713567839197, - "loss": 1.2528, - "step": 376 - }, - { - "epoch": 0.17, - "grad_norm": 0.31879833340644836, - "learning_rate": 0.00012522613065326635, - "loss": 1.1431, - "step": 377 - }, - { - "epoch": 0.17, - "grad_norm": 0.3483116626739502, - "learning_rate": 0.0001250251256281407, - "loss": 1.1517, - "step": 378 - }, - { - "epoch": 0.17, - "grad_norm": 0.3220193684101105, - "learning_rate": 0.00012482412060301507, - "loss": 1.2532, - "step": 379 - }, - { - "epoch": 0.17, - "grad_norm": 0.3391655683517456, - "learning_rate": 0.00012462311557788947, - "loss": 1.2565, - "step": 380 - }, - { - "epoch": 0.17, - "grad_norm": 0.3446550667285919, - "learning_rate": 0.00012442211055276384, - "loss": 1.2253, - "step": 381 - }, - { - "epoch": 0.17, - "grad_norm": 0.3528743386268616, - "learning_rate": 0.00012422110552763818, - "loss": 1.2919, - "step": 382 - }, - { - "epoch": 0.18, - "grad_norm": 0.32574883103370667, - "learning_rate": 0.00012402010050251256, - "loss": 1.2885, - "step": 383 - }, - { - "epoch": 0.18, - "grad_norm": 0.3145955502986908, - "learning_rate": 0.00012381909547738696, - "loss": 1.2307, - "step": 384 - }, - { - "epoch": 0.18, - "grad_norm": 0.3239680230617523, - "learning_rate": 0.0001236180904522613, - "loss": 1.2612, - "step": 385 - }, - { - "epoch": 0.18, - "grad_norm": 0.3375207185745239, - "learning_rate": 0.00012341708542713568, - "loss": 1.274, - "step": 386 - }, - { - "epoch": 0.18, - "grad_norm": 0.3346465528011322, - "learning_rate": 0.00012321608040201005, - "loss": 1.2074, - "step": 387 - }, - { - "epoch": 0.18, - "grad_norm": 0.3280505836009979, - "learning_rate": 0.00012301507537688445, - "loss": 1.2776, - "step": 388 - }, - { - "epoch": 0.18, - "grad_norm": 0.3411586880683899, - "learning_rate": 0.0001228140703517588, - "loss": 1.3012, - "step": 389 - }, - { - "epoch": 0.18, - "grad_norm": 0.32394883036613464, - "learning_rate": 0.00012261306532663317, - "loss": 1.3018, - "step": 390 - }, - { - "epoch": 0.18, - "grad_norm": 0.34626421332359314, - "learning_rate": 0.00012241206030150754, - "loss": 1.2882, - "step": 391 - }, - { - "epoch": 0.18, - "grad_norm": 0.3305688500404358, - "learning_rate": 0.00012221105527638191, - "loss": 1.2484, - "step": 392 - }, - { - "epoch": 0.18, - "grad_norm": 0.33277568221092224, - "learning_rate": 0.00012201005025125629, - "loss": 1.2088, - "step": 393 - }, - { - "epoch": 0.18, - "grad_norm": 0.3431893289089203, - "learning_rate": 0.00012180904522613066, - "loss": 1.1882, - "step": 394 - }, - { - "epoch": 0.18, - "grad_norm": 0.3225569725036621, - "learning_rate": 0.00012160804020100502, - "loss": 1.2162, - "step": 395 - }, - { - "epoch": 0.18, - "grad_norm": 0.33999207615852356, - "learning_rate": 0.00012140703517587942, - "loss": 1.2532, - "step": 396 - }, - { - "epoch": 0.18, - "grad_norm": 0.3457259237766266, - "learning_rate": 0.00012120603015075378, - "loss": 1.2045, - "step": 397 - }, - { - "epoch": 0.18, - "grad_norm": 0.35479119420051575, - "learning_rate": 0.00012100502512562815, - "loss": 1.2475, - "step": 398 - }, - { - "epoch": 0.18, - "grad_norm": 0.34909576177597046, - "learning_rate": 0.00012080402010050251, - "loss": 1.2581, - "step": 399 - }, - { - "epoch": 0.18, - "grad_norm": 0.33657559752464294, - "learning_rate": 0.00012060301507537688, - "loss": 1.1876, - "step": 400 - }, - { - "epoch": 0.18, - "grad_norm": 0.33252567052841187, - "learning_rate": 0.00012040201005025127, - "loss": 1.2108, - "step": 401 - }, - { - "epoch": 0.18, - "grad_norm": 0.35932156443595886, - "learning_rate": 0.00012020100502512563, - "loss": 1.25, - "step": 402 - }, - { - "epoch": 0.18, - "grad_norm": 0.3509422242641449, - "learning_rate": 0.00012, - "loss": 1.2748, - "step": 403 - }, - { - "epoch": 0.18, - "grad_norm": 0.3509500324726105, - "learning_rate": 0.00011979899497487436, - "loss": 1.2704, - "step": 404 - }, - { - "epoch": 0.19, - "grad_norm": 0.32239145040512085, - "learning_rate": 0.00011959798994974876, - "loss": 1.2476, - "step": 405 - }, - { - "epoch": 0.19, - "grad_norm": 0.33603423833847046, - "learning_rate": 0.00011939698492462312, - "loss": 1.278, - "step": 406 - }, - { - "epoch": 0.19, - "grad_norm": 0.3381786346435547, - "learning_rate": 0.0001191959798994975, - "loss": 1.2382, - "step": 407 - }, - { - "epoch": 0.19, - "grad_norm": 0.31310775876045227, - "learning_rate": 0.00011899497487437185, - "loss": 1.2827, - "step": 408 - }, - { - "epoch": 0.19, - "grad_norm": 0.3387271463871002, - "learning_rate": 0.00011879396984924624, - "loss": 1.2891, - "step": 409 - }, - { - "epoch": 0.19, - "grad_norm": 0.3353903293609619, - "learning_rate": 0.00011859296482412061, - "loss": 1.326, - "step": 410 - }, - { - "epoch": 0.19, - "grad_norm": 0.322992742061615, - "learning_rate": 0.00011839195979899497, - "loss": 1.2513, - "step": 411 - }, - { - "epoch": 0.19, - "grad_norm": 0.3425077199935913, - "learning_rate": 0.00011819095477386935, - "loss": 1.1482, - "step": 412 - }, - { - "epoch": 0.19, - "grad_norm": 0.3305937647819519, - "learning_rate": 0.00011798994974874373, - "loss": 1.1919, - "step": 413 - }, - { - "epoch": 0.19, - "grad_norm": 0.3408913016319275, - "learning_rate": 0.0001177889447236181, - "loss": 1.2535, - "step": 414 - }, - { - "epoch": 0.19, - "grad_norm": 0.43716689944267273, - "learning_rate": 0.00011758793969849247, - "loss": 1.33, - "step": 415 - }, - { - "epoch": 0.19, - "grad_norm": 0.34090203046798706, - "learning_rate": 0.00011738693467336684, - "loss": 1.1666, - "step": 416 - }, - { - "epoch": 0.19, - "grad_norm": 0.35914671421051025, - "learning_rate": 0.00011718592964824122, - "loss": 1.2683, - "step": 417 - }, - { - "epoch": 0.19, - "grad_norm": 0.3459693193435669, - "learning_rate": 0.00011698492462311558, - "loss": 1.2502, - "step": 418 - }, - { - "epoch": 0.19, - "grad_norm": 0.3254222273826599, - "learning_rate": 0.00011678391959798996, - "loss": 1.2369, - "step": 419 - }, - { - "epoch": 0.19, - "grad_norm": 0.33233174681663513, - "learning_rate": 0.00011658291457286432, - "loss": 1.1769, - "step": 420 - }, - { - "epoch": 0.19, - "grad_norm": 0.3394586145877838, - "learning_rate": 0.00011638190954773872, - "loss": 1.1441, - "step": 421 - }, - { - "epoch": 0.19, - "grad_norm": 0.3482055962085724, - "learning_rate": 0.00011618090452261308, - "loss": 1.1411, - "step": 422 - }, - { - "epoch": 0.19, - "grad_norm": 0.33943256735801697, - "learning_rate": 0.00011597989949748745, - "loss": 1.274, - "step": 423 - }, - { - "epoch": 0.19, - "grad_norm": 0.34545761346817017, - "learning_rate": 0.00011577889447236181, - "loss": 1.1839, - "step": 424 - }, - { - "epoch": 0.19, - "grad_norm": 0.3279217481613159, - "learning_rate": 0.00011557788944723618, - "loss": 1.2555, - "step": 425 - }, - { - "epoch": 0.19, - "grad_norm": 0.3297037184238434, - "learning_rate": 0.00011537688442211057, - "loss": 1.2645, - "step": 426 - }, - { - "epoch": 0.2, - "grad_norm": 0.3765062391757965, - "learning_rate": 0.00011517587939698493, - "loss": 1.2166, - "step": 427 - }, - { - "epoch": 0.2, - "grad_norm": 0.34099629521369934, - "learning_rate": 0.0001149748743718593, - "loss": 1.1985, - "step": 428 - }, - { - "epoch": 0.2, - "grad_norm": 0.3941348195075989, - "learning_rate": 0.00011477386934673366, - "loss": 1.2124, - "step": 429 - }, - { - "epoch": 0.2, - "grad_norm": 0.3421550989151001, - "learning_rate": 0.00011457286432160806, - "loss": 1.2202, - "step": 430 - }, - { - "epoch": 0.2, - "grad_norm": 0.3587627708911896, - "learning_rate": 0.00011437185929648242, - "loss": 1.2119, - "step": 431 - }, - { - "epoch": 0.2, - "grad_norm": 0.318024605512619, - "learning_rate": 0.00011417085427135679, - "loss": 1.1717, - "step": 432 - }, - { - "epoch": 0.2, - "grad_norm": 0.3441738486289978, - "learning_rate": 0.00011396984924623115, - "loss": 1.2637, - "step": 433 - }, - { - "epoch": 0.2, - "grad_norm": 0.32831835746765137, - "learning_rate": 0.00011376884422110554, - "loss": 1.1351, - "step": 434 - }, - { - "epoch": 0.2, - "grad_norm": 0.40580299496650696, - "learning_rate": 0.00011356783919597991, - "loss": 1.2777, - "step": 435 - }, - { - "epoch": 0.2, - "grad_norm": 0.3455897867679596, - "learning_rate": 0.00011336683417085427, - "loss": 1.2711, - "step": 436 - }, - { - "epoch": 0.2, - "grad_norm": 0.34554949402809143, - "learning_rate": 0.00011316582914572864, - "loss": 1.2605, - "step": 437 - }, - { - "epoch": 0.2, - "grad_norm": 0.333046019077301, - "learning_rate": 0.00011296482412060303, - "loss": 1.1934, - "step": 438 - }, - { - "epoch": 0.2, - "grad_norm": 0.37090611457824707, - "learning_rate": 0.0001127638190954774, - "loss": 1.2289, - "step": 439 - }, - { - "epoch": 0.2, - "grad_norm": 0.3464914560317993, - "learning_rate": 0.00011256281407035176, - "loss": 1.2573, - "step": 440 - }, - { - "epoch": 0.2, - "grad_norm": 0.3522966504096985, - "learning_rate": 0.00011236180904522614, - "loss": 1.233, - "step": 441 - }, - { - "epoch": 0.2, - "grad_norm": 0.34124764800071716, - "learning_rate": 0.00011216080402010052, - "loss": 1.2167, - "step": 442 - }, - { - "epoch": 0.2, - "grad_norm": 0.32103344798088074, - "learning_rate": 0.00011195979899497488, - "loss": 1.2823, - "step": 443 - }, - { - "epoch": 0.2, - "grad_norm": 0.35400694608688354, - "learning_rate": 0.00011175879396984925, - "loss": 1.275, - "step": 444 - }, - { - "epoch": 0.2, - "grad_norm": 0.32526448369026184, - "learning_rate": 0.00011155778894472361, - "loss": 1.2298, - "step": 445 - }, - { - "epoch": 0.2, - "grad_norm": 0.32495397329330444, - "learning_rate": 0.00011135678391959799, - "loss": 1.2328, - "step": 446 - }, - { - "epoch": 0.2, - "grad_norm": 0.33901894092559814, - "learning_rate": 0.00011115577889447237, - "loss": 1.2864, - "step": 447 - }, - { - "epoch": 0.2, - "grad_norm": 0.3219538927078247, - "learning_rate": 0.00011095477386934675, - "loss": 1.1061, - "step": 448 - }, - { - "epoch": 0.21, - "grad_norm": 0.3175276219844818, - "learning_rate": 0.0001107537688442211, - "loss": 1.2714, - "step": 449 - }, - { - "epoch": 0.21, - "grad_norm": 0.32311904430389404, - "learning_rate": 0.00011055276381909548, - "loss": 1.2635, - "step": 450 - }, - { - "epoch": 0.21, - "grad_norm": 0.3252653181552887, - "learning_rate": 0.00011035175879396986, - "loss": 1.2694, - "step": 451 - }, - { - "epoch": 0.21, - "grad_norm": 0.337410032749176, - "learning_rate": 0.00011015075376884422, - "loss": 1.2197, - "step": 452 - }, - { - "epoch": 0.21, - "grad_norm": 0.34929850697517395, - "learning_rate": 0.0001099497487437186, - "loss": 1.2773, - "step": 453 - }, - { - "epoch": 0.21, - "grad_norm": 0.3558543622493744, - "learning_rate": 0.00010974874371859296, - "loss": 1.2673, - "step": 454 - }, - { - "epoch": 0.21, - "grad_norm": 0.309593141078949, - "learning_rate": 0.00010954773869346736, - "loss": 1.2945, - "step": 455 - }, - { - "epoch": 0.21, - "grad_norm": 0.30904704332351685, - "learning_rate": 0.00010934673366834172, - "loss": 1.2362, - "step": 456 - }, - { - "epoch": 0.21, - "grad_norm": 0.3560062646865845, - "learning_rate": 0.00010914572864321609, - "loss": 1.2874, - "step": 457 - }, - { - "epoch": 0.21, - "grad_norm": 0.3410942554473877, - "learning_rate": 0.00010894472361809045, - "loss": 1.2621, - "step": 458 - }, - { - "epoch": 0.21, - "grad_norm": 0.33610406517982483, - "learning_rate": 0.00010874371859296483, - "loss": 1.2572, - "step": 459 - }, - { - "epoch": 0.21, - "grad_norm": 0.3685830235481262, - "learning_rate": 0.00010854271356783921, - "loss": 1.292, - "step": 460 - }, - { - "epoch": 0.21, - "grad_norm": 0.3263039290904999, - "learning_rate": 0.00010834170854271357, - "loss": 1.1467, - "step": 461 - }, - { - "epoch": 0.21, - "grad_norm": 0.33784759044647217, - "learning_rate": 0.00010814070351758794, - "loss": 1.2536, - "step": 462 - }, - { - "epoch": 0.21, - "grad_norm": 0.3310985565185547, - "learning_rate": 0.00010793969849246233, - "loss": 1.2865, - "step": 463 - }, - { - "epoch": 0.21, - "grad_norm": 0.3608328104019165, - "learning_rate": 0.0001077386934673367, - "loss": 1.3144, - "step": 464 - }, - { - "epoch": 0.21, - "grad_norm": 0.3107350468635559, - "learning_rate": 0.00010753768844221106, - "loss": 1.1831, - "step": 465 - }, - { - "epoch": 0.21, - "grad_norm": 0.3376270532608032, - "learning_rate": 0.00010733668341708543, - "loss": 1.2699, - "step": 466 - }, - { - "epoch": 0.21, - "grad_norm": 0.34757518768310547, - "learning_rate": 0.00010713567839195982, - "loss": 1.3423, - "step": 467 - }, - { - "epoch": 0.21, - "grad_norm": 0.3217342495918274, - "learning_rate": 0.00010693467336683418, - "loss": 1.2629, - "step": 468 - }, - { - "epoch": 0.21, - "grad_norm": 0.3594968020915985, - "learning_rate": 0.00010673366834170855, - "loss": 1.2613, - "step": 469 - }, - { - "epoch": 0.21, - "grad_norm": 0.34216034412384033, - "learning_rate": 0.00010653266331658291, - "loss": 1.2807, - "step": 470 - }, - { - "epoch": 0.22, - "grad_norm": 0.33661434054374695, - "learning_rate": 0.00010633165829145728, - "loss": 1.2175, - "step": 471 - }, - { - "epoch": 0.22, - "grad_norm": 0.3459634780883789, - "learning_rate": 0.00010613065326633167, - "loss": 1.2234, - "step": 472 - }, - { - "epoch": 0.22, - "grad_norm": 0.31939029693603516, - "learning_rate": 0.00010592964824120604, - "loss": 1.2566, - "step": 473 - }, - { - "epoch": 0.22, - "grad_norm": 0.3308617174625397, - "learning_rate": 0.0001057286432160804, - "loss": 1.2249, - "step": 474 - }, - { - "epoch": 0.22, - "grad_norm": 0.3457432985305786, - "learning_rate": 0.00010552763819095478, - "loss": 1.3377, - "step": 475 - }, - { - "epoch": 0.22, - "grad_norm": 0.3623298704624176, - "learning_rate": 0.00010532663316582916, - "loss": 1.295, - "step": 476 - }, - { - "epoch": 0.22, - "grad_norm": 0.3287794888019562, - "learning_rate": 0.00010512562814070352, - "loss": 1.3167, - "step": 477 - }, - { - "epoch": 0.22, - "grad_norm": 0.32969963550567627, - "learning_rate": 0.0001049246231155779, - "loss": 1.1917, - "step": 478 - }, - { - "epoch": 0.22, - "grad_norm": 0.3520050346851349, - "learning_rate": 0.00010472361809045225, - "loss": 1.2786, - "step": 479 - }, - { - "epoch": 0.22, - "grad_norm": 0.33835569024086, - "learning_rate": 0.00010452261306532664, - "loss": 1.2734, - "step": 480 - }, - { - "epoch": 0.22, - "grad_norm": 0.32975468039512634, - "learning_rate": 0.00010432160804020101, - "loss": 1.2263, - "step": 481 - }, - { - "epoch": 0.22, - "grad_norm": 0.3600429594516754, - "learning_rate": 0.00010412060301507539, - "loss": 1.308, - "step": 482 - }, - { - "epoch": 0.22, - "grad_norm": 0.35504522919654846, - "learning_rate": 0.00010391959798994975, - "loss": 1.3255, - "step": 483 - }, - { - "epoch": 0.22, - "grad_norm": 0.334204763174057, - "learning_rate": 0.00010371859296482413, - "loss": 1.2029, - "step": 484 - }, - { - "epoch": 0.22, - "grad_norm": 0.32885733246803284, - "learning_rate": 0.0001035175879396985, - "loss": 1.258, - "step": 485 - }, - { - "epoch": 0.22, - "grad_norm": 0.3293534815311432, - "learning_rate": 0.00010331658291457286, - "loss": 1.2438, - "step": 486 - }, - { - "epoch": 0.22, - "grad_norm": 0.3399008810520172, - "learning_rate": 0.00010311557788944724, - "loss": 1.2508, - "step": 487 - }, - { - "epoch": 0.22, - "grad_norm": 0.3626408576965332, - "learning_rate": 0.00010291457286432162, - "loss": 1.2555, - "step": 488 - }, - { - "epoch": 0.22, - "grad_norm": 0.3435054123401642, - "learning_rate": 0.00010271356783919598, - "loss": 1.1557, - "step": 489 - }, - { - "epoch": 0.22, - "grad_norm": 0.35252466797828674, - "learning_rate": 0.00010251256281407036, - "loss": 1.2756, - "step": 490 - }, - { - "epoch": 0.22, - "grad_norm": 0.3346278667449951, - "learning_rate": 0.00010231155778894473, - "loss": 1.2722, - "step": 491 - }, - { - "epoch": 0.22, - "grad_norm": 0.33955395221710205, - "learning_rate": 0.00010211055276381909, - "loss": 1.2445, - "step": 492 - }, - { - "epoch": 0.23, - "grad_norm": 0.33230292797088623, - "learning_rate": 0.00010190954773869348, - "loss": 1.2836, - "step": 493 - }, - { - "epoch": 0.23, - "grad_norm": 0.32656341791152954, - "learning_rate": 0.00010170854271356785, - "loss": 1.2414, - "step": 494 - }, - { - "epoch": 0.23, - "grad_norm": 0.3479287922382355, - "learning_rate": 0.00010150753768844221, - "loss": 1.2442, - "step": 495 - }, - { - "epoch": 0.23, - "grad_norm": 0.3435857892036438, - "learning_rate": 0.00010130653266331658, - "loss": 1.2344, - "step": 496 - }, - { - "epoch": 0.23, - "grad_norm": 0.3324833810329437, - "learning_rate": 0.00010110552763819097, - "loss": 1.2261, - "step": 497 - }, - { - "epoch": 0.23, - "grad_norm": 0.3612833023071289, - "learning_rate": 0.00010090452261306533, - "loss": 1.3509, - "step": 498 - }, - { - "epoch": 0.23, - "grad_norm": 0.3381580710411072, - "learning_rate": 0.0001007035175879397, - "loss": 1.2508, - "step": 499 - }, - { - "epoch": 0.23, - "grad_norm": 0.3283715546131134, - "learning_rate": 0.00010050251256281407, - "loss": 1.223, - "step": 500 - }, - { - "epoch": 0.23, - "grad_norm": 0.3245822787284851, - "learning_rate": 0.00010030150753768846, - "loss": 1.214, - "step": 501 - }, - { - "epoch": 0.23, - "grad_norm": 0.33582326769828796, - "learning_rate": 0.00010010050251256282, - "loss": 1.1891, - "step": 502 - }, - { - "epoch": 0.23, - "grad_norm": 0.34178048372268677, - "learning_rate": 9.989949748743719e-05, - "loss": 1.2328, - "step": 503 - }, - { - "epoch": 0.23, - "grad_norm": 0.3429703414440155, - "learning_rate": 9.969849246231156e-05, - "loss": 1.243, - "step": 504 - }, - { - "epoch": 0.23, - "grad_norm": 0.3459545969963074, - "learning_rate": 9.949748743718594e-05, - "loss": 1.2272, - "step": 505 - }, - { - "epoch": 0.23, - "grad_norm": 0.32898616790771484, - "learning_rate": 9.929648241206031e-05, - "loss": 1.2651, - "step": 506 - }, - { - "epoch": 0.23, - "grad_norm": 0.33411911129951477, - "learning_rate": 9.909547738693468e-05, - "loss": 1.2215, - "step": 507 - }, - { - "epoch": 0.23, - "grad_norm": 0.3196270763874054, - "learning_rate": 9.889447236180906e-05, - "loss": 1.2338, - "step": 508 - }, - { - "epoch": 0.23, - "grad_norm": 0.3360273241996765, - "learning_rate": 9.869346733668342e-05, - "loss": 1.1259, - "step": 509 - }, - { - "epoch": 0.23, - "grad_norm": 0.3572694957256317, - "learning_rate": 9.84924623115578e-05, - "loss": 1.2741, - "step": 510 - }, - { - "epoch": 0.23, - "grad_norm": 0.3491540849208832, - "learning_rate": 9.829145728643216e-05, - "loss": 1.2844, - "step": 511 - }, - { - "epoch": 0.23, - "grad_norm": 0.3704037666320801, - "learning_rate": 9.809045226130655e-05, - "loss": 1.2974, - "step": 512 - }, - { - "epoch": 0.23, - "grad_norm": 0.3397068381309509, - "learning_rate": 9.788944723618091e-05, - "loss": 1.2155, - "step": 513 - }, - { - "epoch": 0.23, - "grad_norm": 0.3439743220806122, - "learning_rate": 9.768844221105528e-05, - "loss": 1.2449, - "step": 514 - }, - { - "epoch": 0.24, - "grad_norm": 0.3374169170856476, - "learning_rate": 9.748743718592965e-05, - "loss": 1.1984, - "step": 515 - }, - { - "epoch": 0.24, - "grad_norm": 0.3484777510166168, - "learning_rate": 9.728643216080403e-05, - "loss": 1.2707, - "step": 516 - }, - { - "epoch": 0.24, - "grad_norm": 0.34569051861763, - "learning_rate": 9.70854271356784e-05, - "loss": 1.2669, - "step": 517 - }, - { - "epoch": 0.24, - "grad_norm": 0.3295353949069977, - "learning_rate": 9.688442211055276e-05, - "loss": 1.3073, - "step": 518 - }, - { - "epoch": 0.24, - "grad_norm": 0.3298560380935669, - "learning_rate": 9.668341708542715e-05, - "loss": 1.1989, - "step": 519 - }, - { - "epoch": 0.24, - "grad_norm": 0.342427521944046, - "learning_rate": 9.64824120603015e-05, - "loss": 1.2467, - "step": 520 - }, - { - "epoch": 0.24, - "grad_norm": 0.34153419733047485, - "learning_rate": 9.628140703517589e-05, - "loss": 1.1852, - "step": 521 - }, - { - "epoch": 0.24, - "grad_norm": 0.33842480182647705, - "learning_rate": 9.608040201005025e-05, - "loss": 1.277, - "step": 522 - }, - { - "epoch": 0.24, - "grad_norm": 0.3495989739894867, - "learning_rate": 9.587939698492462e-05, - "loss": 1.2363, - "step": 523 - }, - { - "epoch": 0.24, - "grad_norm": 0.34229379892349243, - "learning_rate": 9.5678391959799e-05, - "loss": 1.2327, - "step": 524 - }, - { - "epoch": 0.24, - "grad_norm": 0.3576783835887909, - "learning_rate": 9.547738693467337e-05, - "loss": 1.2011, - "step": 525 - }, - { - "epoch": 0.24, - "grad_norm": 0.3553127646446228, - "learning_rate": 9.527638190954774e-05, - "loss": 1.2718, - "step": 526 - }, - { - "epoch": 0.24, - "grad_norm": 0.34771883487701416, - "learning_rate": 9.507537688442212e-05, - "loss": 1.2399, - "step": 527 - }, - { - "epoch": 0.24, - "grad_norm": 0.34981489181518555, - "learning_rate": 9.487437185929649e-05, - "loss": 1.2305, - "step": 528 - }, - { - "epoch": 0.24, - "grad_norm": 0.35138848423957825, - "learning_rate": 9.467336683417086e-05, - "loss": 1.3227, - "step": 529 - }, - { - "epoch": 0.24, - "grad_norm": 0.32845598459243774, - "learning_rate": 9.447236180904523e-05, - "loss": 1.3256, - "step": 530 - }, - { - "epoch": 0.24, - "grad_norm": 0.35754063725471497, - "learning_rate": 9.427135678391961e-05, - "loss": 1.28, - "step": 531 - }, - { - "epoch": 0.24, - "grad_norm": 0.3293386697769165, - "learning_rate": 9.407035175879397e-05, - "loss": 1.2435, - "step": 532 - }, - { - "epoch": 0.24, - "grad_norm": 0.32942119240760803, - "learning_rate": 9.386934673366835e-05, - "loss": 1.2703, - "step": 533 - }, - { - "epoch": 0.24, - "grad_norm": 0.36065587401390076, - "learning_rate": 9.366834170854271e-05, - "loss": 1.1724, - "step": 534 - }, - { - "epoch": 0.24, - "grad_norm": 0.32289040088653564, - "learning_rate": 9.34673366834171e-05, - "loss": 1.2722, - "step": 535 - }, - { - "epoch": 0.25, - "grad_norm": 0.3467089533805847, - "learning_rate": 9.326633165829146e-05, - "loss": 1.2233, - "step": 536 - }, - { - "epoch": 0.25, - "grad_norm": 0.3310029208660126, - "learning_rate": 9.306532663316585e-05, - "loss": 1.2507, - "step": 537 - }, - { - "epoch": 0.25, - "grad_norm": 0.3369174301624298, - "learning_rate": 9.28643216080402e-05, - "loss": 1.2127, - "step": 538 - }, - { - "epoch": 0.25, - "grad_norm": 0.34335601329803467, - "learning_rate": 9.266331658291458e-05, - "loss": 1.2038, - "step": 539 - }, - { - "epoch": 0.25, - "grad_norm": 0.3327116072177887, - "learning_rate": 9.246231155778895e-05, - "loss": 1.3228, - "step": 540 - }, - { - "epoch": 0.25, - "grad_norm": 0.3617047071456909, - "learning_rate": 9.226130653266331e-05, - "loss": 1.3661, - "step": 541 - }, - { - "epoch": 0.25, - "grad_norm": 0.3532153069972992, - "learning_rate": 9.20603015075377e-05, - "loss": 1.2159, - "step": 542 - }, - { - "epoch": 0.25, - "grad_norm": 0.3590898811817169, - "learning_rate": 9.185929648241206e-05, - "loss": 1.2507, - "step": 543 - }, - { - "epoch": 0.25, - "grad_norm": 0.3496554493904114, - "learning_rate": 9.165829145728644e-05, - "loss": 1.2163, - "step": 544 - }, - { - "epoch": 0.25, - "grad_norm": 0.34077826142311096, - "learning_rate": 9.14572864321608e-05, - "loss": 1.2127, - "step": 545 - }, - { - "epoch": 0.25, - "grad_norm": 0.33904626965522766, - "learning_rate": 9.125628140703519e-05, - "loss": 1.1938, - "step": 546 - }, - { - "epoch": 0.25, - "grad_norm": 0.34727028012275696, - "learning_rate": 9.105527638190955e-05, - "loss": 1.2685, - "step": 547 - }, - { - "epoch": 0.25, - "grad_norm": 0.3311704397201538, - "learning_rate": 9.085427135678392e-05, - "loss": 1.2014, - "step": 548 - }, - { - "epoch": 0.25, - "grad_norm": 0.3330560028553009, - "learning_rate": 9.06532663316583e-05, - "loss": 1.2679, - "step": 549 - }, - { - "epoch": 0.25, - "grad_norm": 0.3092857003211975, - "learning_rate": 9.045226130653267e-05, - "loss": 1.1688, - "step": 550 - }, - { - "epoch": 0.25, - "grad_norm": 0.3591139018535614, - "learning_rate": 9.025125628140704e-05, - "loss": 1.1914, - "step": 551 - }, - { - "epoch": 0.25, - "grad_norm": 0.3442589044570923, - "learning_rate": 9.005025125628141e-05, - "loss": 1.2051, - "step": 552 - }, - { - "epoch": 0.25, - "grad_norm": 0.35013848543167114, - "learning_rate": 8.984924623115579e-05, - "loss": 1.2408, - "step": 553 - }, - { - "epoch": 0.25, - "grad_norm": 0.3634118139743805, - "learning_rate": 8.964824120603016e-05, - "loss": 1.2016, - "step": 554 - }, - { - "epoch": 0.25, - "grad_norm": 0.32844385504722595, - "learning_rate": 8.944723618090453e-05, - "loss": 1.186, - "step": 555 - }, - { - "epoch": 0.25, - "grad_norm": 0.3430251181125641, - "learning_rate": 8.92462311557789e-05, - "loss": 1.3732, - "step": 556 - }, - { - "epoch": 0.25, - "grad_norm": 0.32605141401290894, - "learning_rate": 8.904522613065326e-05, - "loss": 1.2225, - "step": 557 - }, - { - "epoch": 0.26, - "grad_norm": 0.3491004407405853, - "learning_rate": 8.884422110552765e-05, - "loss": 1.277, - "step": 558 - }, - { - "epoch": 0.26, - "grad_norm": 0.34751296043395996, - "learning_rate": 8.864321608040201e-05, - "loss": 1.2069, - "step": 559 - }, - { - "epoch": 0.26, - "grad_norm": 0.31487154960632324, - "learning_rate": 8.84422110552764e-05, - "loss": 1.1973, - "step": 560 - }, - { - "epoch": 0.26, - "grad_norm": 0.33653488755226135, - "learning_rate": 8.824120603015076e-05, - "loss": 1.3103, - "step": 561 - }, - { - "epoch": 0.26, - "grad_norm": 0.3140471279621124, - "learning_rate": 8.804020100502513e-05, - "loss": 1.1919, - "step": 562 - }, - { - "epoch": 0.26, - "grad_norm": 0.37627485394477844, - "learning_rate": 8.78391959798995e-05, - "loss": 1.2185, - "step": 563 - }, - { - "epoch": 0.26, - "grad_norm": 0.34721139073371887, - "learning_rate": 8.763819095477387e-05, - "loss": 1.2937, - "step": 564 - }, - { - "epoch": 0.26, - "grad_norm": 0.31878143548965454, - "learning_rate": 8.743718592964825e-05, - "loss": 1.1966, - "step": 565 - }, - { - "epoch": 0.26, - "grad_norm": 0.3476640284061432, - "learning_rate": 8.723618090452261e-05, - "loss": 1.1035, - "step": 566 - }, - { - "epoch": 0.26, - "grad_norm": 0.32448452711105347, - "learning_rate": 8.7035175879397e-05, - "loss": 1.2245, - "step": 567 - }, - { - "epoch": 0.26, - "grad_norm": 0.3310175836086273, - "learning_rate": 8.683417085427135e-05, - "loss": 1.2231, - "step": 568 - }, - { - "epoch": 0.26, - "grad_norm": 0.3311636745929718, - "learning_rate": 8.663316582914574e-05, - "loss": 1.2363, - "step": 569 - }, - { - "epoch": 0.26, - "grad_norm": 0.3398733139038086, - "learning_rate": 8.64321608040201e-05, - "loss": 1.2684, - "step": 570 - }, - { - "epoch": 0.26, - "grad_norm": 0.35292455554008484, - "learning_rate": 8.623115577889449e-05, - "loss": 1.2251, - "step": 571 - }, - { - "epoch": 0.26, - "grad_norm": 0.3688778579235077, - "learning_rate": 8.603015075376884e-05, - "loss": 1.2493, - "step": 572 - }, - { - "epoch": 0.26, - "grad_norm": 0.3263533413410187, - "learning_rate": 8.582914572864322e-05, - "loss": 1.2479, - "step": 573 - }, - { - "epoch": 0.26, - "grad_norm": 0.362587034702301, - "learning_rate": 8.562814070351759e-05, - "loss": 1.2296, - "step": 574 - }, - { - "epoch": 0.26, - "grad_norm": 0.3232894241809845, - "learning_rate": 8.542713567839196e-05, - "loss": 1.1811, - "step": 575 - }, - { - "epoch": 0.26, - "grad_norm": 0.3491204082965851, - "learning_rate": 8.522613065326634e-05, - "loss": 1.306, - "step": 576 - }, - { - "epoch": 0.26, - "grad_norm": 0.3507627546787262, - "learning_rate": 8.502512562814071e-05, - "loss": 1.2438, - "step": 577 - }, - { - "epoch": 0.26, - "grad_norm": 0.34256500005722046, - "learning_rate": 8.482412060301508e-05, - "loss": 1.2765, - "step": 578 - }, - { - "epoch": 0.26, - "grad_norm": 0.333816260099411, - "learning_rate": 8.462311557788946e-05, - "loss": 1.2372, - "step": 579 - }, - { - "epoch": 0.27, - "grad_norm": 0.3247378170490265, - "learning_rate": 8.442211055276383e-05, - "loss": 1.2541, - "step": 580 - }, - { - "epoch": 0.27, - "grad_norm": 0.31364428997039795, - "learning_rate": 8.42211055276382e-05, - "loss": 1.2676, - "step": 581 - }, - { - "epoch": 0.27, - "grad_norm": 0.3368885815143585, - "learning_rate": 8.402010050251256e-05, - "loss": 1.214, - "step": 582 - }, - { - "epoch": 0.27, - "grad_norm": 0.3350681662559509, - "learning_rate": 8.381909547738695e-05, - "loss": 1.1894, - "step": 583 - }, - { - "epoch": 0.27, - "grad_norm": 0.3448706269264221, - "learning_rate": 8.36180904522613e-05, - "loss": 1.2945, - "step": 584 - }, - { - "epoch": 0.27, - "grad_norm": 0.3268083930015564, - "learning_rate": 8.341708542713568e-05, - "loss": 1.1992, - "step": 585 - }, - { - "epoch": 0.27, - "grad_norm": 0.3421708941459656, - "learning_rate": 8.321608040201005e-05, - "loss": 1.2229, - "step": 586 - }, - { - "epoch": 0.27, - "grad_norm": 0.3358671963214874, - "learning_rate": 8.301507537688443e-05, - "loss": 1.1979, - "step": 587 - }, - { - "epoch": 0.27, - "grad_norm": 0.3192279636859894, - "learning_rate": 8.28140703517588e-05, - "loss": 1.182, - "step": 588 - }, - { - "epoch": 0.27, - "grad_norm": 0.32525762915611267, - "learning_rate": 8.261306532663317e-05, - "loss": 1.2582, - "step": 589 - }, - { - "epoch": 0.27, - "grad_norm": 0.3301040232181549, - "learning_rate": 8.241206030150754e-05, - "loss": 1.1714, - "step": 590 - }, - { - "epoch": 0.27, - "grad_norm": 0.33362287282943726, - "learning_rate": 8.22110552763819e-05, - "loss": 1.2263, - "step": 591 - }, - { - "epoch": 0.27, - "grad_norm": 0.33714091777801514, - "learning_rate": 8.201005025125629e-05, - "loss": 1.2911, - "step": 592 - }, - { - "epoch": 0.27, - "grad_norm": 0.33390864729881287, - "learning_rate": 8.180904522613065e-05, - "loss": 1.3051, - "step": 593 - }, - { - "epoch": 0.27, - "grad_norm": 0.3238459527492523, - "learning_rate": 8.160804020100504e-05, - "loss": 1.2372, - "step": 594 - }, - { - "epoch": 0.27, - "grad_norm": 0.35366594791412354, - "learning_rate": 8.14070351758794e-05, - "loss": 1.1972, - "step": 595 - }, - { - "epoch": 0.27, - "grad_norm": 0.33592501282691956, - "learning_rate": 8.120603015075378e-05, - "loss": 1.2653, - "step": 596 - }, - { - "epoch": 0.27, - "grad_norm": 0.3256325423717499, - "learning_rate": 8.100502512562814e-05, - "loss": 1.2153, - "step": 597 - }, - { - "epoch": 0.27, - "grad_norm": 0.3295387029647827, - "learning_rate": 8.080402010050251e-05, - "loss": 1.2275, - "step": 598 - }, - { - "epoch": 0.27, - "grad_norm": 0.31947025656700134, - "learning_rate": 8.060301507537689e-05, - "loss": 1.1627, - "step": 599 - }, - { - "epoch": 0.27, - "grad_norm": 0.3440350890159607, - "learning_rate": 8.040201005025126e-05, - "loss": 1.2488, - "step": 600 - }, - { - "epoch": 0.27, - "grad_norm": 0.31828573346138, - "learning_rate": 8.020100502512563e-05, - "loss": 1.1908, - "step": 601 - }, - { - "epoch": 0.28, - "grad_norm": 0.31716206669807434, - "learning_rate": 8e-05, - "loss": 1.1661, - "step": 602 - }, - { - "epoch": 0.28, - "grad_norm": 0.3545120358467102, - "learning_rate": 7.979899497487438e-05, - "loss": 1.1378, - "step": 603 - }, - { - "epoch": 0.28, - "grad_norm": 0.3279556930065155, - "learning_rate": 7.959798994974875e-05, - "loss": 1.2868, - "step": 604 - }, - { - "epoch": 0.28, - "grad_norm": 0.318132609128952, - "learning_rate": 7.939698492462313e-05, - "loss": 1.2021, - "step": 605 - }, - { - "epoch": 0.28, - "grad_norm": 0.3179965913295746, - "learning_rate": 7.91959798994975e-05, - "loss": 1.2386, - "step": 606 - }, - { - "epoch": 0.28, - "grad_norm": 0.34012511372566223, - "learning_rate": 7.899497487437186e-05, - "loss": 1.2973, - "step": 607 - }, - { - "epoch": 0.28, - "grad_norm": 0.35141730308532715, - "learning_rate": 7.879396984924623e-05, - "loss": 1.2415, - "step": 608 - }, - { - "epoch": 0.28, - "grad_norm": 0.341964989900589, - "learning_rate": 7.85929648241206e-05, - "loss": 1.2097, - "step": 609 - }, - { - "epoch": 0.28, - "grad_norm": 0.34423911571502686, - "learning_rate": 7.839195979899498e-05, - "loss": 1.1406, - "step": 610 - }, - { - "epoch": 0.28, - "grad_norm": 0.36800771951675415, - "learning_rate": 7.819095477386935e-05, - "loss": 1.2266, - "step": 611 - }, - { - "epoch": 0.28, - "grad_norm": 0.31842777132987976, - "learning_rate": 7.798994974874372e-05, - "loss": 1.2199, - "step": 612 - }, - { - "epoch": 0.28, - "grad_norm": 0.3278830349445343, - "learning_rate": 7.77889447236181e-05, - "loss": 1.248, - "step": 613 - }, - { - "epoch": 0.28, - "grad_norm": 0.3215343654155731, - "learning_rate": 7.758793969849247e-05, - "loss": 1.317, - "step": 614 - }, - { - "epoch": 0.28, - "grad_norm": 0.3352849781513214, - "learning_rate": 7.738693467336684e-05, - "loss": 1.2171, - "step": 615 - }, - { - "epoch": 0.28, - "grad_norm": 0.34062597155570984, - "learning_rate": 7.71859296482412e-05, - "loss": 1.2303, - "step": 616 - }, - { - "epoch": 0.28, - "grad_norm": 0.35442209243774414, - "learning_rate": 7.698492462311559e-05, - "loss": 1.3004, - "step": 617 - }, - { - "epoch": 0.28, - "grad_norm": 0.3413764238357544, - "learning_rate": 7.678391959798995e-05, - "loss": 1.2718, - "step": 618 - }, - { - "epoch": 0.28, - "grad_norm": 0.34083688259124756, - "learning_rate": 7.658291457286433e-05, - "loss": 1.2607, - "step": 619 - }, - { - "epoch": 0.28, - "grad_norm": 0.3345371186733246, - "learning_rate": 7.638190954773869e-05, - "loss": 1.2217, - "step": 620 - }, - { - "epoch": 0.28, - "grad_norm": 0.34990695118904114, - "learning_rate": 7.618090452261307e-05, - "loss": 1.2614, - "step": 621 - }, - { - "epoch": 0.28, - "grad_norm": 0.3319753110408783, - "learning_rate": 7.597989949748744e-05, - "loss": 1.2573, - "step": 622 - }, - { - "epoch": 0.28, - "grad_norm": 0.3669947385787964, - "learning_rate": 7.577889447236181e-05, - "loss": 1.2714, - "step": 623 - }, - { - "epoch": 0.29, - "grad_norm": 0.34474653005599976, - "learning_rate": 7.557788944723618e-05, - "loss": 1.2419, - "step": 624 - }, - { - "epoch": 0.29, - "grad_norm": 0.35597702860832214, - "learning_rate": 7.537688442211056e-05, - "loss": 1.2893, - "step": 625 - }, - { - "epoch": 0.29, - "grad_norm": 0.3305228352546692, - "learning_rate": 7.517587939698493e-05, - "loss": 1.2201, - "step": 626 - }, - { - "epoch": 0.29, - "grad_norm": 0.35510796308517456, - "learning_rate": 7.49748743718593e-05, - "loss": 1.2193, - "step": 627 - }, - { - "epoch": 0.29, - "grad_norm": 0.33328601717948914, - "learning_rate": 7.477386934673368e-05, - "loss": 1.2986, - "step": 628 - }, - { - "epoch": 0.29, - "grad_norm": 0.3292202055454254, - "learning_rate": 7.457286432160805e-05, - "loss": 1.2407, - "step": 629 - }, - { - "epoch": 0.29, - "grad_norm": 0.3294682800769806, - "learning_rate": 7.437185929648241e-05, - "loss": 1.1331, - "step": 630 - }, - { - "epoch": 0.29, - "grad_norm": 0.33853623270988464, - "learning_rate": 7.417085427135678e-05, - "loss": 1.2286, - "step": 631 - }, - { - "epoch": 0.29, - "grad_norm": 0.3701392412185669, - "learning_rate": 7.396984924623115e-05, - "loss": 1.244, - "step": 632 - }, - { - "epoch": 0.29, - "grad_norm": 0.32087066769599915, - "learning_rate": 7.376884422110553e-05, - "loss": 1.1956, - "step": 633 - }, - { - "epoch": 0.29, - "grad_norm": 0.3330169916152954, - "learning_rate": 7.35678391959799e-05, - "loss": 1.3153, - "step": 634 - }, - { - "epoch": 0.29, - "grad_norm": 0.3507911264896393, - "learning_rate": 7.336683417085427e-05, - "loss": 1.2131, - "step": 635 - }, - { - "epoch": 0.29, - "grad_norm": 0.34372231364250183, - "learning_rate": 7.316582914572865e-05, - "loss": 1.1937, - "step": 636 - }, - { - "epoch": 0.29, - "grad_norm": 0.33789506554603577, - "learning_rate": 7.296482412060302e-05, - "loss": 1.2465, - "step": 637 - }, - { - "epoch": 0.29, - "grad_norm": 0.33567336201667786, - "learning_rate": 7.276381909547739e-05, - "loss": 1.2036, - "step": 638 - }, - { - "epoch": 0.29, - "grad_norm": 0.3606951832771301, - "learning_rate": 7.256281407035177e-05, - "loss": 1.3034, - "step": 639 - }, - { - "epoch": 0.29, - "grad_norm": 0.3508096933364868, - "learning_rate": 7.236180904522614e-05, - "loss": 1.2744, - "step": 640 - }, - { - "epoch": 0.29, - "grad_norm": 0.3432866930961609, - "learning_rate": 7.21608040201005e-05, - "loss": 1.2345, - "step": 641 - }, - { - "epoch": 0.29, - "grad_norm": 0.3186059892177582, - "learning_rate": 7.195979899497488e-05, - "loss": 1.2293, - "step": 642 - }, - { - "epoch": 0.29, - "grad_norm": 0.3387812077999115, - "learning_rate": 7.175879396984924e-05, - "loss": 1.2352, - "step": 643 - }, - { - "epoch": 0.29, - "grad_norm": 0.35298967361450195, - "learning_rate": 7.155778894472363e-05, - "loss": 1.3116, - "step": 644 - }, - { - "epoch": 0.29, - "grad_norm": 0.34189942479133606, - "learning_rate": 7.135678391959799e-05, - "loss": 1.2437, - "step": 645 - }, - { - "epoch": 0.3, - "grad_norm": 0.32053112983703613, - "learning_rate": 7.115577889447236e-05, - "loss": 1.1889, - "step": 646 - }, - { - "epoch": 0.3, - "grad_norm": 0.3480307161808014, - "learning_rate": 7.095477386934674e-05, - "loss": 1.2233, - "step": 647 - }, - { - "epoch": 0.3, - "grad_norm": 0.3634546399116516, - "learning_rate": 7.075376884422111e-05, - "loss": 1.2502, - "step": 648 - }, - { - "epoch": 0.3, - "grad_norm": 0.310649037361145, - "learning_rate": 7.055276381909548e-05, - "loss": 1.2297, - "step": 649 - }, - { - "epoch": 0.3, - "grad_norm": 0.3128393292427063, - "learning_rate": 7.035175879396985e-05, - "loss": 1.2366, - "step": 650 - }, - { - "epoch": 0.3, - "grad_norm": 0.32398995757102966, - "learning_rate": 7.015075376884423e-05, - "loss": 1.255, - "step": 651 - }, - { - "epoch": 0.3, - "grad_norm": 0.3405938148498535, - "learning_rate": 6.99497487437186e-05, - "loss": 1.1838, - "step": 652 - }, - { - "epoch": 0.3, - "grad_norm": 0.3226099908351898, - "learning_rate": 6.974874371859297e-05, - "loss": 1.2348, - "step": 653 - }, - { - "epoch": 0.3, - "grad_norm": 0.3316473364830017, - "learning_rate": 6.954773869346733e-05, - "loss": 1.197, - "step": 654 - }, - { - "epoch": 0.3, - "grad_norm": 0.34110310673713684, - "learning_rate": 6.93467336683417e-05, - "loss": 1.273, - "step": 655 - }, - { - "epoch": 0.3, - "grad_norm": 0.34403571486473083, - "learning_rate": 6.914572864321608e-05, - "loss": 1.3238, - "step": 656 - }, - { - "epoch": 0.3, - "grad_norm": 0.3206476867198944, - "learning_rate": 6.894472361809045e-05, - "loss": 1.263, - "step": 657 - }, - { - "epoch": 0.3, - "grad_norm": 0.3400155007839203, - "learning_rate": 6.874371859296482e-05, - "loss": 1.2727, - "step": 658 - }, - { - "epoch": 0.3, - "grad_norm": 0.33599746227264404, - "learning_rate": 6.85427135678392e-05, - "loss": 1.2196, - "step": 659 - }, - { - "epoch": 0.3, - "grad_norm": 0.32535916566848755, - "learning_rate": 6.834170854271357e-05, - "loss": 1.2313, - "step": 660 - }, - { - "epoch": 0.3, - "grad_norm": 0.33513665199279785, - "learning_rate": 6.814070351758794e-05, - "loss": 1.2353, - "step": 661 - }, - { - "epoch": 0.3, - "grad_norm": 0.34317225217819214, - "learning_rate": 6.793969849246232e-05, - "loss": 1.2691, - "step": 662 - }, - { - "epoch": 0.3, - "grad_norm": 0.32187891006469727, - "learning_rate": 6.773869346733669e-05, - "loss": 1.1432, - "step": 663 - }, - { - "epoch": 0.3, - "grad_norm": 0.37006068229675293, - "learning_rate": 6.753768844221105e-05, - "loss": 1.2977, - "step": 664 - }, - { - "epoch": 0.3, - "grad_norm": 0.3436387777328491, - "learning_rate": 6.733668341708544e-05, - "loss": 1.1242, - "step": 665 - }, - { - "epoch": 0.3, - "grad_norm": 0.35817015171051025, - "learning_rate": 6.71356783919598e-05, - "loss": 1.2531, - "step": 666 - }, - { - "epoch": 0.3, - "grad_norm": 0.3394136130809784, - "learning_rate": 6.693467336683418e-05, - "loss": 1.2435, - "step": 667 - }, - { - "epoch": 0.31, - "grad_norm": 0.35513705015182495, - "learning_rate": 6.673366834170854e-05, - "loss": 1.1198, - "step": 668 - }, - { - "epoch": 0.31, - "grad_norm": 0.32795122265815735, - "learning_rate": 6.653266331658293e-05, - "loss": 1.1548, - "step": 669 - }, - { - "epoch": 0.31, - "grad_norm": 0.3325214982032776, - "learning_rate": 6.633165829145729e-05, - "loss": 1.1883, - "step": 670 - }, - { - "epoch": 0.31, - "grad_norm": 0.3546355962753296, - "learning_rate": 6.613065326633166e-05, - "loss": 1.2405, - "step": 671 - }, - { - "epoch": 0.31, - "grad_norm": 0.3289410471916199, - "learning_rate": 6.592964824120603e-05, - "loss": 1.099, - "step": 672 - }, - { - "epoch": 0.31, - "grad_norm": 0.3165920674800873, - "learning_rate": 6.57286432160804e-05, - "loss": 1.212, - "step": 673 - }, - { - "epoch": 0.31, - "grad_norm": 0.3438250422477722, - "learning_rate": 6.552763819095478e-05, - "loss": 1.2021, - "step": 674 - }, - { - "epoch": 0.31, - "grad_norm": 0.3379438817501068, - "learning_rate": 6.532663316582915e-05, - "loss": 1.1801, - "step": 675 - }, - { - "epoch": 0.31, - "grad_norm": 0.3609768748283386, - "learning_rate": 6.512562814070352e-05, - "loss": 1.3406, - "step": 676 - }, - { - "epoch": 0.31, - "grad_norm": 0.3512379229068756, - "learning_rate": 6.492462311557788e-05, - "loss": 1.2982, - "step": 677 - }, - { - "epoch": 0.31, - "grad_norm": 0.3456796407699585, - "learning_rate": 6.472361809045227e-05, - "loss": 1.1908, - "step": 678 - }, - { - "epoch": 0.31, - "grad_norm": 0.3362092971801758, - "learning_rate": 6.452261306532663e-05, - "loss": 1.3286, - "step": 679 - }, - { - "epoch": 0.31, - "grad_norm": 0.3278902769088745, - "learning_rate": 6.4321608040201e-05, - "loss": 1.1441, - "step": 680 - }, - { - "epoch": 0.31, - "grad_norm": 0.350449800491333, - "learning_rate": 6.412060301507538e-05, - "loss": 1.2131, - "step": 681 - }, - { - "epoch": 0.31, - "grad_norm": 0.3359929323196411, - "learning_rate": 6.391959798994975e-05, - "loss": 1.285, - "step": 682 - }, - { - "epoch": 0.31, - "grad_norm": 0.3683655261993408, - "learning_rate": 6.371859296482412e-05, - "loss": 1.2161, - "step": 683 - }, - { - "epoch": 0.31, - "grad_norm": 0.3192093074321747, - "learning_rate": 6.35175879396985e-05, - "loss": 1.1578, - "step": 684 - }, - { - "epoch": 0.31, - "grad_norm": 0.3408317565917969, - "learning_rate": 6.331658291457287e-05, - "loss": 1.1936, - "step": 685 - }, - { - "epoch": 0.31, - "grad_norm": 0.3410519063472748, - "learning_rate": 6.311557788944724e-05, - "loss": 1.2473, - "step": 686 - }, - { - "epoch": 0.31, - "grad_norm": 0.3447181284427643, - "learning_rate": 6.291457286432161e-05, - "loss": 1.247, - "step": 687 - }, - { - "epoch": 0.31, - "grad_norm": 0.32790473103523254, - "learning_rate": 6.271356783919599e-05, - "loss": 1.2803, - "step": 688 - }, - { - "epoch": 0.31, - "grad_norm": 0.34671932458877563, - "learning_rate": 6.251256281407035e-05, - "loss": 1.2403, - "step": 689 - }, - { - "epoch": 0.32, - "grad_norm": 0.34242817759513855, - "learning_rate": 6.231155778894473e-05, - "loss": 1.2525, - "step": 690 - }, - { - "epoch": 0.32, - "grad_norm": 0.33679676055908203, - "learning_rate": 6.211055276381909e-05, - "loss": 1.2433, - "step": 691 - }, - { - "epoch": 0.32, - "grad_norm": 0.35853180289268494, - "learning_rate": 6.190954773869348e-05, - "loss": 1.2014, - "step": 692 - }, - { - "epoch": 0.32, - "grad_norm": 0.3501017391681671, - "learning_rate": 6.170854271356784e-05, - "loss": 1.1814, - "step": 693 - }, - { - "epoch": 0.32, - "grad_norm": 0.3347374200820923, - "learning_rate": 6.150753768844222e-05, - "loss": 1.2137, - "step": 694 - }, - { - "epoch": 0.32, - "grad_norm": 0.33397722244262695, - "learning_rate": 6.130653266331658e-05, - "loss": 1.1753, - "step": 695 - }, - { - "epoch": 0.32, - "grad_norm": 0.33950281143188477, - "learning_rate": 6.110552763819096e-05, - "loss": 1.2455, - "step": 696 - }, - { - "epoch": 0.32, - "grad_norm": 0.3363599479198456, - "learning_rate": 6.090452261306533e-05, - "loss": 1.2087, - "step": 697 - }, - { - "epoch": 0.32, - "grad_norm": 0.3455164134502411, - "learning_rate": 6.070351758793971e-05, - "loss": 1.2711, - "step": 698 - }, - { - "epoch": 0.32, - "grad_norm": 0.3824455440044403, - "learning_rate": 6.0502512562814076e-05, - "loss": 1.2345, - "step": 699 - }, - { - "epoch": 0.32, - "grad_norm": 0.3414604961872101, - "learning_rate": 6.030150753768844e-05, - "loss": 1.2464, - "step": 700 - }, - { - "epoch": 0.32, - "grad_norm": 0.3410933017730713, - "learning_rate": 6.0100502512562815e-05, - "loss": 1.2505, - "step": 701 - }, - { - "epoch": 0.32, - "grad_norm": 0.3278619945049286, - "learning_rate": 5.989949748743718e-05, - "loss": 1.2373, - "step": 702 - }, - { - "epoch": 0.32, - "grad_norm": 0.34170377254486084, - "learning_rate": 5.969849246231156e-05, - "loss": 1.2528, - "step": 703 - }, - { - "epoch": 0.32, - "grad_norm": 0.3370874226093292, - "learning_rate": 5.949748743718593e-05, - "loss": 1.2484, - "step": 704 - }, - { - "epoch": 0.32, - "grad_norm": 0.3313332796096802, - "learning_rate": 5.929648241206031e-05, - "loss": 1.2382, - "step": 705 - }, - { - "epoch": 0.32, - "grad_norm": 0.3433292508125305, - "learning_rate": 5.909547738693467e-05, - "loss": 1.2213, - "step": 706 - }, - { - "epoch": 0.32, - "grad_norm": 0.3642396926879883, - "learning_rate": 5.889447236180905e-05, - "loss": 1.1992, - "step": 707 - }, - { - "epoch": 0.32, - "grad_norm": 0.33468329906463623, - "learning_rate": 5.869346733668342e-05, - "loss": 1.2648, - "step": 708 - }, - { - "epoch": 0.32, - "grad_norm": 0.34776145219802856, - "learning_rate": 5.849246231155779e-05, - "loss": 1.2781, - "step": 709 - }, - { - "epoch": 0.32, - "grad_norm": 0.34735772013664246, - "learning_rate": 5.829145728643216e-05, - "loss": 1.3148, - "step": 710 - }, - { - "epoch": 0.33, - "grad_norm": 0.30450117588043213, - "learning_rate": 5.809045226130654e-05, - "loss": 1.2433, - "step": 711 - }, - { - "epoch": 0.33, - "grad_norm": 0.34202051162719727, - "learning_rate": 5.7889447236180904e-05, - "loss": 1.3233, - "step": 712 - }, - { - "epoch": 0.33, - "grad_norm": 0.3289657235145569, - "learning_rate": 5.7688442211055284e-05, - "loss": 1.1637, - "step": 713 - }, - { - "epoch": 0.33, - "grad_norm": 0.3224788308143616, - "learning_rate": 5.748743718592965e-05, - "loss": 1.2004, - "step": 714 - }, - { - "epoch": 0.33, - "grad_norm": 0.34642502665519714, - "learning_rate": 5.728643216080403e-05, - "loss": 1.2703, - "step": 715 - }, - { - "epoch": 0.33, - "grad_norm": 0.3221339285373688, - "learning_rate": 5.7085427135678396e-05, - "loss": 1.2686, - "step": 716 - }, - { - "epoch": 0.33, - "grad_norm": 0.3153306245803833, - "learning_rate": 5.688442211055277e-05, - "loss": 1.1924, - "step": 717 - }, - { - "epoch": 0.33, - "grad_norm": 0.3397902846336365, - "learning_rate": 5.6683417085427135e-05, - "loss": 1.2925, - "step": 718 - }, - { - "epoch": 0.33, - "grad_norm": 0.31026211380958557, - "learning_rate": 5.6482412060301515e-05, - "loss": 1.215, - "step": 719 - }, - { - "epoch": 0.33, - "grad_norm": 0.36266523599624634, - "learning_rate": 5.628140703517588e-05, - "loss": 1.1878, - "step": 720 - }, - { - "epoch": 0.33, - "grad_norm": 0.3195384740829468, - "learning_rate": 5.608040201005026e-05, - "loss": 1.2683, - "step": 721 - }, - { - "epoch": 0.33, - "grad_norm": 0.3288376033306122, - "learning_rate": 5.587939698492463e-05, - "loss": 1.3016, - "step": 722 - }, - { - "epoch": 0.33, - "grad_norm": 0.3408261835575104, - "learning_rate": 5.567839195979899e-05, - "loss": 1.2068, - "step": 723 - }, - { - "epoch": 0.33, - "grad_norm": 0.35029324889183044, - "learning_rate": 5.547738693467337e-05, - "loss": 1.1991, - "step": 724 - }, - { - "epoch": 0.33, - "grad_norm": 0.3637566566467285, - "learning_rate": 5.527638190954774e-05, - "loss": 1.2342, - "step": 725 - }, - { - "epoch": 0.33, - "grad_norm": 0.3418453633785248, - "learning_rate": 5.507537688442211e-05, - "loss": 1.3175, - "step": 726 - }, - { - "epoch": 0.33, - "grad_norm": 0.35708463191986084, - "learning_rate": 5.487437185929648e-05, - "loss": 1.2073, - "step": 727 - }, - { - "epoch": 0.33, - "grad_norm": 0.37137219309806824, - "learning_rate": 5.467336683417086e-05, - "loss": 1.2347, - "step": 728 - }, - { - "epoch": 0.33, - "grad_norm": 0.3417325019836426, - "learning_rate": 5.4472361809045224e-05, - "loss": 1.1796, - "step": 729 - }, - { - "epoch": 0.33, - "grad_norm": 0.3448787331581116, - "learning_rate": 5.4271356783919604e-05, - "loss": 1.2761, - "step": 730 - }, - { - "epoch": 0.33, - "grad_norm": 0.3505607843399048, - "learning_rate": 5.407035175879397e-05, - "loss": 1.2748, - "step": 731 - }, - { - "epoch": 0.33, - "grad_norm": 0.3447292149066925, - "learning_rate": 5.386934673366835e-05, - "loss": 1.2081, - "step": 732 - }, - { - "epoch": 0.34, - "grad_norm": 0.36144357919692993, - "learning_rate": 5.3668341708542716e-05, - "loss": 1.2111, - "step": 733 - }, - { - "epoch": 0.34, - "grad_norm": 0.34409165382385254, - "learning_rate": 5.346733668341709e-05, - "loss": 1.232, - "step": 734 - }, - { - "epoch": 0.34, - "grad_norm": 0.34955504536628723, - "learning_rate": 5.3266331658291455e-05, - "loss": 1.2325, - "step": 735 - }, - { - "epoch": 0.34, - "grad_norm": 0.38873958587646484, - "learning_rate": 5.3065326633165835e-05, - "loss": 1.253, - "step": 736 - }, - { - "epoch": 0.34, - "grad_norm": 0.3274456560611725, - "learning_rate": 5.28643216080402e-05, - "loss": 1.2484, - "step": 737 - }, - { - "epoch": 0.34, - "grad_norm": 0.35621777176856995, - "learning_rate": 5.266331658291458e-05, - "loss": 1.293, - "step": 738 - }, - { - "epoch": 0.34, - "grad_norm": 0.33436283469200134, - "learning_rate": 5.246231155778895e-05, - "loss": 1.2933, - "step": 739 - }, - { - "epoch": 0.34, - "grad_norm": 0.35243648290634155, - "learning_rate": 5.226130653266332e-05, - "loss": 1.2969, - "step": 740 - }, - { - "epoch": 0.34, - "grad_norm": 0.34483468532562256, - "learning_rate": 5.206030150753769e-05, - "loss": 1.2491, - "step": 741 - }, - { - "epoch": 0.34, - "grad_norm": 0.32675257325172424, - "learning_rate": 5.1859296482412066e-05, - "loss": 1.2094, - "step": 742 - }, - { - "epoch": 0.34, - "grad_norm": 0.34875720739364624, - "learning_rate": 5.165829145728643e-05, - "loss": 1.2774, - "step": 743 - }, - { - "epoch": 0.34, - "grad_norm": 0.34670987725257874, - "learning_rate": 5.145728643216081e-05, - "loss": 1.2014, - "step": 744 - }, - { - "epoch": 0.34, - "grad_norm": 0.34661754965782166, - "learning_rate": 5.125628140703518e-05, - "loss": 1.3128, - "step": 745 - }, - { - "epoch": 0.34, - "grad_norm": 0.3611772358417511, - "learning_rate": 5.1055276381909544e-05, - "loss": 1.1779, - "step": 746 - }, - { - "epoch": 0.34, - "grad_norm": 0.3536320626735687, - "learning_rate": 5.0854271356783924e-05, - "loss": 1.2185, - "step": 747 - }, - { - "epoch": 0.34, - "grad_norm": 0.3396211564540863, - "learning_rate": 5.065326633165829e-05, - "loss": 1.2267, - "step": 748 - }, - { - "epoch": 0.34, - "grad_norm": 0.3299081325531006, - "learning_rate": 5.045226130653266e-05, - "loss": 1.234, - "step": 749 - }, - { - "epoch": 0.34, - "grad_norm": 0.34536656737327576, - "learning_rate": 5.0251256281407036e-05, - "loss": 1.1687, - "step": 750 - }, - { - "epoch": 0.34, - "grad_norm": 0.3260354697704315, - "learning_rate": 5.005025125628141e-05, - "loss": 1.2668, - "step": 751 - }, - { - "epoch": 0.34, - "grad_norm": 0.3403102457523346, - "learning_rate": 4.984924623115578e-05, - "loss": 1.2686, - "step": 752 - }, - { - "epoch": 0.34, - "grad_norm": 0.3585387170314789, - "learning_rate": 4.9648241206030155e-05, - "loss": 1.2281, - "step": 753 - }, - { - "epoch": 0.34, - "grad_norm": 0.3247324824333191, - "learning_rate": 4.944723618090453e-05, - "loss": 1.2756, - "step": 754 - }, - { - "epoch": 0.35, - "grad_norm": 0.36049649119377136, - "learning_rate": 4.92462311557789e-05, - "loss": 1.3494, - "step": 755 - }, - { - "epoch": 0.35, - "grad_norm": 0.32784217596054077, - "learning_rate": 4.9045226130653274e-05, - "loss": 1.2805, - "step": 756 - }, - { - "epoch": 0.35, - "grad_norm": 0.33943304419517517, - "learning_rate": 4.884422110552764e-05, - "loss": 1.2258, - "step": 757 - }, - { - "epoch": 0.35, - "grad_norm": 0.3395639657974243, - "learning_rate": 4.864321608040201e-05, - "loss": 1.1176, - "step": 758 - }, - { - "epoch": 0.35, - "grad_norm": 0.32322150468826294, - "learning_rate": 4.844221105527638e-05, - "loss": 1.1269, - "step": 759 - }, - { - "epoch": 0.35, - "grad_norm": 0.32815924286842346, - "learning_rate": 4.824120603015075e-05, - "loss": 1.334, - "step": 760 - }, - { - "epoch": 0.35, - "grad_norm": 0.33146142959594727, - "learning_rate": 4.8040201005025125e-05, - "loss": 1.1741, - "step": 761 - }, - { - "epoch": 0.35, - "grad_norm": 0.3309881389141083, - "learning_rate": 4.78391959798995e-05, - "loss": 1.2565, - "step": 762 - }, - { - "epoch": 0.35, - "grad_norm": 0.36289462447166443, - "learning_rate": 4.763819095477387e-05, - "loss": 1.1769, - "step": 763 - }, - { - "epoch": 0.35, - "grad_norm": 0.31728067994117737, - "learning_rate": 4.7437185929648244e-05, - "loss": 1.1964, - "step": 764 - }, - { - "epoch": 0.35, - "grad_norm": 0.3471260964870453, - "learning_rate": 4.723618090452262e-05, - "loss": 1.2827, - "step": 765 - }, - { - "epoch": 0.35, - "grad_norm": 0.3100842535495758, - "learning_rate": 4.703517587939698e-05, - "loss": 1.1399, - "step": 766 - }, - { - "epoch": 0.35, - "grad_norm": 0.32581594586372375, - "learning_rate": 4.6834170854271356e-05, - "loss": 1.2041, - "step": 767 - }, - { - "epoch": 0.35, - "grad_norm": 0.33643782138824463, - "learning_rate": 4.663316582914573e-05, - "loss": 1.2493, - "step": 768 - }, - { - "epoch": 0.35, - "grad_norm": 0.34091752767562866, - "learning_rate": 4.64321608040201e-05, - "loss": 1.2265, - "step": 769 - }, - { - "epoch": 0.35, - "grad_norm": 0.3271618187427521, - "learning_rate": 4.6231155778894475e-05, - "loss": 1.183, - "step": 770 - }, - { - "epoch": 0.35, - "grad_norm": 0.3240033686161041, - "learning_rate": 4.603015075376885e-05, - "loss": 1.2065, - "step": 771 - }, - { - "epoch": 0.35, - "grad_norm": 0.3427727520465851, - "learning_rate": 4.582914572864322e-05, - "loss": 1.1634, - "step": 772 - }, - { - "epoch": 0.35, - "grad_norm": 0.31667765974998474, - "learning_rate": 4.5628140703517594e-05, - "loss": 1.2091, - "step": 773 - }, - { - "epoch": 0.35, - "grad_norm": 0.326895534992218, - "learning_rate": 4.542713567839196e-05, - "loss": 1.2075, - "step": 774 - }, - { - "epoch": 0.35, - "grad_norm": 0.32184022665023804, - "learning_rate": 4.522613065326633e-05, - "loss": 1.206, - "step": 775 - }, - { - "epoch": 0.35, - "grad_norm": 0.3442372679710388, - "learning_rate": 4.5025125628140706e-05, - "loss": 1.2528, - "step": 776 - }, - { - "epoch": 0.36, - "grad_norm": 0.3295699656009674, - "learning_rate": 4.482412060301508e-05, - "loss": 1.2723, - "step": 777 - }, - { - "epoch": 0.36, - "grad_norm": 0.33833664655685425, - "learning_rate": 4.462311557788945e-05, - "loss": 1.2277, - "step": 778 - }, - { - "epoch": 0.36, - "grad_norm": 0.33365553617477417, - "learning_rate": 4.4422110552763825e-05, - "loss": 1.2101, - "step": 779 - }, - { - "epoch": 0.36, - "grad_norm": 0.340986967086792, - "learning_rate": 4.42211055276382e-05, - "loss": 1.2342, - "step": 780 - }, - { - "epoch": 0.36, - "grad_norm": 0.3409508764743805, - "learning_rate": 4.4020100502512564e-05, - "loss": 1.2387, - "step": 781 - }, - { - "epoch": 0.36, - "grad_norm": 0.33809345960617065, - "learning_rate": 4.381909547738694e-05, - "loss": 1.2864, - "step": 782 - }, - { - "epoch": 0.36, - "grad_norm": 0.34557396173477173, - "learning_rate": 4.3618090452261303e-05, - "loss": 1.1534, - "step": 783 - }, - { - "epoch": 0.36, - "grad_norm": 0.35186269879341125, - "learning_rate": 4.3417085427135676e-05, - "loss": 1.2989, - "step": 784 - }, - { - "epoch": 0.36, - "grad_norm": 0.3429466187953949, - "learning_rate": 4.321608040201005e-05, - "loss": 1.2207, - "step": 785 - }, - { - "epoch": 0.36, - "grad_norm": 0.3693259060382843, - "learning_rate": 4.301507537688442e-05, - "loss": 1.2331, - "step": 786 - }, - { - "epoch": 0.36, - "grad_norm": 0.320137083530426, - "learning_rate": 4.2814070351758795e-05, - "loss": 1.2518, - "step": 787 - }, - { - "epoch": 0.36, - "grad_norm": 0.34945398569107056, - "learning_rate": 4.261306532663317e-05, - "loss": 1.2906, - "step": 788 - }, - { - "epoch": 0.36, - "grad_norm": 0.32532766461372375, - "learning_rate": 4.241206030150754e-05, - "loss": 1.1813, - "step": 789 - }, - { - "epoch": 0.36, - "grad_norm": 0.34059426188468933, - "learning_rate": 4.2211055276381914e-05, - "loss": 1.2145, - "step": 790 - }, - { - "epoch": 0.36, - "grad_norm": 0.3278336226940155, - "learning_rate": 4.201005025125628e-05, - "loss": 1.2438, - "step": 791 - }, - { - "epoch": 0.36, - "grad_norm": 0.3421019911766052, - "learning_rate": 4.180904522613065e-05, - "loss": 1.0761, - "step": 792 - }, - { - "epoch": 0.36, - "grad_norm": 0.34605157375335693, - "learning_rate": 4.1608040201005026e-05, - "loss": 1.2483, - "step": 793 - }, - { - "epoch": 0.36, - "grad_norm": 0.36613568663597107, - "learning_rate": 4.14070351758794e-05, - "loss": 1.313, - "step": 794 - }, - { - "epoch": 0.36, - "grad_norm": 0.33103081583976746, - "learning_rate": 4.120603015075377e-05, - "loss": 1.2694, - "step": 795 - }, - { - "epoch": 0.36, - "grad_norm": 0.3223966956138611, - "learning_rate": 4.1005025125628145e-05, - "loss": 1.3503, - "step": 796 - }, - { - "epoch": 0.36, - "grad_norm": 0.3374450206756592, - "learning_rate": 4.080402010050252e-05, - "loss": 1.1971, - "step": 797 - }, - { - "epoch": 0.36, - "grad_norm": 0.33531078696250916, - "learning_rate": 4.060301507537689e-05, - "loss": 1.129, - "step": 798 - }, - { - "epoch": 0.37, - "grad_norm": 0.32305672764778137, - "learning_rate": 4.040201005025126e-05, - "loss": 1.2788, - "step": 799 - }, - { - "epoch": 0.37, - "grad_norm": 0.3243289887905121, - "learning_rate": 4.020100502512563e-05, - "loss": 1.2627, - "step": 800 - }, - { - "epoch": 0.37, - "grad_norm": 0.34850698709487915, - "learning_rate": 4e-05, - "loss": 1.2736, - "step": 801 - }, - { - "epoch": 0.37, - "grad_norm": 0.32808718085289, - "learning_rate": 3.9798994974874376e-05, - "loss": 1.1288, - "step": 802 - }, - { - "epoch": 0.37, - "grad_norm": 0.32883942127227783, - "learning_rate": 3.959798994974875e-05, - "loss": 1.2443, - "step": 803 - }, - { - "epoch": 0.37, - "grad_norm": 0.31969812512397766, - "learning_rate": 3.9396984924623115e-05, - "loss": 1.1762, - "step": 804 - }, - { - "epoch": 0.37, - "grad_norm": 0.339046835899353, - "learning_rate": 3.919597989949749e-05, - "loss": 1.2671, - "step": 805 - }, - { - "epoch": 0.37, - "grad_norm": 0.33851614594459534, - "learning_rate": 3.899497487437186e-05, - "loss": 1.2257, - "step": 806 - }, - { - "epoch": 0.37, - "grad_norm": 0.3604491055011749, - "learning_rate": 3.8793969849246234e-05, - "loss": 1.1635, - "step": 807 - }, - { - "epoch": 0.37, - "grad_norm": 0.3570076823234558, - "learning_rate": 3.85929648241206e-05, - "loss": 1.2682, - "step": 808 - }, - { - "epoch": 0.37, - "grad_norm": 0.32678258419036865, - "learning_rate": 3.8391959798994973e-05, - "loss": 1.2212, - "step": 809 - }, - { - "epoch": 0.37, - "grad_norm": 0.2903866767883301, - "learning_rate": 3.8190954773869346e-05, - "loss": 1.115, - "step": 810 - }, - { - "epoch": 0.37, - "grad_norm": 0.33241915702819824, - "learning_rate": 3.798994974874372e-05, - "loss": 1.2963, - "step": 811 - }, - { - "epoch": 0.37, - "grad_norm": 0.36244097352027893, - "learning_rate": 3.778894472361809e-05, - "loss": 1.346, - "step": 812 - }, - { - "epoch": 0.37, - "grad_norm": 0.3391794264316559, - "learning_rate": 3.7587939698492465e-05, - "loss": 1.2304, - "step": 813 - }, - { - "epoch": 0.37, - "grad_norm": 0.3356548249721527, - "learning_rate": 3.738693467336684e-05, - "loss": 1.2342, - "step": 814 - }, - { - "epoch": 0.37, - "grad_norm": 0.33378085494041443, - "learning_rate": 3.7185929648241204e-05, - "loss": 1.2352, - "step": 815 - }, - { - "epoch": 0.37, - "grad_norm": 0.35757875442504883, - "learning_rate": 3.698492462311558e-05, - "loss": 1.2991, - "step": 816 - }, - { - "epoch": 0.37, - "grad_norm": 0.30651041865348816, - "learning_rate": 3.678391959798995e-05, - "loss": 1.2426, - "step": 817 - }, - { - "epoch": 0.37, - "grad_norm": 0.34417861700057983, - "learning_rate": 3.658291457286432e-05, - "loss": 1.3361, - "step": 818 - }, - { - "epoch": 0.37, - "grad_norm": 0.31958290934562683, - "learning_rate": 3.6381909547738696e-05, - "loss": 1.2257, - "step": 819 - }, - { - "epoch": 0.37, - "grad_norm": 0.35524600744247437, - "learning_rate": 3.618090452261307e-05, - "loss": 1.3071, - "step": 820 - }, - { - "epoch": 0.38, - "grad_norm": 0.3114381432533264, - "learning_rate": 3.597989949748744e-05, - "loss": 1.1885, - "step": 821 - }, - { - "epoch": 0.38, - "grad_norm": 0.3464575409889221, - "learning_rate": 3.5778894472361815e-05, - "loss": 1.2519, - "step": 822 - }, - { - "epoch": 0.38, - "grad_norm": 0.3141149878501892, - "learning_rate": 3.557788944723618e-05, - "loss": 1.2193, - "step": 823 - }, - { - "epoch": 0.38, - "grad_norm": 0.3367476463317871, - "learning_rate": 3.5376884422110554e-05, - "loss": 1.2816, - "step": 824 - }, - { - "epoch": 0.38, - "grad_norm": 0.34699156880378723, - "learning_rate": 3.517587939698493e-05, - "loss": 1.3171, - "step": 825 - }, - { - "epoch": 0.38, - "grad_norm": 0.3360113501548767, - "learning_rate": 3.49748743718593e-05, - "loss": 1.1428, - "step": 826 - }, - { - "epoch": 0.38, - "grad_norm": 0.35045570135116577, - "learning_rate": 3.4773869346733667e-05, - "loss": 1.2928, - "step": 827 - }, - { - "epoch": 0.38, - "grad_norm": 0.3526917099952698, - "learning_rate": 3.457286432160804e-05, - "loss": 1.1539, - "step": 828 - }, - { - "epoch": 0.38, - "grad_norm": 0.3474697768688202, - "learning_rate": 3.437185929648241e-05, - "loss": 1.2148, - "step": 829 - }, - { - "epoch": 0.38, - "grad_norm": 0.410773366689682, - "learning_rate": 3.4170854271356785e-05, - "loss": 1.1966, - "step": 830 - }, - { - "epoch": 0.38, - "grad_norm": 0.33725374937057495, - "learning_rate": 3.396984924623116e-05, - "loss": 1.2784, - "step": 831 - }, - { - "epoch": 0.38, - "grad_norm": 0.3344865143299103, - "learning_rate": 3.3768844221105525e-05, - "loss": 1.1797, - "step": 832 - }, - { - "epoch": 0.38, - "grad_norm": 0.3316870331764221, - "learning_rate": 3.35678391959799e-05, - "loss": 1.2463, - "step": 833 - }, - { - "epoch": 0.38, - "grad_norm": 0.3409169912338257, - "learning_rate": 3.336683417085427e-05, - "loss": 1.4216, - "step": 834 - }, - { - "epoch": 0.38, - "grad_norm": 0.34283575415611267, - "learning_rate": 3.3165829145728643e-05, - "loss": 1.2293, - "step": 835 - }, - { - "epoch": 0.38, - "grad_norm": 0.3508949279785156, - "learning_rate": 3.2964824120603016e-05, - "loss": 1.3677, - "step": 836 - }, - { - "epoch": 0.38, - "grad_norm": 0.34627601504325867, - "learning_rate": 3.276381909547739e-05, - "loss": 1.1889, - "step": 837 - }, - { - "epoch": 0.38, - "grad_norm": 0.3398142457008362, - "learning_rate": 3.256281407035176e-05, - "loss": 1.1853, - "step": 838 - }, - { - "epoch": 0.38, - "grad_norm": 0.34818077087402344, - "learning_rate": 3.2361809045226135e-05, - "loss": 1.2272, - "step": 839 - }, - { - "epoch": 0.38, - "grad_norm": 0.34182122349739075, - "learning_rate": 3.21608040201005e-05, - "loss": 1.236, - "step": 840 - }, - { - "epoch": 0.38, - "grad_norm": 0.33442428708076477, - "learning_rate": 3.1959798994974875e-05, - "loss": 1.2327, - "step": 841 - }, - { - "epoch": 0.38, - "grad_norm": 0.3499019742012024, - "learning_rate": 3.175879396984925e-05, - "loss": 1.2771, - "step": 842 - }, - { - "epoch": 0.39, - "grad_norm": 0.3252609074115753, - "learning_rate": 3.155778894472362e-05, - "loss": 1.2988, - "step": 843 - }, - { - "epoch": 0.39, - "grad_norm": 0.3448849022388458, - "learning_rate": 3.1356783919597993e-05, - "loss": 1.1551, - "step": 844 - }, - { - "epoch": 0.39, - "grad_norm": 0.34106680750846863, - "learning_rate": 3.1155778894472366e-05, - "loss": 1.2141, - "step": 845 - }, - { - "epoch": 0.39, - "grad_norm": 0.3367229104042053, - "learning_rate": 3.095477386934674e-05, - "loss": 1.2251, - "step": 846 - }, - { - "epoch": 0.39, - "grad_norm": 0.341509073972702, - "learning_rate": 3.075376884422111e-05, - "loss": 1.2391, - "step": 847 - }, - { - "epoch": 0.39, - "grad_norm": 0.3711039125919342, - "learning_rate": 3.055276381909548e-05, - "loss": 1.2643, - "step": 848 - }, - { - "epoch": 0.39, - "grad_norm": 0.36982688307762146, - "learning_rate": 3.0351758793969855e-05, - "loss": 1.2131, - "step": 849 - }, - { - "epoch": 0.39, - "grad_norm": 0.3039700984954834, - "learning_rate": 3.015075376884422e-05, - "loss": 1.1973, - "step": 850 - }, - { - "epoch": 0.39, - "grad_norm": 0.33018019795417786, - "learning_rate": 2.994974874371859e-05, - "loss": 1.2517, - "step": 851 - }, - { - "epoch": 0.39, - "grad_norm": 0.33908405900001526, - "learning_rate": 2.9748743718592964e-05, - "loss": 1.1586, - "step": 852 - }, - { - "epoch": 0.39, - "grad_norm": 0.34731873869895935, - "learning_rate": 2.9547738693467337e-05, - "loss": 1.2373, - "step": 853 - }, - { - "epoch": 0.39, - "grad_norm": 0.3319443464279175, - "learning_rate": 2.934673366834171e-05, - "loss": 1.2585, - "step": 854 - }, - { - "epoch": 0.39, - "grad_norm": 0.33522048592567444, - "learning_rate": 2.914572864321608e-05, - "loss": 1.2501, - "step": 855 - }, - { - "epoch": 0.39, - "grad_norm": 0.3560466766357422, - "learning_rate": 2.8944723618090452e-05, - "loss": 1.1548, - "step": 856 - }, - { - "epoch": 0.39, - "grad_norm": 0.3441299796104431, - "learning_rate": 2.8743718592964825e-05, - "loss": 1.3083, - "step": 857 - }, - { - "epoch": 0.39, - "grad_norm": 0.32371416687965393, - "learning_rate": 2.8542713567839198e-05, - "loss": 1.3437, - "step": 858 - }, - { - "epoch": 0.39, - "grad_norm": 0.34365978837013245, - "learning_rate": 2.8341708542713568e-05, - "loss": 1.2776, - "step": 859 - }, - { - "epoch": 0.39, - "grad_norm": 0.34733307361602783, - "learning_rate": 2.814070351758794e-05, - "loss": 1.261, - "step": 860 - }, - { - "epoch": 0.39, - "grad_norm": 0.3306278884410858, - "learning_rate": 2.7939698492462314e-05, - "loss": 1.1925, - "step": 861 - }, - { - "epoch": 0.39, - "grad_norm": 0.3588391840457916, - "learning_rate": 2.7738693467336686e-05, - "loss": 1.2524, - "step": 862 - }, - { - "epoch": 0.39, - "grad_norm": 0.32391244173049927, - "learning_rate": 2.7537688442211056e-05, - "loss": 1.2479, - "step": 863 - }, - { - "epoch": 0.39, - "grad_norm": 0.37597063183784485, - "learning_rate": 2.733668341708543e-05, - "loss": 1.2458, - "step": 864 - }, - { - "epoch": 0.4, - "grad_norm": 0.32394328713417053, - "learning_rate": 2.7135678391959802e-05, - "loss": 1.2868, - "step": 865 - }, - { - "epoch": 0.4, - "grad_norm": 0.3270561993122101, - "learning_rate": 2.6934673366834175e-05, - "loss": 1.1915, - "step": 866 - }, - { - "epoch": 0.4, - "grad_norm": 0.36114001274108887, - "learning_rate": 2.6733668341708545e-05, - "loss": 1.1831, - "step": 867 - }, - { - "epoch": 0.4, - "grad_norm": 0.33239543437957764, - "learning_rate": 2.6532663316582917e-05, - "loss": 1.2756, - "step": 868 - }, - { - "epoch": 0.4, - "grad_norm": 0.3513084352016449, - "learning_rate": 2.633165829145729e-05, - "loss": 1.1912, - "step": 869 - }, - { - "epoch": 0.4, - "grad_norm": 0.3425911068916321, - "learning_rate": 2.613065326633166e-05, - "loss": 1.1919, - "step": 870 - }, - { - "epoch": 0.4, - "grad_norm": 0.35736554861068726, - "learning_rate": 2.5929648241206033e-05, - "loss": 1.3152, - "step": 871 - }, - { - "epoch": 0.4, - "grad_norm": 0.33599066734313965, - "learning_rate": 2.5728643216080406e-05, - "loss": 1.1916, - "step": 872 - }, - { - "epoch": 0.4, - "grad_norm": 0.33452367782592773, - "learning_rate": 2.5527638190954772e-05, - "loss": 1.1698, - "step": 873 - }, - { - "epoch": 0.4, - "grad_norm": 0.3205559253692627, - "learning_rate": 2.5326633165829145e-05, - "loss": 1.2197, - "step": 874 - }, - { - "epoch": 0.4, - "grad_norm": 0.337720662355423, - "learning_rate": 2.5125628140703518e-05, - "loss": 1.2132, - "step": 875 - }, - { - "epoch": 0.4, - "grad_norm": 0.4287751615047455, - "learning_rate": 2.492462311557789e-05, - "loss": 1.2187, - "step": 876 - }, - { - "epoch": 0.4, - "grad_norm": 0.33593013882637024, - "learning_rate": 2.4723618090452264e-05, - "loss": 1.293, - "step": 877 - }, - { - "epoch": 0.4, - "grad_norm": 0.33557310700416565, - "learning_rate": 2.4522613065326637e-05, - "loss": 1.2706, - "step": 878 - }, - { - "epoch": 0.4, - "grad_norm": 0.33586448431015015, - "learning_rate": 2.4321608040201007e-05, - "loss": 1.1899, - "step": 879 - }, - { - "epoch": 0.4, - "grad_norm": 0.3143567740917206, - "learning_rate": 2.4120603015075376e-05, - "loss": 1.2161, - "step": 880 - }, - { - "epoch": 0.4, - "grad_norm": 0.33706820011138916, - "learning_rate": 2.391959798994975e-05, - "loss": 1.2193, - "step": 881 - }, - { - "epoch": 0.4, - "grad_norm": 0.31664061546325684, - "learning_rate": 2.3718592964824122e-05, - "loss": 1.174, - "step": 882 - }, - { - "epoch": 0.4, - "grad_norm": 0.33685946464538574, - "learning_rate": 2.351758793969849e-05, - "loss": 1.172, - "step": 883 - }, - { - "epoch": 0.4, - "grad_norm": 0.33722755312919617, - "learning_rate": 2.3316582914572865e-05, - "loss": 1.2743, - "step": 884 - }, - { - "epoch": 0.4, - "grad_norm": 0.32739368081092834, - "learning_rate": 2.3115577889447238e-05, - "loss": 1.2713, - "step": 885 - }, - { - "epoch": 0.41, - "grad_norm": 0.34132125973701477, - "learning_rate": 2.291457286432161e-05, - "loss": 1.3329, - "step": 886 - }, - { - "epoch": 0.41, - "grad_norm": 0.3514713943004608, - "learning_rate": 2.271356783919598e-05, - "loss": 1.2198, - "step": 887 - }, - { - "epoch": 0.41, - "grad_norm": 0.3208943009376526, - "learning_rate": 2.2512562814070353e-05, - "loss": 1.2092, - "step": 888 - }, - { - "epoch": 0.41, - "grad_norm": 0.32605454325675964, - "learning_rate": 2.2311557788944726e-05, - "loss": 1.2181, - "step": 889 - }, - { - "epoch": 0.41, - "grad_norm": 0.3214203119277954, - "learning_rate": 2.21105527638191e-05, - "loss": 1.068, - "step": 890 - }, - { - "epoch": 0.41, - "grad_norm": 0.3456685543060303, - "learning_rate": 2.190954773869347e-05, - "loss": 1.2062, - "step": 891 - }, - { - "epoch": 0.41, - "grad_norm": 0.33764395117759705, - "learning_rate": 2.1708542713567838e-05, - "loss": 1.2975, - "step": 892 - }, - { - "epoch": 0.41, - "grad_norm": 0.3418406844139099, - "learning_rate": 2.150753768844221e-05, - "loss": 1.3145, - "step": 893 - }, - { - "epoch": 0.41, - "grad_norm": 0.32421067357063293, - "learning_rate": 2.1306532663316584e-05, - "loss": 1.2247, - "step": 894 - }, - { - "epoch": 0.41, - "grad_norm": 0.32601818442344666, - "learning_rate": 2.1105527638190957e-05, - "loss": 1.2817, - "step": 895 - }, - { - "epoch": 0.41, - "grad_norm": 0.3486829698085785, - "learning_rate": 2.0904522613065327e-05, - "loss": 1.215, - "step": 896 - }, - { - "epoch": 0.41, - "grad_norm": 0.31031641364097595, - "learning_rate": 2.07035175879397e-05, - "loss": 1.266, - "step": 897 - }, - { - "epoch": 0.41, - "grad_norm": 0.36151039600372314, - "learning_rate": 2.0502512562814073e-05, - "loss": 1.3193, - "step": 898 - }, - { - "epoch": 0.41, - "grad_norm": 0.33280953764915466, - "learning_rate": 2.0301507537688446e-05, - "loss": 1.2555, - "step": 899 - }, - { - "epoch": 0.41, - "grad_norm": 0.34150639176368713, - "learning_rate": 2.0100502512562815e-05, - "loss": 1.2865, - "step": 900 - }, - { - "epoch": 0.41, - "grad_norm": 0.3205152153968811, - "learning_rate": 1.9899497487437188e-05, - "loss": 1.1683, - "step": 901 - }, - { - "epoch": 0.41, - "grad_norm": 0.32988911867141724, - "learning_rate": 1.9698492462311558e-05, - "loss": 1.2357, - "step": 902 - }, - { - "epoch": 0.41, - "grad_norm": 0.31103867292404175, - "learning_rate": 1.949748743718593e-05, - "loss": 1.1425, - "step": 903 - }, - { - "epoch": 0.41, - "grad_norm": 0.3254905939102173, - "learning_rate": 1.92964824120603e-05, - "loss": 1.2725, - "step": 904 - }, - { - "epoch": 0.41, - "grad_norm": 0.34088942408561707, - "learning_rate": 1.9095477386934673e-05, - "loss": 1.2719, - "step": 905 - }, - { - "epoch": 0.41, - "grad_norm": 0.331760048866272, - "learning_rate": 1.8894472361809046e-05, - "loss": 1.2883, - "step": 906 - }, - { - "epoch": 0.41, - "grad_norm": 0.32167789340019226, - "learning_rate": 1.869346733668342e-05, - "loss": 1.1878, - "step": 907 - }, - { - "epoch": 0.42, - "grad_norm": 0.31863799691200256, - "learning_rate": 1.849246231155779e-05, - "loss": 1.2426, - "step": 908 - }, - { - "epoch": 0.42, - "grad_norm": 0.31977197527885437, - "learning_rate": 1.829145728643216e-05, - "loss": 1.2317, - "step": 909 - }, - { - "epoch": 0.42, - "grad_norm": 0.32206329703330994, - "learning_rate": 1.8090452261306535e-05, - "loss": 1.2461, - "step": 910 - }, - { - "epoch": 0.42, - "grad_norm": 0.3464859127998352, - "learning_rate": 1.7889447236180908e-05, - "loss": 1.2019, - "step": 911 - }, - { - "epoch": 0.42, - "grad_norm": 0.3386858105659485, - "learning_rate": 1.7688442211055277e-05, - "loss": 1.1637, - "step": 912 - }, - { - "epoch": 0.42, - "grad_norm": 0.33315742015838623, - "learning_rate": 1.748743718592965e-05, - "loss": 1.2268, - "step": 913 - }, - { - "epoch": 0.42, - "grad_norm": 0.3428650200366974, - "learning_rate": 1.728643216080402e-05, - "loss": 1.1709, - "step": 914 - }, - { - "epoch": 0.42, - "grad_norm": 0.32808947563171387, - "learning_rate": 1.7085427135678393e-05, - "loss": 1.246, - "step": 915 - }, - { - "epoch": 0.42, - "grad_norm": 0.3213931918144226, - "learning_rate": 1.6884422110552762e-05, - "loss": 1.1798, - "step": 916 - }, - { - "epoch": 0.42, - "grad_norm": 0.32325300574302673, - "learning_rate": 1.6683417085427135e-05, - "loss": 1.2045, - "step": 917 - }, - { - "epoch": 0.42, - "grad_norm": 0.34444937109947205, - "learning_rate": 1.6482412060301508e-05, - "loss": 1.1657, - "step": 918 - }, - { - "epoch": 0.42, - "grad_norm": 0.34642207622528076, - "learning_rate": 1.628140703517588e-05, - "loss": 1.2589, - "step": 919 - }, - { - "epoch": 0.42, - "grad_norm": 0.321769654750824, - "learning_rate": 1.608040201005025e-05, - "loss": 1.2896, - "step": 920 - }, - { - "epoch": 0.42, - "grad_norm": 0.32652974128723145, - "learning_rate": 1.5879396984924624e-05, - "loss": 1.239, - "step": 921 - }, - { - "epoch": 0.42, - "grad_norm": 0.32902684807777405, - "learning_rate": 1.5678391959798997e-05, - "loss": 1.1761, - "step": 922 - }, - { - "epoch": 0.42, - "grad_norm": 0.3562852442264557, - "learning_rate": 1.547738693467337e-05, - "loss": 1.2119, - "step": 923 - }, - { - "epoch": 0.42, - "grad_norm": 0.32487592101097107, - "learning_rate": 1.527638190954774e-05, - "loss": 1.2713, - "step": 924 - }, - { - "epoch": 0.42, - "grad_norm": 0.3527469038963318, - "learning_rate": 1.507537688442211e-05, - "loss": 1.2803, - "step": 925 - }, - { - "epoch": 0.42, - "grad_norm": 0.32758113741874695, - "learning_rate": 1.4874371859296482e-05, - "loss": 1.239, - "step": 926 - }, - { - "epoch": 0.42, - "grad_norm": 0.3581792116165161, - "learning_rate": 1.4673366834170855e-05, - "loss": 1.1578, - "step": 927 - }, - { - "epoch": 0.42, - "grad_norm": 0.3269941806793213, - "learning_rate": 1.4472361809045226e-05, - "loss": 1.2673, - "step": 928 - }, - { - "epoch": 0.42, - "grad_norm": 0.33714571595191956, - "learning_rate": 1.4271356783919599e-05, - "loss": 1.1647, - "step": 929 - }, - { - "epoch": 0.43, - "grad_norm": 0.3444203734397888, - "learning_rate": 1.407035175879397e-05, - "loss": 1.2443, - "step": 930 - }, - { - "epoch": 0.43, - "grad_norm": 0.3266572952270508, - "learning_rate": 1.3869346733668343e-05, - "loss": 1.2083, - "step": 931 - }, - { - "epoch": 0.43, - "grad_norm": 0.33283984661102295, - "learning_rate": 1.3668341708542715e-05, - "loss": 1.2936, - "step": 932 - }, - { - "epoch": 0.43, - "grad_norm": 0.31751376390457153, - "learning_rate": 1.3467336683417087e-05, - "loss": 1.2752, - "step": 933 - }, - { - "epoch": 0.43, - "grad_norm": 0.31209132075309753, - "learning_rate": 1.3266331658291459e-05, - "loss": 1.2414, - "step": 934 - }, - { - "epoch": 0.43, - "grad_norm": 0.33048173785209656, - "learning_rate": 1.306532663316583e-05, - "loss": 1.2501, - "step": 935 - }, - { - "epoch": 0.43, - "grad_norm": 0.35573264956474304, - "learning_rate": 1.2864321608040203e-05, - "loss": 1.2298, - "step": 936 - }, - { - "epoch": 0.43, - "grad_norm": 0.34101277589797974, - "learning_rate": 1.2663316582914573e-05, - "loss": 1.2113, - "step": 937 - }, - { - "epoch": 0.43, - "grad_norm": 0.3265811502933502, - "learning_rate": 1.2462311557788946e-05, - "loss": 1.2443, - "step": 938 - }, - { - "epoch": 0.43, - "grad_norm": 0.3247004449367523, - "learning_rate": 1.2261306532663318e-05, - "loss": 1.2098, - "step": 939 - }, - { - "epoch": 0.43, - "grad_norm": 0.3484182357788086, - "learning_rate": 1.2060301507537688e-05, - "loss": 1.2731, - "step": 940 - }, - { - "epoch": 0.43, - "grad_norm": 0.3221377432346344, - "learning_rate": 1.1859296482412061e-05, - "loss": 1.1475, - "step": 941 - }, - { - "epoch": 0.43, - "grad_norm": 0.2977326512336731, - "learning_rate": 1.1658291457286432e-05, - "loss": 1.2007, - "step": 942 - }, - { - "epoch": 0.43, - "grad_norm": 0.3372350335121155, - "learning_rate": 1.1457286432160805e-05, - "loss": 1.3172, - "step": 943 - }, - { - "epoch": 0.43, - "grad_norm": 0.33364465832710266, - "learning_rate": 1.1256281407035177e-05, - "loss": 1.1503, - "step": 944 - }, - { - "epoch": 0.43, - "grad_norm": 0.34927648305892944, - "learning_rate": 1.105527638190955e-05, - "loss": 1.2117, - "step": 945 - }, - { - "epoch": 0.43, - "grad_norm": 0.3644360601902008, - "learning_rate": 1.0854271356783919e-05, - "loss": 1.2228, - "step": 946 - }, - { - "epoch": 0.43, - "grad_norm": 0.32903411984443665, - "learning_rate": 1.0653266331658292e-05, - "loss": 1.1774, - "step": 947 - }, - { - "epoch": 0.43, - "grad_norm": 0.31273844838142395, - "learning_rate": 1.0452261306532663e-05, - "loss": 1.2441, - "step": 948 - }, - { - "epoch": 0.43, - "grad_norm": 0.3472016453742981, - "learning_rate": 1.0251256281407036e-05, - "loss": 1.2373, - "step": 949 - }, - { - "epoch": 0.43, - "grad_norm": 0.33334019780158997, - "learning_rate": 1.0050251256281408e-05, - "loss": 1.2008, - "step": 950 - }, - { - "epoch": 0.43, - "grad_norm": 0.3392302393913269, - "learning_rate": 9.849246231155779e-06, - "loss": 1.2857, - "step": 951 - }, - { - "epoch": 0.44, - "grad_norm": 0.3528233468532562, - "learning_rate": 9.64824120603015e-06, - "loss": 1.2599, - "step": 952 - }, - { - "epoch": 0.44, - "grad_norm": 0.32364121079444885, - "learning_rate": 9.447236180904523e-06, - "loss": 1.215, - "step": 953 - }, - { - "epoch": 0.44, - "grad_norm": 0.3271343410015106, - "learning_rate": 9.246231155778894e-06, - "loss": 1.2212, - "step": 954 - }, - { - "epoch": 0.44, - "grad_norm": 0.3110508620738983, - "learning_rate": 9.045226130653267e-06, - "loss": 1.2951, - "step": 955 - }, - { - "epoch": 0.44, - "grad_norm": 0.3501865267753601, - "learning_rate": 8.844221105527639e-06, - "loss": 1.2195, - "step": 956 - }, - { - "epoch": 0.44, - "grad_norm": 0.35401347279548645, - "learning_rate": 8.64321608040201e-06, - "loss": 1.1406, - "step": 957 - }, - { - "epoch": 0.44, - "grad_norm": 0.3467148542404175, - "learning_rate": 8.442211055276381e-06, - "loss": 1.2076, - "step": 958 - }, - { - "epoch": 0.44, - "grad_norm": 0.5495727062225342, - "learning_rate": 8.241206030150754e-06, - "loss": 1.2051, - "step": 959 - }, - { - "epoch": 0.44, - "grad_norm": 0.34182611107826233, - "learning_rate": 8.040201005025125e-06, - "loss": 1.2087, - "step": 960 - }, - { - "epoch": 0.44, - "grad_norm": 0.33628034591674805, - "learning_rate": 7.839195979899498e-06, - "loss": 1.1792, - "step": 961 - }, - { - "epoch": 0.44, - "grad_norm": 0.3278791904449463, - "learning_rate": 7.63819095477387e-06, - "loss": 1.2596, - "step": 962 - }, - { - "epoch": 0.44, - "grad_norm": 0.35457128286361694, - "learning_rate": 7.437185929648241e-06, - "loss": 1.2581, - "step": 963 - }, - { - "epoch": 0.44, - "grad_norm": 0.3405055105686188, - "learning_rate": 7.236180904522613e-06, - "loss": 1.2284, - "step": 964 - }, - { - "epoch": 0.44, - "grad_norm": 0.3680908679962158, - "learning_rate": 7.035175879396985e-06, - "loss": 1.2552, - "step": 965 - }, - { - "epoch": 0.44, - "grad_norm": 0.34292829036712646, - "learning_rate": 6.834170854271357e-06, - "loss": 1.1868, - "step": 966 - }, - { - "epoch": 0.44, - "grad_norm": 0.35092589259147644, - "learning_rate": 6.633165829145729e-06, - "loss": 1.2687, - "step": 967 - }, - { - "epoch": 0.44, - "grad_norm": 0.34995222091674805, - "learning_rate": 6.4321608040201015e-06, - "loss": 1.221, - "step": 968 - }, - { - "epoch": 0.44, - "grad_norm": 0.32289987802505493, - "learning_rate": 6.231155778894473e-06, - "loss": 1.2722, - "step": 969 - }, - { - "epoch": 0.44, - "grad_norm": 0.3334210515022278, - "learning_rate": 6.030150753768844e-06, - "loss": 1.2668, - "step": 970 - }, - { - "epoch": 0.44, - "grad_norm": 0.3216976523399353, - "learning_rate": 5.829145728643216e-06, - "loss": 1.3364, - "step": 971 - }, - { - "epoch": 0.44, - "grad_norm": 0.33897489309310913, - "learning_rate": 5.628140703517588e-06, - "loss": 1.1858, - "step": 972 - }, - { - "epoch": 0.44, - "grad_norm": 0.33176472783088684, - "learning_rate": 5.4271356783919595e-06, - "loss": 1.2104, - "step": 973 - }, - { - "epoch": 0.45, - "grad_norm": 0.32903414964675903, - "learning_rate": 5.226130653266332e-06, - "loss": 1.2193, - "step": 974 - }, - { - "epoch": 0.45, - "grad_norm": 0.3290814459323883, - "learning_rate": 5.025125628140704e-06, - "loss": 1.2564, - "step": 975 - }, - { - "epoch": 0.45, - "grad_norm": 0.32403564453125, - "learning_rate": 4.824120603015075e-06, - "loss": 1.1738, - "step": 976 - }, - { - "epoch": 0.45, - "grad_norm": 0.3256121277809143, - "learning_rate": 4.623115577889447e-06, - "loss": 1.2547, - "step": 977 - }, - { - "epoch": 0.45, - "grad_norm": 0.3145628571510315, - "learning_rate": 4.422110552763819e-06, - "loss": 1.196, - "step": 978 - }, - { - "epoch": 0.45, - "grad_norm": 0.3379940092563629, - "learning_rate": 4.2211055276381906e-06, - "loss": 1.3159, - "step": 979 - }, - { - "epoch": 0.45, - "grad_norm": 0.33039796352386475, - "learning_rate": 4.020100502512563e-06, - "loss": 1.1869, - "step": 980 - }, - { - "epoch": 0.45, - "grad_norm": 0.3251807391643524, - "learning_rate": 3.819095477386935e-06, - "loss": 1.2726, - "step": 981 - }, - { - "epoch": 0.45, - "grad_norm": 0.33046984672546387, - "learning_rate": 3.6180904522613065e-06, - "loss": 1.2509, - "step": 982 - }, - { - "epoch": 0.45, - "grad_norm": 0.32313692569732666, - "learning_rate": 3.4170854271356786e-06, - "loss": 1.308, - "step": 983 - }, - { - "epoch": 0.45, - "grad_norm": 0.34376001358032227, - "learning_rate": 3.2160804020100507e-06, - "loss": 1.3215, - "step": 984 - }, - { - "epoch": 0.45, - "grad_norm": 0.33600521087646484, - "learning_rate": 3.015075376884422e-06, - "loss": 1.2346, - "step": 985 - }, - { - "epoch": 0.45, - "grad_norm": 0.33546069264411926, - "learning_rate": 2.814070351758794e-06, - "loss": 1.2924, - "step": 986 - }, - { - "epoch": 0.45, - "grad_norm": 0.3507489264011383, - "learning_rate": 2.613065326633166e-06, - "loss": 1.2574, - "step": 987 - }, - { - "epoch": 0.45, - "grad_norm": 0.3476717472076416, - "learning_rate": 2.4120603015075375e-06, - "loss": 1.2874, - "step": 988 - }, - { - "epoch": 0.45, - "grad_norm": 0.3193936347961426, - "learning_rate": 2.2110552763819096e-06, - "loss": 1.2534, - "step": 989 - }, - { - "epoch": 0.45, - "grad_norm": 0.35533928871154785, - "learning_rate": 2.0100502512562813e-06, - "loss": 1.2738, - "step": 990 - }, - { - "epoch": 0.45, - "grad_norm": 0.33337244391441345, - "learning_rate": 1.8090452261306533e-06, - "loss": 1.261, - "step": 991 - }, - { - "epoch": 0.45, - "grad_norm": 0.34682735800743103, - "learning_rate": 1.6080402010050254e-06, - "loss": 1.248, - "step": 992 - }, - { - "epoch": 0.45, - "grad_norm": 0.33268532156944275, - "learning_rate": 1.407035175879397e-06, - "loss": 1.1575, - "step": 993 - }, - { - "epoch": 0.45, - "grad_norm": 0.3305032253265381, - "learning_rate": 1.2060301507537688e-06, - "loss": 1.2118, - "step": 994 - }, - { - "epoch": 0.45, - "grad_norm": 0.3230968117713928, - "learning_rate": 1.0050251256281407e-06, - "loss": 1.1977, - "step": 995 - }, - { - "epoch": 0.46, - "grad_norm": 0.3403209447860718, - "learning_rate": 8.040201005025127e-07, - "loss": 1.2415, - "step": 996 - }, - { - "epoch": 0.46, - "grad_norm": 0.31072068214416504, - "learning_rate": 6.030150753768844e-07, - "loss": 1.1967, - "step": 997 - }, - { - "epoch": 0.46, - "grad_norm": 0.3351586163043976, - "learning_rate": 4.0201005025125634e-07, - "loss": 1.208, - "step": 998 - }, - { - "epoch": 0.46, - "grad_norm": 0.33677101135253906, - "learning_rate": 2.0100502512562817e-07, - "loss": 1.2926, - "step": 999 - }, - { - "epoch": 0.46, - "grad_norm": 0.332905113697052, - "learning_rate": 0.0, - "loss": 1.1932, - "step": 1000 - } - ], - "logging_steps": 1, - "max_steps": 1000, - "num_input_tokens_seen": 0, - "num_train_epochs": 1, - "save_steps": 200, - "total_flos": 4.636526208766771e+17, - "train_batch_size": 4, - "trial_name": null, - "trial_params": null -}