File size: 5,258 Bytes
2b72519 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 |
{
"best_metric": 0.092567577958107,
"best_model_checkpoint": "outputs/checkpoint-384",
"epoch": 6.997722095671982,
"eval_steps": 500,
"global_step": 384,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.36446469248291574,
"grad_norm": 3.342350721359253,
"learning_rate": 1.8000000000000001e-06,
"loss": 0.3727,
"step": 20
},
{
"epoch": 0.7289293849658315,
"grad_norm": 1.456776738166809,
"learning_rate": 3.8000000000000005e-06,
"loss": 0.2857,
"step": 40
},
{
"epoch": 0.9840546697038725,
"eval_loss": 0.1871582418680191,
"eval_runtime": 48.9194,
"eval_samples_per_second": 3.434,
"eval_steps_per_second": 0.429,
"step": 54
},
{
"epoch": 1.0933940774487472,
"grad_norm": 0.6021007895469666,
"learning_rate": 5.8e-06,
"loss": 0.1809,
"step": 60
},
{
"epoch": 1.4578587699316627,
"grad_norm": 0.4066278040409088,
"learning_rate": 7.800000000000002e-06,
"loss": 0.1344,
"step": 80
},
{
"epoch": 1.8223234624145785,
"grad_norm": 1.681560754776001,
"learning_rate": 9.800000000000001e-06,
"loss": 0.1242,
"step": 100
},
{
"epoch": 1.9863325740318907,
"eval_loss": 0.1359640508890152,
"eval_runtime": 48.8699,
"eval_samples_per_second": 3.438,
"eval_steps_per_second": 0.43,
"step": 109
},
{
"epoch": 2.1867881548974943,
"grad_norm": 0.459069162607193,
"learning_rate": 9.984149663879994e-06,
"loss": 0.1039,
"step": 120
},
{
"epoch": 2.55125284738041,
"grad_norm": 0.594571590423584,
"learning_rate": 9.929487384240103e-06,
"loss": 0.0874,
"step": 140
},
{
"epoch": 2.9157175398633255,
"grad_norm": 0.40587714314460754,
"learning_rate": 9.83624518217252e-06,
"loss": 0.0862,
"step": 160
},
{
"epoch": 2.988610478359909,
"eval_loss": 0.11714969575405121,
"eval_runtime": 48.9241,
"eval_samples_per_second": 3.434,
"eval_steps_per_second": 0.429,
"step": 164
},
{
"epoch": 3.2801822323462413,
"grad_norm": 0.5911823511123657,
"learning_rate": 9.705152804330872e-06,
"loss": 0.0833,
"step": 180
},
{
"epoch": 3.644646924829157,
"grad_norm": 0.6380645036697388,
"learning_rate": 9.53723622631339e-06,
"loss": 0.0754,
"step": 200
},
{
"epoch": 3.990888382687927,
"eval_loss": 0.10540632158517838,
"eval_runtime": 48.8751,
"eval_samples_per_second": 3.437,
"eval_steps_per_second": 0.43,
"step": 219
},
{
"epoch": 4.009111617312073,
"grad_norm": 0.5483060479164124,
"learning_rate": 9.333809623012763e-06,
"loss": 0.0731,
"step": 220
},
{
"epoch": 4.373576309794989,
"grad_norm": 0.5601429343223572,
"learning_rate": 9.096465083415809e-06,
"loss": 0.0679,
"step": 240
},
{
"epoch": 4.738041002277904,
"grad_norm": 0.5731651782989502,
"learning_rate": 8.82706015034849e-06,
"loss": 0.0622,
"step": 260
},
{
"epoch": 4.993166287015946,
"eval_loss": 0.09794322401285172,
"eval_runtime": 48.874,
"eval_samples_per_second": 3.437,
"eval_steps_per_second": 0.43,
"step": 274
},
{
"epoch": 5.10250569476082,
"grad_norm": 0.5230545401573181,
"learning_rate": 8.527703282684766e-06,
"loss": 0.062,
"step": 280
},
{
"epoch": 5.466970387243736,
"grad_norm": 0.631493330001831,
"learning_rate": 8.20073735379715e-06,
"loss": 0.0577,
"step": 300
},
{
"epoch": 5.831435079726651,
"grad_norm": 0.7749846577644348,
"learning_rate": 7.848721315395975e-06,
"loss": 0.0576,
"step": 320
},
{
"epoch": 5.995444191343964,
"eval_loss": 0.09521909803152084,
"eval_runtime": 48.8686,
"eval_samples_per_second": 3.438,
"eval_steps_per_second": 0.43,
"step": 329
},
{
"epoch": 6.195899772209567,
"grad_norm": 0.5534420609474182,
"learning_rate": 7.474410170262719e-06,
"loss": 0.0585,
"step": 340
},
{
"epoch": 6.560364464692483,
"grad_norm": 0.7711396217346191,
"learning_rate": 7.080733410617853e-06,
"loss": 0.0516,
"step": 360
},
{
"epoch": 6.924829157175399,
"grad_norm": 0.6441643238067627,
"learning_rate": 6.6707720908722795e-06,
"loss": 0.0464,
"step": 380
},
{
"epoch": 6.997722095671982,
"eval_loss": 0.092567577958107,
"eval_runtime": 48.9182,
"eval_samples_per_second": 3.434,
"eval_steps_per_second": 0.429,
"step": 384
}
],
"logging_steps": 20,
"max_steps": 810,
"num_input_tokens_seen": 0,
"num_train_epochs": 15,
"save_steps": 500,
"total_flos": 1.436627161602048e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}
|