{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9591836734693877,
  "eval_steps": 500,
  "global_step": 300,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0326530612244898,
      "grad_norm": 0.3159657120704651,
      "learning_rate": 4.9985361990992455e-05,
      "loss": 0.1654,
      "step": 5
    },
    {
      "epoch": 0.0653061224489796,
      "grad_norm": 0.32706642150878906,
      "learning_rate": 4.9941465105674435e-05,
      "loss": 0.1369,
      "step": 10
    },
    {
      "epoch": 0.09795918367346938,
      "grad_norm": 0.33407843112945557,
      "learning_rate": 4.986836074908616e-05,
      "loss": 0.1259,
      "step": 15
    },
    {
      "epoch": 0.1306122448979592,
      "grad_norm": 0.3189881443977356,
      "learning_rate": 4.976613452940604e-05,
      "loss": 0.1041,
      "step": 20
    },
    {
      "epoch": 0.16326530612244897,
      "grad_norm": 0.3424989581108093,
      "learning_rate": 4.9634906157700036e-05,
      "loss": 0.1004,
      "step": 25
    },
    {
      "epoch": 0.19591836734693877,
      "grad_norm": 0.3253389298915863,
      "learning_rate": 4.9474829307735115e-05,
      "loss": 0.0941,
      "step": 30
    },
    {
      "epoch": 0.22857142857142856,
      "grad_norm": 0.26078635454177856,
      "learning_rate": 4.9286091436021015e-05,
      "loss": 0.0867,
      "step": 35
    },
    {
      "epoch": 0.2612244897959184,
      "grad_norm": 0.252139687538147,
      "learning_rate": 4.906891356229103e-05,
      "loss": 0.0853,
      "step": 40
    },
    {
      "epoch": 0.2938775510204082,
      "grad_norm": 0.3403972387313843,
      "learning_rate": 4.882355001067892e-05,
      "loss": 0.0863,
      "step": 45
    },
    {
      "epoch": 0.32653061224489793,
      "grad_norm": 0.4710679352283478,
      "learning_rate": 4.855028811189496e-05,
      "loss": 0.0874,
      "step": 50
    },
    {
      "epoch": 0.35918367346938773,
      "grad_norm": 0.3147217929363251,
      "learning_rate": 4.8249447866750025e-05,
      "loss": 0.0733,
      "step": 55
    },
    {
      "epoch": 0.39183673469387753,
      "grad_norm": 0.3265310823917389,
      "learning_rate": 4.792138157142158e-05,
      "loss": 0.0719,
      "step": 60
    },
    {
      "epoch": 0.42448979591836733,
      "grad_norm": 0.35432252287864685,
      "learning_rate": 4.75664734049005e-05,
      "loss": 0.0824,
      "step": 65
    },
    {
      "epoch": 0.45714285714285713,
      "grad_norm": 0.3701626658439636,
      "learning_rate": 4.7185138979101864e-05,
      "loss": 0.0731,
      "step": 70
    },
    {
      "epoch": 0.4897959183673469,
      "grad_norm": 0.35868266224861145,
      "learning_rate": 4.677782485216644e-05,
      "loss": 0.0725,
      "step": 75
    },
    {
      "epoch": 0.5224489795918368,
      "grad_norm": 0.32440632581710815,
      "learning_rate": 4.6345008005522966e-05,
      "loss": 0.0694,
      "step": 80
    },
    {
      "epoch": 0.5551020408163265,
      "grad_norm": 0.3003002405166626,
      "learning_rate": 4.588719528532342e-05,
      "loss": 0.072,
      "step": 85
    },
    {
      "epoch": 0.5877551020408164,
      "grad_norm": 0.34989920258522034,
      "learning_rate": 4.540492280890555e-05,
      "loss": 0.0646,
      "step": 90
    },
    {
      "epoch": 0.6204081632653061,
      "grad_norm": 0.473254919052124,
      "learning_rate": 4.4898755336977673e-05,
      "loss": 0.0732,
      "step": 95
    },
    {
      "epoch": 0.6530612244897959,
      "grad_norm": 0.30768489837646484,
      "learning_rate": 4.436928561226087e-05,
      "loss": 0.068,
      "step": 100
    },
    {
      "epoch": 0.6857142857142857,
      "grad_norm": 0.31777673959732056,
      "learning_rate": 4.381713366536311e-05,
      "loss": 0.0749,
      "step": 105
    },
    {
      "epoch": 0.7183673469387755,
      "grad_norm": 0.40202295780181885,
      "learning_rate": 4.324294608869817e-05,
      "loss": 0.0652,
      "step": 110
    },
    {
      "epoch": 0.7510204081632653,
      "grad_norm": 0.4353463053703308,
      "learning_rate": 4.264739527929959e-05,
      "loss": 0.0562,
      "step": 115
    },
    {
      "epoch": 0.7836734693877551,
      "grad_norm": 0.4079282879829407,
      "learning_rate": 4.203117865141635e-05,
      "loss": 0.0602,
      "step": 120
    },
    {
      "epoch": 0.8163265306122449,
      "grad_norm": 0.40490931272506714,
      "learning_rate": 4.1395017819812445e-05,
      "loss": 0.0639,
      "step": 125
    },
    {
      "epoch": 0.8489795918367347,
      "grad_norm": 0.423981636762619,
      "learning_rate": 4.07396577547265e-05,
      "loss": 0.0651,
      "step": 130
    },
    {
      "epoch": 0.8816326530612245,
      "grad_norm": 0.3315489590167999,
      "learning_rate": 4.0065865909481417e-05,
      "loss": 0.0676,
      "step": 135
    },
    {
      "epoch": 0.9142857142857143,
      "grad_norm": 0.560667097568512,
      "learning_rate": 3.937443132176517e-05,
      "loss": 0.0669,
      "step": 140
    },
    {
      "epoch": 0.9469387755102041,
      "grad_norm": 0.4202517569065094,
      "learning_rate": 3.8666163689635616e-05,
      "loss": 0.0631,
      "step": 145
    },
    {
      "epoch": 0.9795918367346939,
      "grad_norm": 0.4563729465007782,
      "learning_rate": 3.794189242333106e-05,
      "loss": 0.0649,
      "step": 150
    },
    {
      "epoch": 1.0122448979591836,
      "grad_norm": 0.3522864580154419,
      "learning_rate": 3.720246567399712e-05,
      "loss": 0.059,
      "step": 155
    },
    {
      "epoch": 1.0448979591836736,
      "grad_norm": 0.38459908962249756,
      "learning_rate": 3.644874934046716e-05,
      "loss": 0.062,
      "step": 160
    },
    {
      "epoch": 1.0775510204081633,
      "grad_norm": 0.37406954169273376,
      "learning_rate": 3.568162605525953e-05,
      "loss": 0.0565,
      "step": 165
    },
    {
      "epoch": 1.110204081632653,
      "grad_norm": 0.3395706117153168,
      "learning_rate": 3.490199415097892e-05,
      "loss": 0.0575,
      "step": 170
    },
    {
      "epoch": 1.1428571428571428,
      "grad_norm": 0.31273558735847473,
      "learning_rate": 3.4110766608332347e-05,
      "loss": 0.0589,
      "step": 175
    },
    {
      "epoch": 1.1755102040816325,
      "grad_norm": 0.44675061106681824,
      "learning_rate": 3.330886998699149e-05,
      "loss": 0.0611,
      "step": 180
    },
    {
      "epoch": 1.2081632653061225,
      "grad_norm": 0.34769827127456665,
      "learning_rate": 3.249724334055367e-05,
      "loss": 0.062,
      "step": 185
    },
    {
      "epoch": 1.2408163265306122,
      "grad_norm": 0.31832146644592285,
      "learning_rate": 3.167683711687179e-05,
      "loss": 0.0616,
      "step": 190
    },
    {
      "epoch": 1.273469387755102,
      "grad_norm": 0.4153653085231781,
      "learning_rate": 3.084861204504122e-05,
      "loss": 0.0586,
      "step": 195
    },
    {
      "epoch": 1.306122448979592,
      "grad_norm": 0.36249643564224243,
      "learning_rate": 3.001353801034688e-05,
      "loss": 0.0578,
      "step": 200
    },
    {
      "epoch": 1.3387755102040817,
      "grad_norm": 0.43447113037109375,
      "learning_rate": 2.917259291848814e-05,
      "loss": 0.0547,
      "step": 205
    },
    {
      "epoch": 1.3714285714285714,
      "grad_norm": 0.3350294530391693,
      "learning_rate": 2.8326761550411345e-05,
      "loss": 0.0581,
      "step": 210
    },
    {
      "epoch": 1.4040816326530612,
      "grad_norm": 0.34255513548851013,
      "learning_rate": 2.747703440909128e-05,
      "loss": 0.0535,
      "step": 215
    },
    {
      "epoch": 1.436734693877551,
      "grad_norm": 0.6323994398117065,
      "learning_rate": 2.662440655961183e-05,
      "loss": 0.0666,
      "step": 220
    },
    {
      "epoch": 1.469387755102041,
      "grad_norm": 0.3421262502670288,
      "learning_rate": 2.5769876463904265e-05,
      "loss": 0.053,
      "step": 225
    },
    {
      "epoch": 1.5020408163265306,
      "grad_norm": 0.4133753478527069,
      "learning_rate": 2.491444481150763e-05,
      "loss": 0.0602,
      "step": 230
    },
    {
      "epoch": 1.5346938775510204,
      "grad_norm": 0.37677302956581116,
      "learning_rate": 2.4059113347720574e-05,
      "loss": 0.0573,
      "step": 235
    },
    {
      "epoch": 1.5673469387755103,
      "grad_norm": 0.3683429956436157,
      "learning_rate": 2.3204883700516812e-05,
      "loss": 0.0544,
      "step": 240
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.34340280294418335,
      "learning_rate": 2.235275620759797e-05,
      "loss": 0.0552,
      "step": 245
    },
    {
      "epoch": 1.6326530612244898,
      "grad_norm": 0.39541885256767273,
      "learning_rate": 2.150372874495739e-05,
      "loss": 0.0543,
      "step": 250
    },
    {
      "epoch": 1.6653061224489796,
      "grad_norm": 0.384330689907074,
      "learning_rate": 2.0658795558326743e-05,
      "loss": 0.0481,
      "step": 255
    },
    {
      "epoch": 1.6979591836734693,
      "grad_norm": 0.4634612798690796,
      "learning_rate": 1.9818946098873766e-05,
      "loss": 0.052,
      "step": 260
    },
    {
      "epoch": 1.730612244897959,
      "grad_norm": 0.4186592400074005,
      "learning_rate": 1.8985163864514645e-05,
      "loss": 0.0516,
      "step": 265
    },
    {
      "epoch": 1.763265306122449,
      "grad_norm": 0.4726428985595703,
      "learning_rate": 1.815842524819793e-05,
      "loss": 0.0559,
      "step": 270
    },
    {
      "epoch": 1.7959183673469388,
      "grad_norm": 0.38660183548927307,
      "learning_rate": 1.733969839450863e-05,
      "loss": 0.0554,
      "step": 275
    },
    {
      "epoch": 1.8285714285714287,
      "grad_norm": 0.40636590123176575,
      "learning_rate": 1.6529942065931477e-05,
      "loss": 0.0576,
      "step": 280
    },
    {
      "epoch": 1.8612244897959185,
      "grad_norm": 0.3667762875556946,
      "learning_rate": 1.5730104520100982e-05,
      "loss": 0.0546,
      "step": 285
    },
    {
      "epoch": 1.8938775510204082,
      "grad_norm": 0.38867512345314026,
      "learning_rate": 1.4941122399353185e-05,
      "loss": 0.0539,
      "step": 290
    },
    {
      "epoch": 1.926530612244898,
      "grad_norm": 0.37944862246513367,
      "learning_rate": 1.4163919633879324e-05,
      "loss": 0.0537,
      "step": 295
    },
    {
      "epoch": 1.9591836734693877,
      "grad_norm": 0.394191175699234,
      "learning_rate": 1.339940635976592e-05,
      "loss": 0.0493,
      "step": 300
    }
  ],
  "logging_steps": 5,
  "max_steps": 459,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "total_flos": 1.8642398036380877e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}