aseratus1 commited on
Commit
353cfad
·
verified ·
1 Parent(s): 1053da1

Training in progress, step 488, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2900591d81d085d6f1a16d411604f9a3883bb62f7ce4c7d8c00ac8e3b37106be
3
  size 140815952
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da6cb8e3ceb59865787e3be19146ecfe8f090e00a5e13cdf927c08d575198fae
3
  size 140815952
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5e0f855f277f1cfe633417c3619eeb88bc0fcf9e01649d46a606ce490c582477
3
  size 71878996
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b4957b732848f725fe8e97d4887a86c7374d04e7a61a5f57433336b19c5f05d6
3
  size 71878996
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c86ead3108943626e936579b25ad4d0313259ed7a62597799a2f43e75bbd8bf5
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:41b8611fadb80f1bdbbbba5e4fc8c3638269de0aed2e0197a291cadd4ef1f1ad
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4890e18ed32a9392f71679e2e5ba429d90989986c0245d64b82048dee600522d
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e7fb2e4e3f7c67f2979e17d634f1bb48e2792b76f31dccbd7feec15021f43a0
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": 0.813357412815094,
3
  "best_model_checkpoint": "miner_id_24/checkpoint-400",
4
- "epoch": 0.8205128205128205,
5
  "eval_steps": 100,
6
- "global_step": 400,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -2847,6 +2847,622 @@
2847
  "eval_samples_per_second": 58.256,
2848
  "eval_steps_per_second": 14.599,
2849
  "step": 400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2850
  }
2851
  ],
2852
  "logging_steps": 1,
@@ -2870,12 +3486,12 @@
2870
  "should_evaluate": false,
2871
  "should_log": false,
2872
  "should_save": true,
2873
- "should_training_stop": false
2874
  },
2875
  "attributes": {}
2876
  }
2877
  },
2878
- "total_flos": 3.09139297271808e+16,
2879
  "train_batch_size": 8,
2880
  "trial_name": null,
2881
  "trial_params": null
 
1
  {
2
  "best_metric": 0.813357412815094,
3
  "best_model_checkpoint": "miner_id_24/checkpoint-400",
4
+ "epoch": 1.001025641025641,
5
  "eval_steps": 100,
6
+ "global_step": 488,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
2847
  "eval_samples_per_second": 58.256,
2848
  "eval_steps_per_second": 14.599,
2849
  "step": 400
2850
+ },
2851
+ {
2852
+ "epoch": 0.8225641025641026,
2853
+ "grad_norm": 2.1067893505096436,
2854
+ "learning_rate": 8.287193065856935e-06,
2855
+ "loss": 0.857,
2856
+ "step": 401
2857
+ },
2858
+ {
2859
+ "epoch": 0.8246153846153846,
2860
+ "grad_norm": 2.872138261795044,
2861
+ "learning_rate": 8.103069939070945e-06,
2862
+ "loss": 0.9339,
2863
+ "step": 402
2864
+ },
2865
+ {
2866
+ "epoch": 0.8266666666666667,
2867
+ "grad_norm": 2.640960216522217,
2868
+ "learning_rate": 7.920834754120304e-06,
2869
+ "loss": 0.7526,
2870
+ "step": 403
2871
+ },
2872
+ {
2873
+ "epoch": 0.8287179487179487,
2874
+ "grad_norm": 2.455273389816284,
2875
+ "learning_rate": 7.740495722810271e-06,
2876
+ "loss": 0.8467,
2877
+ "step": 404
2878
+ },
2879
+ {
2880
+ "epoch": 0.8307692307692308,
2881
+ "grad_norm": 3.302844285964966,
2882
+ "learning_rate": 7.562060971502383e-06,
2883
+ "loss": 0.7286,
2884
+ "step": 405
2885
+ },
2886
+ {
2887
+ "epoch": 0.8328205128205128,
2888
+ "grad_norm": 2.6610546112060547,
2889
+ "learning_rate": 7.385538540748327e-06,
2890
+ "loss": 0.8152,
2891
+ "step": 406
2892
+ },
2893
+ {
2894
+ "epoch": 0.8348717948717949,
2895
+ "grad_norm": 2.261064291000366,
2896
+ "learning_rate": 7.21093638492763e-06,
2897
+ "loss": 0.6278,
2898
+ "step": 407
2899
+ },
2900
+ {
2901
+ "epoch": 0.8369230769230769,
2902
+ "grad_norm": 3.7655062675476074,
2903
+ "learning_rate": 7.038262371889159e-06,
2904
+ "loss": 0.8202,
2905
+ "step": 408
2906
+ },
2907
+ {
2908
+ "epoch": 0.838974358974359,
2909
+ "grad_norm": 2.8197319507598877,
2910
+ "learning_rate": 6.867524282596655e-06,
2911
+ "loss": 0.8749,
2912
+ "step": 409
2913
+ },
2914
+ {
2915
+ "epoch": 0.841025641025641,
2916
+ "grad_norm": 2.7082679271698,
2917
+ "learning_rate": 6.698729810778065e-06,
2918
+ "loss": 0.7821,
2919
+ "step": 410
2920
+ },
2921
+ {
2922
+ "epoch": 0.8430769230769231,
2923
+ "grad_norm": 2.2520222663879395,
2924
+ "learning_rate": 6.531886562578859e-06,
2925
+ "loss": 0.5002,
2926
+ "step": 411
2927
+ },
2928
+ {
2929
+ "epoch": 0.8451282051282051,
2930
+ "grad_norm": 2.5191712379455566,
2931
+ "learning_rate": 6.367002056219284e-06,
2932
+ "loss": 0.7014,
2933
+ "step": 412
2934
+ },
2935
+ {
2936
+ "epoch": 0.8471794871794872,
2937
+ "grad_norm": 5.02801513671875,
2938
+ "learning_rate": 6.204083721655607e-06,
2939
+ "loss": 0.8951,
2940
+ "step": 413
2941
+ },
2942
+ {
2943
+ "epoch": 0.8492307692307692,
2944
+ "grad_norm": 2.9786057472229004,
2945
+ "learning_rate": 6.043138900245277e-06,
2946
+ "loss": 0.9056,
2947
+ "step": 414
2948
+ },
2949
+ {
2950
+ "epoch": 0.8512820512820513,
2951
+ "grad_norm": 2.791360378265381,
2952
+ "learning_rate": 5.884174844416102e-06,
2953
+ "loss": 0.7512,
2954
+ "step": 415
2955
+ },
2956
+ {
2957
+ "epoch": 0.8533333333333334,
2958
+ "grad_norm": 2.5517685413360596,
2959
+ "learning_rate": 5.727198717339511e-06,
2960
+ "loss": 0.7028,
2961
+ "step": 416
2962
+ },
2963
+ {
2964
+ "epoch": 0.8553846153846154,
2965
+ "grad_norm": 2.8642799854278564,
2966
+ "learning_rate": 5.572217592607687e-06,
2967
+ "loss": 0.8472,
2968
+ "step": 417
2969
+ },
2970
+ {
2971
+ "epoch": 0.8574358974358974,
2972
+ "grad_norm": 2.9429116249084473,
2973
+ "learning_rate": 5.41923845391486e-06,
2974
+ "loss": 0.5711,
2975
+ "step": 418
2976
+ },
2977
+ {
2978
+ "epoch": 0.8594871794871795,
2979
+ "grad_norm": 2.486117124557495,
2980
+ "learning_rate": 5.2682681947426375e-06,
2981
+ "loss": 0.5636,
2982
+ "step": 419
2983
+ },
2984
+ {
2985
+ "epoch": 0.8615384615384616,
2986
+ "grad_norm": 3.1933610439300537,
2987
+ "learning_rate": 5.1193136180493095e-06,
2988
+ "loss": 0.6232,
2989
+ "step": 420
2990
+ },
2991
+ {
2992
+ "epoch": 0.8635897435897436,
2993
+ "grad_norm": 3.0535292625427246,
2994
+ "learning_rate": 4.97238143596333e-06,
2995
+ "loss": 0.7645,
2996
+ "step": 421
2997
+ },
2998
+ {
2999
+ "epoch": 0.8656410256410256,
3000
+ "grad_norm": 2.3928494453430176,
3001
+ "learning_rate": 4.827478269480895e-06,
3002
+ "loss": 0.4642,
3003
+ "step": 422
3004
+ },
3005
+ {
3006
+ "epoch": 0.8676923076923077,
3007
+ "grad_norm": 3.066641330718994,
3008
+ "learning_rate": 4.684610648167503e-06,
3009
+ "loss": 0.8899,
3010
+ "step": 423
3011
+ },
3012
+ {
3013
+ "epoch": 0.8697435897435898,
3014
+ "grad_norm": 2.6845340728759766,
3015
+ "learning_rate": 4.54378500986381e-06,
3016
+ "loss": 0.6358,
3017
+ "step": 424
3018
+ },
3019
+ {
3020
+ "epoch": 0.8717948717948718,
3021
+ "grad_norm": 3.794255495071411,
3022
+ "learning_rate": 4.405007700395497e-06,
3023
+ "loss": 0.7231,
3024
+ "step": 425
3025
+ },
3026
+ {
3027
+ "epoch": 0.8738461538461538,
3028
+ "grad_norm": 3.2792623043060303,
3029
+ "learning_rate": 4.268284973287273e-06,
3030
+ "loss": 0.6754,
3031
+ "step": 426
3032
+ },
3033
+ {
3034
+ "epoch": 0.8758974358974358,
3035
+ "grad_norm": 4.019685745239258,
3036
+ "learning_rate": 4.133622989481145e-06,
3037
+ "loss": 0.8601,
3038
+ "step": 427
3039
+ },
3040
+ {
3041
+ "epoch": 0.877948717948718,
3042
+ "grad_norm": 5.093227386474609,
3043
+ "learning_rate": 4.001027817058789e-06,
3044
+ "loss": 0.9993,
3045
+ "step": 428
3046
+ },
3047
+ {
3048
+ "epoch": 0.88,
3049
+ "grad_norm": 3.0464017391204834,
3050
+ "learning_rate": 3.870505430968069e-06,
3051
+ "loss": 0.6586,
3052
+ "step": 429
3053
+ },
3054
+ {
3055
+ "epoch": 0.882051282051282,
3056
+ "grad_norm": 3.241689920425415,
3057
+ "learning_rate": 3.7420617127538248e-06,
3058
+ "loss": 0.7764,
3059
+ "step": 430
3060
+ },
3061
+ {
3062
+ "epoch": 0.884102564102564,
3063
+ "grad_norm": 3.573296546936035,
3064
+ "learning_rate": 3.615702450292857e-06,
3065
+ "loss": 0.7413,
3066
+ "step": 431
3067
+ },
3068
+ {
3069
+ "epoch": 0.8861538461538462,
3070
+ "grad_norm": 3.759922742843628,
3071
+ "learning_rate": 3.4914333375330898e-06,
3072
+ "loss": 0.7581,
3073
+ "step": 432
3074
+ },
3075
+ {
3076
+ "epoch": 0.8882051282051282,
3077
+ "grad_norm": 3.483435869216919,
3078
+ "learning_rate": 3.369259974236988e-06,
3079
+ "loss": 0.5884,
3080
+ "step": 433
3081
+ },
3082
+ {
3083
+ "epoch": 0.8902564102564102,
3084
+ "grad_norm": 3.6014602184295654,
3085
+ "learning_rate": 3.249187865729264e-06,
3086
+ "loss": 0.7147,
3087
+ "step": 434
3088
+ },
3089
+ {
3090
+ "epoch": 0.8923076923076924,
3091
+ "grad_norm": 3.7031497955322266,
3092
+ "learning_rate": 3.1312224226487442e-06,
3093
+ "loss": 0.7788,
3094
+ "step": 435
3095
+ },
3096
+ {
3097
+ "epoch": 0.8943589743589744,
3098
+ "grad_norm": 3.692850351333618,
3099
+ "learning_rate": 3.0153689607045845e-06,
3100
+ "loss": 0.7269,
3101
+ "step": 436
3102
+ },
3103
+ {
3104
+ "epoch": 0.8964102564102564,
3105
+ "grad_norm": 3.5270211696624756,
3106
+ "learning_rate": 2.901632700436757e-06,
3107
+ "loss": 0.7998,
3108
+ "step": 437
3109
+ },
3110
+ {
3111
+ "epoch": 0.8984615384615384,
3112
+ "grad_norm": 3.858431577682495,
3113
+ "learning_rate": 2.790018766980773e-06,
3114
+ "loss": 0.8085,
3115
+ "step": 438
3116
+ },
3117
+ {
3118
+ "epoch": 0.9005128205128206,
3119
+ "grad_norm": 3.0940706729888916,
3120
+ "learning_rate": 2.680532189836732e-06,
3121
+ "loss": 0.6352,
3122
+ "step": 439
3123
+ },
3124
+ {
3125
+ "epoch": 0.9025641025641026,
3126
+ "grad_norm": 4.406203269958496,
3127
+ "learning_rate": 2.573177902642726e-06,
3128
+ "loss": 0.8474,
3129
+ "step": 440
3130
+ },
3131
+ {
3132
+ "epoch": 0.9046153846153846,
3133
+ "grad_norm": 3.506132125854492,
3134
+ "learning_rate": 2.467960742952463e-06,
3135
+ "loss": 0.5986,
3136
+ "step": 441
3137
+ },
3138
+ {
3139
+ "epoch": 0.9066666666666666,
3140
+ "grad_norm": 4.049818992614746,
3141
+ "learning_rate": 2.3648854520173237e-06,
3142
+ "loss": 0.821,
3143
+ "step": 442
3144
+ },
3145
+ {
3146
+ "epoch": 0.9087179487179488,
3147
+ "grad_norm": 3.5991644859313965,
3148
+ "learning_rate": 2.2639566745727205e-06,
3149
+ "loss": 0.8375,
3150
+ "step": 443
3151
+ },
3152
+ {
3153
+ "epoch": 0.9107692307692308,
3154
+ "grad_norm": 4.406035900115967,
3155
+ "learning_rate": 2.1651789586287442e-06,
3156
+ "loss": 0.797,
3157
+ "step": 444
3158
+ },
3159
+ {
3160
+ "epoch": 0.9128205128205128,
3161
+ "grad_norm": 5.088395595550537,
3162
+ "learning_rate": 2.068556755265272e-06,
3163
+ "loss": 0.8375,
3164
+ "step": 445
3165
+ },
3166
+ {
3167
+ "epoch": 0.9148717948717948,
3168
+ "grad_norm": 7.348681926727295,
3169
+ "learning_rate": 1.974094418431388e-06,
3170
+ "loss": 0.7428,
3171
+ "step": 446
3172
+ },
3173
+ {
3174
+ "epoch": 0.916923076923077,
3175
+ "grad_norm": 5.225501537322998,
3176
+ "learning_rate": 1.8817962047491699e-06,
3177
+ "loss": 0.8446,
3178
+ "step": 447
3179
+ },
3180
+ {
3181
+ "epoch": 0.918974358974359,
3182
+ "grad_norm": 5.816100120544434,
3183
+ "learning_rate": 1.7916662733218847e-06,
3184
+ "loss": 1.1185,
3185
+ "step": 448
3186
+ },
3187
+ {
3188
+ "epoch": 0.921025641025641,
3189
+ "grad_norm": 6.379942417144775,
3190
+ "learning_rate": 1.70370868554659e-06,
3191
+ "loss": 1.1595,
3192
+ "step": 449
3193
+ },
3194
+ {
3195
+ "epoch": 0.9230769230769231,
3196
+ "grad_norm": 11.633496284484863,
3197
+ "learning_rate": 1.6179274049310966e-06,
3198
+ "loss": 1.8986,
3199
+ "step": 450
3200
+ },
3201
+ {
3202
+ "epoch": 0.9251282051282051,
3203
+ "grad_norm": 2.403395175933838,
3204
+ "learning_rate": 1.5343262969153783e-06,
3205
+ "loss": 0.8987,
3206
+ "step": 451
3207
+ },
3208
+ {
3209
+ "epoch": 0.9271794871794872,
3210
+ "grad_norm": 2.5767834186553955,
3211
+ "learning_rate": 1.4529091286973995e-06,
3212
+ "loss": 0.7343,
3213
+ "step": 452
3214
+ },
3215
+ {
3216
+ "epoch": 0.9292307692307692,
3217
+ "grad_norm": 2.610196352005005,
3218
+ "learning_rate": 1.3736795690633354e-06,
3219
+ "loss": 0.808,
3220
+ "step": 453
3221
+ },
3222
+ {
3223
+ "epoch": 0.9312820512820513,
3224
+ "grad_norm": 3.0826358795166016,
3225
+ "learning_rate": 1.2966411882222696e-06,
3226
+ "loss": 0.8493,
3227
+ "step": 454
3228
+ },
3229
+ {
3230
+ "epoch": 0.9333333333333333,
3231
+ "grad_norm": 2.933225154876709,
3232
+ "learning_rate": 1.2217974576453073e-06,
3233
+ "loss": 0.7868,
3234
+ "step": 455
3235
+ },
3236
+ {
3237
+ "epoch": 0.9353846153846154,
3238
+ "grad_norm": 2.5828311443328857,
3239
+ "learning_rate": 1.1491517499091498e-06,
3240
+ "loss": 0.6543,
3241
+ "step": 456
3242
+ },
3243
+ {
3244
+ "epoch": 0.9374358974358974,
3245
+ "grad_norm": 2.710909605026245,
3246
+ "learning_rate": 1.0787073385441048e-06,
3247
+ "loss": 0.6686,
3248
+ "step": 457
3249
+ },
3250
+ {
3251
+ "epoch": 0.9394871794871795,
3252
+ "grad_norm": 2.428332567214966,
3253
+ "learning_rate": 1.0104673978866164e-06,
3254
+ "loss": 0.6812,
3255
+ "step": 458
3256
+ },
3257
+ {
3258
+ "epoch": 0.9415384615384615,
3259
+ "grad_norm": 3.3245909214019775,
3260
+ "learning_rate": 9.44435002936167e-07,
3261
+ "loss": 0.8798,
3262
+ "step": 459
3263
+ },
3264
+ {
3265
+ "epoch": 0.9435897435897436,
3266
+ "grad_norm": 3.67254376411438,
3267
+ "learning_rate": 8.806131292167618e-07,
3268
+ "loss": 0.8829,
3269
+ "step": 460
3270
+ },
3271
+ {
3272
+ "epoch": 0.9456410256410256,
3273
+ "grad_norm": 2.8118197917938232,
3274
+ "learning_rate": 8.190046526428242e-07,
3275
+ "loss": 0.6889,
3276
+ "step": 461
3277
+ },
3278
+ {
3279
+ "epoch": 0.9476923076923077,
3280
+ "grad_norm": 2.5844786167144775,
3281
+ "learning_rate": 7.596123493895991e-07,
3282
+ "loss": 0.8317,
3283
+ "step": 462
3284
+ },
3285
+ {
3286
+ "epoch": 0.9497435897435897,
3287
+ "grad_norm": 3.378085136413574,
3288
+ "learning_rate": 7.024388957680705e-07,
3289
+ "loss": 0.7873,
3290
+ "step": 463
3291
+ },
3292
+ {
3293
+ "epoch": 0.9517948717948718,
3294
+ "grad_norm": 3.895148992538452,
3295
+ "learning_rate": 6.474868681043578e-07,
3296
+ "loss": 0.9465,
3297
+ "step": 464
3298
+ },
3299
+ {
3300
+ "epoch": 0.9538461538461539,
3301
+ "grad_norm": 2.4344851970672607,
3302
+ "learning_rate": 5.947587426236078e-07,
3303
+ "loss": 0.5724,
3304
+ "step": 465
3305
+ },
3306
+ {
3307
+ "epoch": 0.9558974358974359,
3308
+ "grad_norm": 3.014648199081421,
3309
+ "learning_rate": 5.442568953384186e-07,
3310
+ "loss": 0.72,
3311
+ "step": 466
3312
+ },
3313
+ {
3314
+ "epoch": 0.9579487179487179,
3315
+ "grad_norm": 3.841538429260254,
3316
+ "learning_rate": 4.959836019417963e-07,
3317
+ "loss": 0.7759,
3318
+ "step": 467
3319
+ },
3320
+ {
3321
+ "epoch": 0.96,
3322
+ "grad_norm": 2.5754928588867188,
3323
+ "learning_rate": 4.4994103770457653e-07,
3324
+ "loss": 0.579,
3325
+ "step": 468
3326
+ },
3327
+ {
3328
+ "epoch": 0.9620512820512821,
3329
+ "grad_norm": 3.4755969047546387,
3330
+ "learning_rate": 4.06131277377414e-07,
3331
+ "loss": 0.8353,
3332
+ "step": 469
3333
+ },
3334
+ {
3335
+ "epoch": 0.9641025641025641,
3336
+ "grad_norm": 3.1296160221099854,
3337
+ "learning_rate": 3.6455629509730136e-07,
3338
+ "loss": 0.8424,
3339
+ "step": 470
3340
+ },
3341
+ {
3342
+ "epoch": 0.9661538461538461,
3343
+ "grad_norm": 2.8293471336364746,
3344
+ "learning_rate": 3.2521796429859084e-07,
3345
+ "loss": 0.6852,
3346
+ "step": 471
3347
+ },
3348
+ {
3349
+ "epoch": 0.9682051282051282,
3350
+ "grad_norm": 3.9338464736938477,
3351
+ "learning_rate": 2.8811805762860576e-07,
3352
+ "loss": 0.7015,
3353
+ "step": 472
3354
+ },
3355
+ {
3356
+ "epoch": 0.9702564102564103,
3357
+ "grad_norm": 3.2302651405334473,
3358
+ "learning_rate": 2.532582468677214e-07,
3359
+ "loss": 0.7174,
3360
+ "step": 473
3361
+ },
3362
+ {
3363
+ "epoch": 0.9723076923076923,
3364
+ "grad_norm": 3.1608636379241943,
3365
+ "learning_rate": 2.206401028540639e-07,
3366
+ "loss": 0.6316,
3367
+ "step": 474
3368
+ },
3369
+ {
3370
+ "epoch": 0.9743589743589743,
3371
+ "grad_norm": 3.069204807281494,
3372
+ "learning_rate": 1.9026509541272275e-07,
3373
+ "loss": 0.6748,
3374
+ "step": 475
3375
+ },
3376
+ {
3377
+ "epoch": 0.9764102564102564,
3378
+ "grad_norm": 3.582129955291748,
3379
+ "learning_rate": 1.6213459328950352e-07,
3380
+ "loss": 0.7998,
3381
+ "step": 476
3382
+ },
3383
+ {
3384
+ "epoch": 0.9784615384615385,
3385
+ "grad_norm": 3.0587315559387207,
3386
+ "learning_rate": 1.3624986408924956e-07,
3387
+ "loss": 0.5679,
3388
+ "step": 477
3389
+ },
3390
+ {
3391
+ "epoch": 0.9805128205128205,
3392
+ "grad_norm": 4.51352071762085,
3393
+ "learning_rate": 1.1261207421874309e-07,
3394
+ "loss": 0.7706,
3395
+ "step": 478
3396
+ },
3397
+ {
3398
+ "epoch": 0.9825641025641025,
3399
+ "grad_norm": 3.1249945163726807,
3400
+ "learning_rate": 9.12222888341252e-08,
3401
+ "loss": 0.64,
3402
+ "step": 479
3403
+ },
3404
+ {
3405
+ "epoch": 0.9846153846153847,
3406
+ "grad_norm": 3.75665020942688,
3407
+ "learning_rate": 7.208147179291192e-08,
3408
+ "loss": 0.9884,
3409
+ "step": 480
3410
+ },
3411
+ {
3412
+ "epoch": 0.9866666666666667,
3413
+ "grad_norm": 3.27207088470459,
3414
+ "learning_rate": 5.5190485610534525e-08,
3415
+ "loss": 0.4883,
3416
+ "step": 481
3417
+ },
3418
+ {
3419
+ "epoch": 0.9887179487179487,
3420
+ "grad_norm": 4.946873664855957,
3421
+ "learning_rate": 4.055009142152067e-08,
3422
+ "loss": 0.8809,
3423
+ "step": 482
3424
+ },
3425
+ {
3426
+ "epoch": 0.9907692307692307,
3427
+ "grad_norm": 3.621314287185669,
3428
+ "learning_rate": 2.8160948945138434e-08,
3429
+ "loss": 0.8451,
3430
+ "step": 483
3431
+ },
3432
+ {
3433
+ "epoch": 0.9928205128205129,
3434
+ "grad_norm": 3.5091235637664795,
3435
+ "learning_rate": 1.802361645573125e-08,
3436
+ "loss": 0.7231,
3437
+ "step": 484
3438
+ },
3439
+ {
3440
+ "epoch": 0.9948717948717949,
3441
+ "grad_norm": 3.9179797172546387,
3442
+ "learning_rate": 1.0138550757493592e-08,
3443
+ "loss": 0.6897,
3444
+ "step": 485
3445
+ },
3446
+ {
3447
+ "epoch": 0.9969230769230769,
3448
+ "grad_norm": 5.395089149475098,
3449
+ "learning_rate": 4.506107163948503e-09,
3450
+ "loss": 1.0574,
3451
+ "step": 486
3452
+ },
3453
+ {
3454
+ "epoch": 0.9989743589743589,
3455
+ "grad_norm": 6.939630508422852,
3456
+ "learning_rate": 1.1265394818993358e-09,
3457
+ "loss": 1.0886,
3458
+ "step": 487
3459
+ },
3460
+ {
3461
+ "epoch": 1.001025641025641,
3462
+ "grad_norm": 24.829050064086914,
3463
+ "learning_rate": 0.0,
3464
+ "loss": 3.5024,
3465
+ "step": 488
3466
  }
3467
  ],
3468
  "logging_steps": 1,
 
3486
  "should_evaluate": false,
3487
  "should_log": false,
3488
  "should_save": true,
3489
+ "should_training_stop": true
3490
  },
3491
  "attributes": {}
3492
  }
3493
  },
3494
+ "total_flos": 3.771257911640064e+16,
3495
  "train_batch_size": 8,
3496
  "trial_name": null,
3497
  "trial_params": null