Vishal24 commited on
Commit
d0977bd
·
verified ·
1 Parent(s): 83ce926

Upload checkpoint-387090

Browse files
Files changed (5) hide show
  1. optimizer.pt +1 -1
  2. rng_state.pth +2 -2
  3. scheduler.pt +1 -1
  4. trainer_state.json +1891 -5
  5. training_args.bin +1 -1
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cf389c91b851e668d5e63f6f7c5b36c73e641d8e4b093e49b672a0d77b0c5abe
3
  size 866895354
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da49aa28746ed8097072a44404d04cda00ada0bf98ca46ae77a286f9d797ae58
3
  size 866895354
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fc23f9a9f7aa172955396035c69940d79883c5359b47c42257084c82f32f20ed
3
- size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0ff4e93208fe1783462c5ae204a4cc834ea3fc02e3256b2aae94f996b93a622
3
+ size 14180
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:52acb6d543ff5fdb99c731bea89a5d2499fee1cdf9577497042d302270267fa2
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c53cdb4a080fe565430cda0918d011de3751759b213ac69153c506f7adfadce6
3
  size 1064
trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 20.0,
5
  "eval_steps": 500,
6
- "global_step": 258060,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3779,12 +3779,1898 @@
3779
  "eval_samples_per_second": 764.036,
3780
  "eval_steps_per_second": 11.942,
3781
  "step": 258060
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3782
  }
3783
  ],
3784
  "logging_steps": 500,
3785
- "max_steps": 258060,
3786
  "num_input_tokens_seen": 0,
3787
- "num_train_epochs": 20,
3788
  "save_steps": 500,
3789
  "stateful_callbacks": {
3790
  "TrainerControl": {
@@ -3798,7 +5684,7 @@
3798
  "attributes": {}
3799
  }
3800
  },
3801
- "total_flos": 6.469828059827256e+17,
3802
  "train_batch_size": 64,
3803
  "trial_name": null,
3804
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 30.0,
5
  "eval_steps": 500,
6
+ "global_step": 387090,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3779
  "eval_samples_per_second": 764.036,
3780
  "eval_steps_per_second": 11.942,
3781
  "step": 258060
3782
+ },
3783
+ {
3784
+ "epoch": 20.034100596760442,
3785
+ "grad_norm": 6.319613456726074,
3786
+ "learning_rate": 6.643932935493038e-06,
3787
+ "loss": 2.2464,
3788
+ "step": 258500
3789
+ },
3790
+ {
3791
+ "epoch": 20.07285127489731,
3792
+ "grad_norm": 10.947772026062012,
3793
+ "learning_rate": 6.61809915006846e-06,
3794
+ "loss": 2.2717,
3795
+ "step": 259000
3796
+ },
3797
+ {
3798
+ "epoch": 20.111601953034178,
3799
+ "grad_norm": 6.688451290130615,
3800
+ "learning_rate": 6.592265364643882e-06,
3801
+ "loss": 2.246,
3802
+ "step": 259500
3803
+ },
3804
+ {
3805
+ "epoch": 20.150352631171046,
3806
+ "grad_norm": 7.084783554077148,
3807
+ "learning_rate": 6.566431579219303e-06,
3808
+ "loss": 2.2547,
3809
+ "step": 260000
3810
+ },
3811
+ {
3812
+ "epoch": 20.189103309307914,
3813
+ "grad_norm": 7.182523250579834,
3814
+ "learning_rate": 6.540597793794725e-06,
3815
+ "loss": 2.2673,
3816
+ "step": 260500
3817
+ },
3818
+ {
3819
+ "epoch": 20.22785398744478,
3820
+ "grad_norm": 6.572226524353027,
3821
+ "learning_rate": 6.514764008370147e-06,
3822
+ "loss": 2.2696,
3823
+ "step": 261000
3824
+ },
3825
+ {
3826
+ "epoch": 20.26660466558165,
3827
+ "grad_norm": 6.861509323120117,
3828
+ "learning_rate": 6.488930222945569e-06,
3829
+ "loss": 2.2602,
3830
+ "step": 261500
3831
+ },
3832
+ {
3833
+ "epoch": 20.305355343718514,
3834
+ "grad_norm": 7.068969249725342,
3835
+ "learning_rate": 6.46309643752099e-06,
3836
+ "loss": 2.2736,
3837
+ "step": 262000
3838
+ },
3839
+ {
3840
+ "epoch": 20.34410602185538,
3841
+ "grad_norm": 6.5293660163879395,
3842
+ "learning_rate": 6.4372626520964125e-06,
3843
+ "loss": 2.2698,
3844
+ "step": 262500
3845
+ },
3846
+ {
3847
+ "epoch": 20.38285669999225,
3848
+ "grad_norm": 6.285311698913574,
3849
+ "learning_rate": 6.411428866671834e-06,
3850
+ "loss": 2.2671,
3851
+ "step": 263000
3852
+ },
3853
+ {
3854
+ "epoch": 20.421607378129117,
3855
+ "grad_norm": 6.466723918914795,
3856
+ "learning_rate": 6.3855950812472554e-06,
3857
+ "loss": 2.267,
3858
+ "step": 263500
3859
+ },
3860
+ {
3861
+ "epoch": 20.460358056265985,
3862
+ "grad_norm": 7.045479774475098,
3863
+ "learning_rate": 6.359761295822677e-06,
3864
+ "loss": 2.2499,
3865
+ "step": 264000
3866
+ },
3867
+ {
3868
+ "epoch": 20.499108734402853,
3869
+ "grad_norm": 7.05580472946167,
3870
+ "learning_rate": 6.333927510398099e-06,
3871
+ "loss": 2.2793,
3872
+ "step": 264500
3873
+ },
3874
+ {
3875
+ "epoch": 20.53785941253972,
3876
+ "grad_norm": 7.213685035705566,
3877
+ "learning_rate": 6.308093724973521e-06,
3878
+ "loss": 2.2519,
3879
+ "step": 265000
3880
+ },
3881
+ {
3882
+ "epoch": 20.57661009067659,
3883
+ "grad_norm": 6.6378984451293945,
3884
+ "learning_rate": 6.282259939548942e-06,
3885
+ "loss": 2.2699,
3886
+ "step": 265500
3887
+ },
3888
+ {
3889
+ "epoch": 20.615360768813453,
3890
+ "grad_norm": 6.8442463874816895,
3891
+ "learning_rate": 6.2564261541243645e-06,
3892
+ "loss": 2.2697,
3893
+ "step": 266000
3894
+ },
3895
+ {
3896
+ "epoch": 20.65411144695032,
3897
+ "grad_norm": 7.099138259887695,
3898
+ "learning_rate": 6.230592368699786e-06,
3899
+ "loss": 2.2622,
3900
+ "step": 266500
3901
+ },
3902
+ {
3903
+ "epoch": 20.69286212508719,
3904
+ "grad_norm": 6.572378635406494,
3905
+ "learning_rate": 6.2047585832752074e-06,
3906
+ "loss": 2.2709,
3907
+ "step": 267000
3908
+ },
3909
+ {
3910
+ "epoch": 20.731612803224056,
3911
+ "grad_norm": 6.410079479217529,
3912
+ "learning_rate": 6.17892479785063e-06,
3913
+ "loss": 2.2599,
3914
+ "step": 267500
3915
+ },
3916
+ {
3917
+ "epoch": 20.770363481360924,
3918
+ "grad_norm": 7.154236316680908,
3919
+ "learning_rate": 6.153091012426051e-06,
3920
+ "loss": 2.2654,
3921
+ "step": 268000
3922
+ },
3923
+ {
3924
+ "epoch": 20.809114159497792,
3925
+ "grad_norm": 7.05757999420166,
3926
+ "learning_rate": 6.127257227001473e-06,
3927
+ "loss": 2.2673,
3928
+ "step": 268500
3929
+ },
3930
+ {
3931
+ "epoch": 20.84786483763466,
3932
+ "grad_norm": 7.457660675048828,
3933
+ "learning_rate": 6.101423441576895e-06,
3934
+ "loss": 2.2534,
3935
+ "step": 269000
3936
+ },
3937
+ {
3938
+ "epoch": 20.886615515771528,
3939
+ "grad_norm": 6.697342872619629,
3940
+ "learning_rate": 6.0755896561523165e-06,
3941
+ "loss": 2.2721,
3942
+ "step": 269500
3943
+ },
3944
+ {
3945
+ "epoch": 20.925366193908392,
3946
+ "grad_norm": 6.83280611038208,
3947
+ "learning_rate": 6.049755870727738e-06,
3948
+ "loss": 2.276,
3949
+ "step": 270000
3950
+ },
3951
+ {
3952
+ "epoch": 20.96411687204526,
3953
+ "grad_norm": 6.298649311065674,
3954
+ "learning_rate": 6.02392208530316e-06,
3955
+ "loss": 2.265,
3956
+ "step": 270500
3957
+ },
3958
+ {
3959
+ "epoch": 21.0,
3960
+ "eval_loss": 2.2148919105529785,
3961
+ "eval_runtime": 275.6952,
3962
+ "eval_samples_per_second": 748.889,
3963
+ "eval_steps_per_second": 11.705,
3964
+ "step": 270963
3965
+ },
3966
+ {
3967
+ "epoch": 21.002867550182128,
3968
+ "grad_norm": 6.698548316955566,
3969
+ "learning_rate": 5.998088299878582e-06,
3970
+ "loss": 2.2576,
3971
+ "step": 271000
3972
+ },
3973
+ {
3974
+ "epoch": 21.041618228318995,
3975
+ "grad_norm": 6.784346103668213,
3976
+ "learning_rate": 5.972254514454003e-06,
3977
+ "loss": 2.2398,
3978
+ "step": 271500
3979
+ },
3980
+ {
3981
+ "epoch": 21.080368906455863,
3982
+ "grad_norm": 7.072300910949707,
3983
+ "learning_rate": 5.946420729029425e-06,
3984
+ "loss": 2.2557,
3985
+ "step": 272000
3986
+ },
3987
+ {
3988
+ "epoch": 21.11911958459273,
3989
+ "grad_norm": 6.624369144439697,
3990
+ "learning_rate": 5.920586943604847e-06,
3991
+ "loss": 2.2337,
3992
+ "step": 272500
3993
+ },
3994
+ {
3995
+ "epoch": 21.1578702627296,
3996
+ "grad_norm": 6.317164897918701,
3997
+ "learning_rate": 5.8947531581802685e-06,
3998
+ "loss": 2.2534,
3999
+ "step": 273000
4000
+ },
4001
+ {
4002
+ "epoch": 21.196620940866467,
4003
+ "grad_norm": 6.728669166564941,
4004
+ "learning_rate": 5.86891937275569e-06,
4005
+ "loss": 2.2505,
4006
+ "step": 273500
4007
+ },
4008
+ {
4009
+ "epoch": 21.23537161900333,
4010
+ "grad_norm": 6.596154689788818,
4011
+ "learning_rate": 5.843085587331112e-06,
4012
+ "loss": 2.253,
4013
+ "step": 274000
4014
+ },
4015
+ {
4016
+ "epoch": 21.2741222971402,
4017
+ "grad_norm": 6.471163749694824,
4018
+ "learning_rate": 5.817251801906534e-06,
4019
+ "loss": 2.2556,
4020
+ "step": 274500
4021
+ },
4022
+ {
4023
+ "epoch": 21.312872975277067,
4024
+ "grad_norm": 6.29288911819458,
4025
+ "learning_rate": 5.791418016481955e-06,
4026
+ "loss": 2.2567,
4027
+ "step": 275000
4028
+ },
4029
+ {
4030
+ "epoch": 21.351623653413935,
4031
+ "grad_norm": 7.078927040100098,
4032
+ "learning_rate": 5.7655842310573776e-06,
4033
+ "loss": 2.2294,
4034
+ "step": 275500
4035
+ },
4036
+ {
4037
+ "epoch": 21.390374331550802,
4038
+ "grad_norm": 6.867557525634766,
4039
+ "learning_rate": 5.739750445632799e-06,
4040
+ "loss": 2.2574,
4041
+ "step": 276000
4042
+ },
4043
+ {
4044
+ "epoch": 21.42912500968767,
4045
+ "grad_norm": 6.830238342285156,
4046
+ "learning_rate": 5.7139166602082205e-06,
4047
+ "loss": 2.2794,
4048
+ "step": 276500
4049
+ },
4050
+ {
4051
+ "epoch": 21.467875687824538,
4052
+ "grad_norm": 6.694831371307373,
4053
+ "learning_rate": 5.688082874783643e-06,
4054
+ "loss": 2.253,
4055
+ "step": 277000
4056
+ },
4057
+ {
4058
+ "epoch": 21.506626365961406,
4059
+ "grad_norm": 7.064994812011719,
4060
+ "learning_rate": 5.662249089359064e-06,
4061
+ "loss": 2.2435,
4062
+ "step": 277500
4063
+ },
4064
+ {
4065
+ "epoch": 21.54537704409827,
4066
+ "grad_norm": 6.832572937011719,
4067
+ "learning_rate": 5.636415303934486e-06,
4068
+ "loss": 2.2478,
4069
+ "step": 278000
4070
+ },
4071
+ {
4072
+ "epoch": 21.584127722235138,
4073
+ "grad_norm": 7.045238494873047,
4074
+ "learning_rate": 5.610581518509908e-06,
4075
+ "loss": 2.2434,
4076
+ "step": 278500
4077
+ },
4078
+ {
4079
+ "epoch": 21.622878400372006,
4080
+ "grad_norm": 6.720279216766357,
4081
+ "learning_rate": 5.58474773308533e-06,
4082
+ "loss": 2.238,
4083
+ "step": 279000
4084
+ },
4085
+ {
4086
+ "epoch": 21.661629078508874,
4087
+ "grad_norm": 7.401440143585205,
4088
+ "learning_rate": 5.558913947660751e-06,
4089
+ "loss": 2.2461,
4090
+ "step": 279500
4091
+ },
4092
+ {
4093
+ "epoch": 21.70037975664574,
4094
+ "grad_norm": 6.497147560119629,
4095
+ "learning_rate": 5.5330801622361725e-06,
4096
+ "loss": 2.2339,
4097
+ "step": 280000
4098
+ },
4099
+ {
4100
+ "epoch": 21.73913043478261,
4101
+ "grad_norm": 6.529776096343994,
4102
+ "learning_rate": 5.507246376811595e-06,
4103
+ "loss": 2.2501,
4104
+ "step": 280500
4105
+ },
4106
+ {
4107
+ "epoch": 21.777881112919477,
4108
+ "grad_norm": 6.42600679397583,
4109
+ "learning_rate": 5.481412591387016e-06,
4110
+ "loss": 2.235,
4111
+ "step": 281000
4112
+ },
4113
+ {
4114
+ "epoch": 21.816631791056345,
4115
+ "grad_norm": 6.715229034423828,
4116
+ "learning_rate": 5.455578805962438e-06,
4117
+ "loss": 2.2401,
4118
+ "step": 281500
4119
+ },
4120
+ {
4121
+ "epoch": 21.85538246919321,
4122
+ "grad_norm": 6.575899124145508,
4123
+ "learning_rate": 5.42974502053786e-06,
4124
+ "loss": 2.2576,
4125
+ "step": 282000
4126
+ },
4127
+ {
4128
+ "epoch": 21.894133147330077,
4129
+ "grad_norm": 5.999971866607666,
4130
+ "learning_rate": 5.403911235113282e-06,
4131
+ "loss": 2.2379,
4132
+ "step": 282500
4133
+ },
4134
+ {
4135
+ "epoch": 21.932883825466945,
4136
+ "grad_norm": 6.936278343200684,
4137
+ "learning_rate": 5.378077449688703e-06,
4138
+ "loss": 2.2534,
4139
+ "step": 283000
4140
+ },
4141
+ {
4142
+ "epoch": 21.971634503603813,
4143
+ "grad_norm": 6.040930271148682,
4144
+ "learning_rate": 5.352243664264125e-06,
4145
+ "loss": 2.2391,
4146
+ "step": 283500
4147
+ },
4148
+ {
4149
+ "epoch": 22.0,
4150
+ "eval_loss": 2.1943371295928955,
4151
+ "eval_runtime": 268.1318,
4152
+ "eval_samples_per_second": 770.013,
4153
+ "eval_steps_per_second": 12.035,
4154
+ "step": 283866
4155
+ },
4156
+ {
4157
+ "epoch": 22.01038518174068,
4158
+ "grad_norm": 6.7548747062683105,
4159
+ "learning_rate": 5.326409878839547e-06,
4160
+ "loss": 2.2428,
4161
+ "step": 284000
4162
+ },
4163
+ {
4164
+ "epoch": 22.04913585987755,
4165
+ "grad_norm": 7.0850749015808105,
4166
+ "learning_rate": 5.300576093414968e-06,
4167
+ "loss": 2.2273,
4168
+ "step": 284500
4169
+ },
4170
+ {
4171
+ "epoch": 22.087886538014416,
4172
+ "grad_norm": 6.658077239990234,
4173
+ "learning_rate": 5.274742307990391e-06,
4174
+ "loss": 2.2214,
4175
+ "step": 285000
4176
+ },
4177
+ {
4178
+ "epoch": 22.126637216151284,
4179
+ "grad_norm": 7.19653844833374,
4180
+ "learning_rate": 5.248908522565812e-06,
4181
+ "loss": 2.2273,
4182
+ "step": 285500
4183
+ },
4184
+ {
4185
+ "epoch": 22.16538789428815,
4186
+ "grad_norm": 7.094461441040039,
4187
+ "learning_rate": 5.223074737141234e-06,
4188
+ "loss": 2.2359,
4189
+ "step": 286000
4190
+ },
4191
+ {
4192
+ "epoch": 22.204138572425016,
4193
+ "grad_norm": 7.156402587890625,
4194
+ "learning_rate": 5.197240951716656e-06,
4195
+ "loss": 2.1969,
4196
+ "step": 286500
4197
+ },
4198
+ {
4199
+ "epoch": 22.242889250561884,
4200
+ "grad_norm": 6.595995903015137,
4201
+ "learning_rate": 5.171407166292077e-06,
4202
+ "loss": 2.2223,
4203
+ "step": 287000
4204
+ },
4205
+ {
4206
+ "epoch": 22.281639928698752,
4207
+ "grad_norm": 7.04496955871582,
4208
+ "learning_rate": 5.145573380867499e-06,
4209
+ "loss": 2.2343,
4210
+ "step": 287500
4211
+ },
4212
+ {
4213
+ "epoch": 22.32039060683562,
4214
+ "grad_norm": 7.146208763122559,
4215
+ "learning_rate": 5.11973959544292e-06,
4216
+ "loss": 2.2338,
4217
+ "step": 288000
4218
+ },
4219
+ {
4220
+ "epoch": 22.359141284972488,
4221
+ "grad_norm": 6.4659576416015625,
4222
+ "learning_rate": 5.093905810018343e-06,
4223
+ "loss": 2.2273,
4224
+ "step": 288500
4225
+ },
4226
+ {
4227
+ "epoch": 22.397891963109355,
4228
+ "grad_norm": 6.372287750244141,
4229
+ "learning_rate": 5.068072024593764e-06,
4230
+ "loss": 2.2247,
4231
+ "step": 289000
4232
+ },
4233
+ {
4234
+ "epoch": 22.436642641246223,
4235
+ "grad_norm": 7.088085174560547,
4236
+ "learning_rate": 5.042238239169186e-06,
4237
+ "loss": 2.2474,
4238
+ "step": 289500
4239
+ },
4240
+ {
4241
+ "epoch": 22.475393319383087,
4242
+ "grad_norm": 6.911520004272461,
4243
+ "learning_rate": 5.016404453744608e-06,
4244
+ "loss": 2.2356,
4245
+ "step": 290000
4246
+ },
4247
+ {
4248
+ "epoch": 22.514143997519955,
4249
+ "grad_norm": 7.5756611824035645,
4250
+ "learning_rate": 4.990570668320029e-06,
4251
+ "loss": 2.2297,
4252
+ "step": 290500
4253
+ },
4254
+ {
4255
+ "epoch": 22.552894675656823,
4256
+ "grad_norm": 6.587701320648193,
4257
+ "learning_rate": 4.964736882895451e-06,
4258
+ "loss": 2.2245,
4259
+ "step": 291000
4260
+ },
4261
+ {
4262
+ "epoch": 22.59164535379369,
4263
+ "grad_norm": 5.8870849609375,
4264
+ "learning_rate": 4.938903097470873e-06,
4265
+ "loss": 2.229,
4266
+ "step": 291500
4267
+ },
4268
+ {
4269
+ "epoch": 22.63039603193056,
4270
+ "grad_norm": 6.882173538208008,
4271
+ "learning_rate": 4.913069312046295e-06,
4272
+ "loss": 2.2254,
4273
+ "step": 292000
4274
+ },
4275
+ {
4276
+ "epoch": 22.669146710067427,
4277
+ "grad_norm": 6.710127830505371,
4278
+ "learning_rate": 4.887235526621716e-06,
4279
+ "loss": 2.223,
4280
+ "step": 292500
4281
+ },
4282
+ {
4283
+ "epoch": 22.707897388204294,
4284
+ "grad_norm": 6.753304481506348,
4285
+ "learning_rate": 4.8614017411971385e-06,
4286
+ "loss": 2.2311,
4287
+ "step": 293000
4288
+ },
4289
+ {
4290
+ "epoch": 22.746648066341162,
4291
+ "grad_norm": 6.02184534072876,
4292
+ "learning_rate": 4.83556795577256e-06,
4293
+ "loss": 2.2198,
4294
+ "step": 293500
4295
+ },
4296
+ {
4297
+ "epoch": 22.78539874447803,
4298
+ "grad_norm": 7.022054195404053,
4299
+ "learning_rate": 4.809734170347981e-06,
4300
+ "loss": 2.2275,
4301
+ "step": 294000
4302
+ },
4303
+ {
4304
+ "epoch": 22.824149422614894,
4305
+ "grad_norm": 7.951735019683838,
4306
+ "learning_rate": 4.783900384923404e-06,
4307
+ "loss": 2.2314,
4308
+ "step": 294500
4309
+ },
4310
+ {
4311
+ "epoch": 22.862900100751762,
4312
+ "grad_norm": 5.854333877563477,
4313
+ "learning_rate": 4.758066599498825e-06,
4314
+ "loss": 2.2172,
4315
+ "step": 295000
4316
+ },
4317
+ {
4318
+ "epoch": 22.90165077888863,
4319
+ "grad_norm": 6.547132968902588,
4320
+ "learning_rate": 4.732232814074247e-06,
4321
+ "loss": 2.226,
4322
+ "step": 295500
4323
+ },
4324
+ {
4325
+ "epoch": 22.940401457025498,
4326
+ "grad_norm": 6.535789966583252,
4327
+ "learning_rate": 4.706399028649668e-06,
4328
+ "loss": 2.2299,
4329
+ "step": 296000
4330
+ },
4331
+ {
4332
+ "epoch": 22.979152135162366,
4333
+ "grad_norm": 7.285912036895752,
4334
+ "learning_rate": 4.6805652432250905e-06,
4335
+ "loss": 2.2239,
4336
+ "step": 296500
4337
+ },
4338
+ {
4339
+ "epoch": 23.0,
4340
+ "eval_loss": 2.185373067855835,
4341
+ "eval_runtime": 265.4806,
4342
+ "eval_samples_per_second": 777.703,
4343
+ "eval_steps_per_second": 12.155,
4344
+ "step": 296769
4345
+ },
4346
+ {
4347
+ "epoch": 23.017902813299234,
4348
+ "grad_norm": 6.972716808319092,
4349
+ "learning_rate": 4.654731457800512e-06,
4350
+ "loss": 2.2193,
4351
+ "step": 297000
4352
+ },
4353
+ {
4354
+ "epoch": 23.0566534914361,
4355
+ "grad_norm": 6.841848373413086,
4356
+ "learning_rate": 4.628897672375933e-06,
4357
+ "loss": 2.2139,
4358
+ "step": 297500
4359
+ },
4360
+ {
4361
+ "epoch": 23.09540416957297,
4362
+ "grad_norm": 6.285813331604004,
4363
+ "learning_rate": 4.603063886951356e-06,
4364
+ "loss": 2.2092,
4365
+ "step": 298000
4366
+ },
4367
+ {
4368
+ "epoch": 23.134154847709834,
4369
+ "grad_norm": 6.615530967712402,
4370
+ "learning_rate": 4.577230101526777e-06,
4371
+ "loss": 2.2141,
4372
+ "step": 298500
4373
+ },
4374
+ {
4375
+ "epoch": 23.1729055258467,
4376
+ "grad_norm": 6.762087821960449,
4377
+ "learning_rate": 4.551396316102199e-06,
4378
+ "loss": 2.1944,
4379
+ "step": 299000
4380
+ },
4381
+ {
4382
+ "epoch": 23.21165620398357,
4383
+ "grad_norm": 7.053805351257324,
4384
+ "learning_rate": 4.525562530677621e-06,
4385
+ "loss": 2.2129,
4386
+ "step": 299500
4387
+ },
4388
+ {
4389
+ "epoch": 23.250406882120437,
4390
+ "grad_norm": 7.14516544342041,
4391
+ "learning_rate": 4.4997287452530425e-06,
4392
+ "loss": 2.2038,
4393
+ "step": 300000
4394
+ },
4395
+ {
4396
+ "epoch": 23.289157560257305,
4397
+ "grad_norm": 6.8478803634643555,
4398
+ "learning_rate": 4.473894959828464e-06,
4399
+ "loss": 2.2166,
4400
+ "step": 300500
4401
+ },
4402
+ {
4403
+ "epoch": 23.327908238394173,
4404
+ "grad_norm": 6.808053970336914,
4405
+ "learning_rate": 4.448061174403886e-06,
4406
+ "loss": 2.224,
4407
+ "step": 301000
4408
+ },
4409
+ {
4410
+ "epoch": 23.36665891653104,
4411
+ "grad_norm": 7.149857521057129,
4412
+ "learning_rate": 4.422227388979308e-06,
4413
+ "loss": 2.2081,
4414
+ "step": 301500
4415
+ },
4416
+ {
4417
+ "epoch": 23.40540959466791,
4418
+ "grad_norm": 6.334920406341553,
4419
+ "learning_rate": 4.396393603554729e-06,
4420
+ "loss": 2.2217,
4421
+ "step": 302000
4422
+ },
4423
+ {
4424
+ "epoch": 23.444160272804773,
4425
+ "grad_norm": 7.154323577880859,
4426
+ "learning_rate": 4.3705598181301515e-06,
4427
+ "loss": 2.2129,
4428
+ "step": 302500
4429
+ },
4430
+ {
4431
+ "epoch": 23.48291095094164,
4432
+ "grad_norm": 7.202456474304199,
4433
+ "learning_rate": 4.344726032705573e-06,
4434
+ "loss": 2.2019,
4435
+ "step": 303000
4436
+ },
4437
+ {
4438
+ "epoch": 23.52166162907851,
4439
+ "grad_norm": 6.832441806793213,
4440
+ "learning_rate": 4.3188922472809945e-06,
4441
+ "loss": 2.214,
4442
+ "step": 303500
4443
+ },
4444
+ {
4445
+ "epoch": 23.560412307215376,
4446
+ "grad_norm": 6.258272647857666,
4447
+ "learning_rate": 4.293058461856417e-06,
4448
+ "loss": 2.21,
4449
+ "step": 304000
4450
+ },
4451
+ {
4452
+ "epoch": 23.599162985352244,
4453
+ "grad_norm": 6.8391194343566895,
4454
+ "learning_rate": 4.267224676431838e-06,
4455
+ "loss": 2.2106,
4456
+ "step": 304500
4457
+ },
4458
+ {
4459
+ "epoch": 23.637913663489112,
4460
+ "grad_norm": 6.621433734893799,
4461
+ "learning_rate": 4.24139089100726e-06,
4462
+ "loss": 2.2219,
4463
+ "step": 305000
4464
+ },
4465
+ {
4466
+ "epoch": 23.67666434162598,
4467
+ "grad_norm": 6.718801498413086,
4468
+ "learning_rate": 4.215557105582681e-06,
4469
+ "loss": 2.2215,
4470
+ "step": 305500
4471
+ },
4472
+ {
4473
+ "epoch": 23.715415019762847,
4474
+ "grad_norm": 7.0543622970581055,
4475
+ "learning_rate": 4.1897233201581036e-06,
4476
+ "loss": 2.2182,
4477
+ "step": 306000
4478
+ },
4479
+ {
4480
+ "epoch": 23.75416569789971,
4481
+ "grad_norm": 7.598169326782227,
4482
+ "learning_rate": 4.163889534733525e-06,
4483
+ "loss": 2.2218,
4484
+ "step": 306500
4485
+ },
4486
+ {
4487
+ "epoch": 23.79291637603658,
4488
+ "grad_norm": 6.874271392822266,
4489
+ "learning_rate": 4.1380557493089465e-06,
4490
+ "loss": 2.2061,
4491
+ "step": 307000
4492
+ },
4493
+ {
4494
+ "epoch": 23.831667054173447,
4495
+ "grad_norm": 6.820863723754883,
4496
+ "learning_rate": 4.112221963884369e-06,
4497
+ "loss": 2.2166,
4498
+ "step": 307500
4499
+ },
4500
+ {
4501
+ "epoch": 23.870417732310315,
4502
+ "grad_norm": 7.149729251861572,
4503
+ "learning_rate": 4.08638817845979e-06,
4504
+ "loss": 2.2089,
4505
+ "step": 308000
4506
+ },
4507
+ {
4508
+ "epoch": 23.909168410447183,
4509
+ "grad_norm": 6.278995990753174,
4510
+ "learning_rate": 4.060554393035212e-06,
4511
+ "loss": 2.2163,
4512
+ "step": 308500
4513
+ },
4514
+ {
4515
+ "epoch": 23.94791908858405,
4516
+ "grad_norm": 7.162642002105713,
4517
+ "learning_rate": 4.034720607610634e-06,
4518
+ "loss": 2.2222,
4519
+ "step": 309000
4520
+ },
4521
+ {
4522
+ "epoch": 23.98666976672092,
4523
+ "grad_norm": 6.67965841293335,
4524
+ "learning_rate": 4.0088868221860556e-06,
4525
+ "loss": 2.1965,
4526
+ "step": 309500
4527
+ },
4528
+ {
4529
+ "epoch": 24.0,
4530
+ "eval_loss": 2.1702771186828613,
4531
+ "eval_runtime": 264.5615,
4532
+ "eval_samples_per_second": 780.405,
4533
+ "eval_steps_per_second": 12.198,
4534
+ "step": 309672
4535
+ },
4536
+ {
4537
+ "epoch": 24.025420444857787,
4538
+ "grad_norm": 6.355005264282227,
4539
+ "learning_rate": 3.983053036761477e-06,
4540
+ "loss": 2.1935,
4541
+ "step": 310000
4542
+ },
4543
+ {
4544
+ "epoch": 24.06417112299465,
4545
+ "grad_norm": 6.339087963104248,
4546
+ "learning_rate": 3.957219251336899e-06,
4547
+ "loss": 2.2022,
4548
+ "step": 310500
4549
+ },
4550
+ {
4551
+ "epoch": 24.10292180113152,
4552
+ "grad_norm": 6.386953353881836,
4553
+ "learning_rate": 3.931385465912321e-06,
4554
+ "loss": 2.1941,
4555
+ "step": 311000
4556
+ },
4557
+ {
4558
+ "epoch": 24.141672479268387,
4559
+ "grad_norm": 6.9508376121521,
4560
+ "learning_rate": 3.905551680487742e-06,
4561
+ "loss": 2.1991,
4562
+ "step": 311500
4563
+ },
4564
+ {
4565
+ "epoch": 24.180423157405254,
4566
+ "grad_norm": 7.1515727043151855,
4567
+ "learning_rate": 3.879717895063164e-06,
4568
+ "loss": 2.2118,
4569
+ "step": 312000
4570
+ },
4571
+ {
4572
+ "epoch": 24.219173835542122,
4573
+ "grad_norm": 6.807953357696533,
4574
+ "learning_rate": 3.853884109638585e-06,
4575
+ "loss": 2.2158,
4576
+ "step": 312500
4577
+ },
4578
+ {
4579
+ "epoch": 24.25792451367899,
4580
+ "grad_norm": 7.41762638092041,
4581
+ "learning_rate": 3.828050324214007e-06,
4582
+ "loss": 2.1948,
4583
+ "step": 313000
4584
+ },
4585
+ {
4586
+ "epoch": 24.296675191815858,
4587
+ "grad_norm": 7.462344646453857,
4588
+ "learning_rate": 3.802216538789429e-06,
4589
+ "loss": 2.2061,
4590
+ "step": 313500
4591
+ },
4592
+ {
4593
+ "epoch": 24.335425869952726,
4594
+ "grad_norm": 6.6912384033203125,
4595
+ "learning_rate": 3.7763827533648505e-06,
4596
+ "loss": 2.1932,
4597
+ "step": 314000
4598
+ },
4599
+ {
4600
+ "epoch": 24.37417654808959,
4601
+ "grad_norm": 6.79492712020874,
4602
+ "learning_rate": 3.7505489679402724e-06,
4603
+ "loss": 2.193,
4604
+ "step": 314500
4605
+ },
4606
+ {
4607
+ "epoch": 24.412927226226458,
4608
+ "grad_norm": 6.873208522796631,
4609
+ "learning_rate": 3.724715182515694e-06,
4610
+ "loss": 2.1756,
4611
+ "step": 315000
4612
+ },
4613
+ {
4614
+ "epoch": 24.451677904363326,
4615
+ "grad_norm": 6.520395278930664,
4616
+ "learning_rate": 3.6988813970911158e-06,
4617
+ "loss": 2.2019,
4618
+ "step": 315500
4619
+ },
4620
+ {
4621
+ "epoch": 24.490428582500193,
4622
+ "grad_norm": 7.425100326538086,
4623
+ "learning_rate": 3.6730476116665377e-06,
4624
+ "loss": 2.1933,
4625
+ "step": 316000
4626
+ },
4627
+ {
4628
+ "epoch": 24.52917926063706,
4629
+ "grad_norm": 6.990531921386719,
4630
+ "learning_rate": 3.647213826241959e-06,
4631
+ "loss": 2.1953,
4632
+ "step": 316500
4633
+ },
4634
+ {
4635
+ "epoch": 24.56792993877393,
4636
+ "grad_norm": 6.99529504776001,
4637
+ "learning_rate": 3.621380040817381e-06,
4638
+ "loss": 2.1668,
4639
+ "step": 317000
4640
+ },
4641
+ {
4642
+ "epoch": 24.606680616910797,
4643
+ "grad_norm": 7.046565532684326,
4644
+ "learning_rate": 3.595546255392803e-06,
4645
+ "loss": 2.2185,
4646
+ "step": 317500
4647
+ },
4648
+ {
4649
+ "epoch": 24.645431295047665,
4650
+ "grad_norm": 7.261152744293213,
4651
+ "learning_rate": 3.5697124699682244e-06,
4652
+ "loss": 2.1776,
4653
+ "step": 318000
4654
+ },
4655
+ {
4656
+ "epoch": 24.68418197318453,
4657
+ "grad_norm": 7.088150978088379,
4658
+ "learning_rate": 3.5438786845436463e-06,
4659
+ "loss": 2.1939,
4660
+ "step": 318500
4661
+ },
4662
+ {
4663
+ "epoch": 24.722932651321397,
4664
+ "grad_norm": 7.677366733551025,
4665
+ "learning_rate": 3.518044899119068e-06,
4666
+ "loss": 2.1916,
4667
+ "step": 319000
4668
+ },
4669
+ {
4670
+ "epoch": 24.761683329458265,
4671
+ "grad_norm": 7.108632564544678,
4672
+ "learning_rate": 3.4922111136944897e-06,
4673
+ "loss": 2.1851,
4674
+ "step": 319500
4675
+ },
4676
+ {
4677
+ "epoch": 24.800434007595133,
4678
+ "grad_norm": 7.283915996551514,
4679
+ "learning_rate": 3.4663773282699116e-06,
4680
+ "loss": 2.2015,
4681
+ "step": 320000
4682
+ },
4683
+ {
4684
+ "epoch": 24.839184685732,
4685
+ "grad_norm": 7.392533779144287,
4686
+ "learning_rate": 3.440543542845333e-06,
4687
+ "loss": 2.1915,
4688
+ "step": 320500
4689
+ },
4690
+ {
4691
+ "epoch": 24.87793536386887,
4692
+ "grad_norm": 6.849175453186035,
4693
+ "learning_rate": 3.414709757420755e-06,
4694
+ "loss": 2.1931,
4695
+ "step": 321000
4696
+ },
4697
+ {
4698
+ "epoch": 24.916686042005736,
4699
+ "grad_norm": 6.42083740234375,
4700
+ "learning_rate": 3.388875971996177e-06,
4701
+ "loss": 2.198,
4702
+ "step": 321500
4703
+ },
4704
+ {
4705
+ "epoch": 24.955436720142604,
4706
+ "grad_norm": 6.040030002593994,
4707
+ "learning_rate": 3.3630421865715983e-06,
4708
+ "loss": 2.1802,
4709
+ "step": 322000
4710
+ },
4711
+ {
4712
+ "epoch": 24.99418739827947,
4713
+ "grad_norm": 7.585995674133301,
4714
+ "learning_rate": 3.3372084011470202e-06,
4715
+ "loss": 2.1946,
4716
+ "step": 322500
4717
+ },
4718
+ {
4719
+ "epoch": 25.0,
4720
+ "eval_loss": 2.159193992614746,
4721
+ "eval_runtime": 270.695,
4722
+ "eval_samples_per_second": 762.722,
4723
+ "eval_steps_per_second": 11.921,
4724
+ "step": 322575
4725
+ },
4726
+ {
4727
+ "epoch": 25.032938076416336,
4728
+ "grad_norm": 7.309504985809326,
4729
+ "learning_rate": 3.3113746157224417e-06,
4730
+ "loss": 2.1854,
4731
+ "step": 323000
4732
+ },
4733
+ {
4734
+ "epoch": 25.071688754553204,
4735
+ "grad_norm": 6.450008869171143,
4736
+ "learning_rate": 3.2855408302978636e-06,
4737
+ "loss": 2.1827,
4738
+ "step": 323500
4739
+ },
4740
+ {
4741
+ "epoch": 25.11043943269007,
4742
+ "grad_norm": 6.82379674911499,
4743
+ "learning_rate": 3.2597070448732855e-06,
4744
+ "loss": 2.1838,
4745
+ "step": 324000
4746
+ },
4747
+ {
4748
+ "epoch": 25.14919011082694,
4749
+ "grad_norm": 7.034087657928467,
4750
+ "learning_rate": 3.233873259448707e-06,
4751
+ "loss": 2.1618,
4752
+ "step": 324500
4753
+ },
4754
+ {
4755
+ "epoch": 25.187940788963807,
4756
+ "grad_norm": 7.005911827087402,
4757
+ "learning_rate": 3.208039474024129e-06,
4758
+ "loss": 2.1912,
4759
+ "step": 325000
4760
+ },
4761
+ {
4762
+ "epoch": 25.226691467100675,
4763
+ "grad_norm": 6.7085394859313965,
4764
+ "learning_rate": 3.1822056885995508e-06,
4765
+ "loss": 2.1795,
4766
+ "step": 325500
4767
+ },
4768
+ {
4769
+ "epoch": 25.265442145237543,
4770
+ "grad_norm": 6.773245334625244,
4771
+ "learning_rate": 3.1563719031749722e-06,
4772
+ "loss": 2.1965,
4773
+ "step": 326000
4774
+ },
4775
+ {
4776
+ "epoch": 25.304192823374407,
4777
+ "grad_norm": 6.718632698059082,
4778
+ "learning_rate": 3.130538117750394e-06,
4779
+ "loss": 2.1976,
4780
+ "step": 326500
4781
+ },
4782
+ {
4783
+ "epoch": 25.342943501511275,
4784
+ "grad_norm": 8.191710472106934,
4785
+ "learning_rate": 3.1047043323258156e-06,
4786
+ "loss": 2.1762,
4787
+ "step": 327000
4788
+ },
4789
+ {
4790
+ "epoch": 25.381694179648143,
4791
+ "grad_norm": 7.172983169555664,
4792
+ "learning_rate": 3.0788705469012375e-06,
4793
+ "loss": 2.1703,
4794
+ "step": 327500
4795
+ },
4796
+ {
4797
+ "epoch": 25.42044485778501,
4798
+ "grad_norm": 6.283721446990967,
4799
+ "learning_rate": 3.0530367614766594e-06,
4800
+ "loss": 2.1692,
4801
+ "step": 328000
4802
+ },
4803
+ {
4804
+ "epoch": 25.45919553592188,
4805
+ "grad_norm": 6.850103855133057,
4806
+ "learning_rate": 3.027202976052081e-06,
4807
+ "loss": 2.1914,
4808
+ "step": 328500
4809
+ },
4810
+ {
4811
+ "epoch": 25.497946214058747,
4812
+ "grad_norm": 6.31437873840332,
4813
+ "learning_rate": 3.0013691906275028e-06,
4814
+ "loss": 2.1692,
4815
+ "step": 329000
4816
+ },
4817
+ {
4818
+ "epoch": 25.536696892195614,
4819
+ "grad_norm": 6.947432994842529,
4820
+ "learning_rate": 2.9755354052029247e-06,
4821
+ "loss": 2.1848,
4822
+ "step": 329500
4823
+ },
4824
+ {
4825
+ "epoch": 25.575447570332482,
4826
+ "grad_norm": 6.133412837982178,
4827
+ "learning_rate": 2.949701619778346e-06,
4828
+ "loss": 2.1827,
4829
+ "step": 330000
4830
+ },
4831
+ {
4832
+ "epoch": 25.61419824846935,
4833
+ "grad_norm": 7.019827365875244,
4834
+ "learning_rate": 2.923867834353768e-06,
4835
+ "loss": 2.1654,
4836
+ "step": 330500
4837
+ },
4838
+ {
4839
+ "epoch": 25.652948926606214,
4840
+ "grad_norm": 7.326742172241211,
4841
+ "learning_rate": 2.8980340489291895e-06,
4842
+ "loss": 2.1929,
4843
+ "step": 331000
4844
+ },
4845
+ {
4846
+ "epoch": 25.691699604743082,
4847
+ "grad_norm": 7.231571674346924,
4848
+ "learning_rate": 2.8722002635046114e-06,
4849
+ "loss": 2.1913,
4850
+ "step": 331500
4851
+ },
4852
+ {
4853
+ "epoch": 25.73045028287995,
4854
+ "grad_norm": 7.050189971923828,
4855
+ "learning_rate": 2.8463664780800333e-06,
4856
+ "loss": 2.191,
4857
+ "step": 332000
4858
+ },
4859
+ {
4860
+ "epoch": 25.769200961016818,
4861
+ "grad_norm": 6.654092311859131,
4862
+ "learning_rate": 2.8205326926554548e-06,
4863
+ "loss": 2.1871,
4864
+ "step": 332500
4865
+ },
4866
+ {
4867
+ "epoch": 25.807951639153686,
4868
+ "grad_norm": 7.114500522613525,
4869
+ "learning_rate": 2.7946989072308767e-06,
4870
+ "loss": 2.1842,
4871
+ "step": 333000
4872
+ },
4873
+ {
4874
+ "epoch": 25.846702317290553,
4875
+ "grad_norm": 6.987917900085449,
4876
+ "learning_rate": 2.7688651218062986e-06,
4877
+ "loss": 2.1782,
4878
+ "step": 333500
4879
+ },
4880
+ {
4881
+ "epoch": 25.88545299542742,
4882
+ "grad_norm": 6.479386806488037,
4883
+ "learning_rate": 2.74303133638172e-06,
4884
+ "loss": 2.1904,
4885
+ "step": 334000
4886
+ },
4887
+ {
4888
+ "epoch": 25.924203673564286,
4889
+ "grad_norm": 6.597611904144287,
4890
+ "learning_rate": 2.717197550957142e-06,
4891
+ "loss": 2.1782,
4892
+ "step": 334500
4893
+ },
4894
+ {
4895
+ "epoch": 25.962954351701153,
4896
+ "grad_norm": 7.492031097412109,
4897
+ "learning_rate": 2.6913637655325634e-06,
4898
+ "loss": 2.1976,
4899
+ "step": 335000
4900
+ },
4901
+ {
4902
+ "epoch": 26.0,
4903
+ "eval_loss": 2.144183874130249,
4904
+ "eval_runtime": 268.6578,
4905
+ "eval_samples_per_second": 768.505,
4906
+ "eval_steps_per_second": 12.012,
4907
+ "step": 335478
4908
+ },
4909
+ {
4910
+ "epoch": 26.00170502983802,
4911
+ "grad_norm": 7.5874552726745605,
4912
+ "learning_rate": 2.6655299801079853e-06,
4913
+ "loss": 2.1755,
4914
+ "step": 335500
4915
+ },
4916
+ {
4917
+ "epoch": 26.04045570797489,
4918
+ "grad_norm": 7.499856948852539,
4919
+ "learning_rate": 2.6396961946834072e-06,
4920
+ "loss": 2.1885,
4921
+ "step": 336000
4922
+ },
4923
+ {
4924
+ "epoch": 26.079206386111757,
4925
+ "grad_norm": 7.2821946144104,
4926
+ "learning_rate": 2.6138624092588287e-06,
4927
+ "loss": 2.1782,
4928
+ "step": 336500
4929
+ },
4930
+ {
4931
+ "epoch": 26.117957064248625,
4932
+ "grad_norm": 7.0137834548950195,
4933
+ "learning_rate": 2.5880286238342506e-06,
4934
+ "loss": 2.1688,
4935
+ "step": 337000
4936
+ },
4937
+ {
4938
+ "epoch": 26.156707742385493,
4939
+ "grad_norm": 6.468008518218994,
4940
+ "learning_rate": 2.5621948384096725e-06,
4941
+ "loss": 2.1735,
4942
+ "step": 337500
4943
+ },
4944
+ {
4945
+ "epoch": 26.19545842052236,
4946
+ "grad_norm": 6.922983169555664,
4947
+ "learning_rate": 2.536361052985094e-06,
4948
+ "loss": 2.1643,
4949
+ "step": 338000
4950
+ },
4951
+ {
4952
+ "epoch": 26.23420909865923,
4953
+ "grad_norm": 6.963326454162598,
4954
+ "learning_rate": 2.510527267560516e-06,
4955
+ "loss": 2.1569,
4956
+ "step": 338500
4957
+ },
4958
+ {
4959
+ "epoch": 26.272959776796093,
4960
+ "grad_norm": 6.4791579246521,
4961
+ "learning_rate": 2.4846934821359373e-06,
4962
+ "loss": 2.1816,
4963
+ "step": 339000
4964
+ },
4965
+ {
4966
+ "epoch": 26.31171045493296,
4967
+ "grad_norm": 7.289137840270996,
4968
+ "learning_rate": 2.4588596967113592e-06,
4969
+ "loss": 2.1628,
4970
+ "step": 339500
4971
+ },
4972
+ {
4973
+ "epoch": 26.350461133069828,
4974
+ "grad_norm": 7.020922660827637,
4975
+ "learning_rate": 2.433025911286781e-06,
4976
+ "loss": 2.1608,
4977
+ "step": 340000
4978
+ },
4979
+ {
4980
+ "epoch": 26.389211811206696,
4981
+ "grad_norm": 6.522220134735107,
4982
+ "learning_rate": 2.4071921258622026e-06,
4983
+ "loss": 2.1736,
4984
+ "step": 340500
4985
+ },
4986
+ {
4987
+ "epoch": 26.427962489343564,
4988
+ "grad_norm": 7.149320602416992,
4989
+ "learning_rate": 2.3813583404376245e-06,
4990
+ "loss": 2.1761,
4991
+ "step": 341000
4992
+ },
4993
+ {
4994
+ "epoch": 26.46671316748043,
4995
+ "grad_norm": 7.04742431640625,
4996
+ "learning_rate": 2.3555245550130464e-06,
4997
+ "loss": 2.168,
4998
+ "step": 341500
4999
+ },
5000
+ {
5001
+ "epoch": 26.5054638456173,
5002
+ "grad_norm": 7.135145664215088,
5003
+ "learning_rate": 2.329690769588468e-06,
5004
+ "loss": 2.1928,
5005
+ "step": 342000
5006
+ },
5007
+ {
5008
+ "epoch": 26.544214523754167,
5009
+ "grad_norm": 7.492802619934082,
5010
+ "learning_rate": 2.3038569841638898e-06,
5011
+ "loss": 2.1764,
5012
+ "step": 342500
5013
+ },
5014
+ {
5015
+ "epoch": 26.58296520189103,
5016
+ "grad_norm": 6.618491172790527,
5017
+ "learning_rate": 2.2780231987393112e-06,
5018
+ "loss": 2.1768,
5019
+ "step": 343000
5020
+ },
5021
+ {
5022
+ "epoch": 26.6217158800279,
5023
+ "grad_norm": 6.808167457580566,
5024
+ "learning_rate": 2.252189413314733e-06,
5025
+ "loss": 2.1623,
5026
+ "step": 343500
5027
+ },
5028
+ {
5029
+ "epoch": 26.660466558164767,
5030
+ "grad_norm": 6.65431022644043,
5031
+ "learning_rate": 2.226355627890155e-06,
5032
+ "loss": 2.1658,
5033
+ "step": 344000
5034
+ },
5035
+ {
5036
+ "epoch": 26.699217236301635,
5037
+ "grad_norm": 7.762594699859619,
5038
+ "learning_rate": 2.2005218424655765e-06,
5039
+ "loss": 2.1794,
5040
+ "step": 344500
5041
+ },
5042
+ {
5043
+ "epoch": 26.737967914438503,
5044
+ "grad_norm": 6.6927056312561035,
5045
+ "learning_rate": 2.1746880570409984e-06,
5046
+ "loss": 2.1624,
5047
+ "step": 345000
5048
+ },
5049
+ {
5050
+ "epoch": 26.77671859257537,
5051
+ "grad_norm": 6.606927394866943,
5052
+ "learning_rate": 2.1488542716164203e-06,
5053
+ "loss": 2.1741,
5054
+ "step": 345500
5055
+ },
5056
+ {
5057
+ "epoch": 26.81546927071224,
5058
+ "grad_norm": 6.104671955108643,
5059
+ "learning_rate": 2.1230204861918418e-06,
5060
+ "loss": 2.1716,
5061
+ "step": 346000
5062
+ },
5063
+ {
5064
+ "epoch": 26.854219948849106,
5065
+ "grad_norm": 5.965663433074951,
5066
+ "learning_rate": 2.0971867007672637e-06,
5067
+ "loss": 2.1674,
5068
+ "step": 346500
5069
+ },
5070
+ {
5071
+ "epoch": 26.89297062698597,
5072
+ "grad_norm": 6.041355133056641,
5073
+ "learning_rate": 2.071352915342685e-06,
5074
+ "loss": 2.181,
5075
+ "step": 347000
5076
+ },
5077
+ {
5078
+ "epoch": 26.93172130512284,
5079
+ "grad_norm": 7.279519557952881,
5080
+ "learning_rate": 2.045519129918107e-06,
5081
+ "loss": 2.1661,
5082
+ "step": 347500
5083
+ },
5084
+ {
5085
+ "epoch": 26.970471983259706,
5086
+ "grad_norm": 6.790727615356445,
5087
+ "learning_rate": 2.019685344493529e-06,
5088
+ "loss": 2.1658,
5089
+ "step": 348000
5090
+ },
5091
+ {
5092
+ "epoch": 27.0,
5093
+ "eval_loss": 2.137254238128662,
5094
+ "eval_runtime": 268.8992,
5095
+ "eval_samples_per_second": 767.815,
5096
+ "eval_steps_per_second": 12.001,
5097
+ "step": 348381
5098
+ },
5099
+ {
5100
+ "epoch": 27.009222661396574,
5101
+ "grad_norm": 6.905515193939209,
5102
+ "learning_rate": 1.9938515590689504e-06,
5103
+ "loss": 2.1569,
5104
+ "step": 348500
5105
+ },
5106
+ {
5107
+ "epoch": 27.047973339533442,
5108
+ "grad_norm": 6.515853404998779,
5109
+ "learning_rate": 1.9680177736443723e-06,
5110
+ "loss": 2.1713,
5111
+ "step": 349000
5112
+ },
5113
+ {
5114
+ "epoch": 27.08672401767031,
5115
+ "grad_norm": 6.981870651245117,
5116
+ "learning_rate": 1.942183988219794e-06,
5117
+ "loss": 2.1745,
5118
+ "step": 349500
5119
+ },
5120
+ {
5121
+ "epoch": 27.125474695807178,
5122
+ "grad_norm": 6.35358190536499,
5123
+ "learning_rate": 1.9163502027952157e-06,
5124
+ "loss": 2.1644,
5125
+ "step": 350000
5126
+ },
5127
+ {
5128
+ "epoch": 27.164225373944046,
5129
+ "grad_norm": 7.149428844451904,
5130
+ "learning_rate": 1.8905164173706376e-06,
5131
+ "loss": 2.1816,
5132
+ "step": 350500
5133
+ },
5134
+ {
5135
+ "epoch": 27.20297605208091,
5136
+ "grad_norm": 7.136536121368408,
5137
+ "learning_rate": 1.8646826319460593e-06,
5138
+ "loss": 2.1562,
5139
+ "step": 351000
5140
+ },
5141
+ {
5142
+ "epoch": 27.241726730217778,
5143
+ "grad_norm": 6.473196506500244,
5144
+ "learning_rate": 1.838848846521481e-06,
5145
+ "loss": 2.167,
5146
+ "step": 351500
5147
+ },
5148
+ {
5149
+ "epoch": 27.280477408354646,
5150
+ "grad_norm": 6.8429694175720215,
5151
+ "learning_rate": 1.8130150610969026e-06,
5152
+ "loss": 2.1587,
5153
+ "step": 352000
5154
+ },
5155
+ {
5156
+ "epoch": 27.319228086491513,
5157
+ "grad_norm": 6.667392253875732,
5158
+ "learning_rate": 1.7871812756723245e-06,
5159
+ "loss": 2.1575,
5160
+ "step": 352500
5161
+ },
5162
+ {
5163
+ "epoch": 27.35797876462838,
5164
+ "grad_norm": 7.551825046539307,
5165
+ "learning_rate": 1.7613474902477462e-06,
5166
+ "loss": 2.1567,
5167
+ "step": 353000
5168
+ },
5169
+ {
5170
+ "epoch": 27.39672944276525,
5171
+ "grad_norm": 7.393056392669678,
5172
+ "learning_rate": 1.735513704823168e-06,
5173
+ "loss": 2.163,
5174
+ "step": 353500
5175
+ },
5176
+ {
5177
+ "epoch": 27.435480120902117,
5178
+ "grad_norm": 6.7227678298950195,
5179
+ "learning_rate": 1.7096799193985896e-06,
5180
+ "loss": 2.1678,
5181
+ "step": 354000
5182
+ },
5183
+ {
5184
+ "epoch": 27.474230799038985,
5185
+ "grad_norm": 6.587380409240723,
5186
+ "learning_rate": 1.6838461339740115e-06,
5187
+ "loss": 2.1611,
5188
+ "step": 354500
5189
+ },
5190
+ {
5191
+ "epoch": 27.51298147717585,
5192
+ "grad_norm": 7.290678977966309,
5193
+ "learning_rate": 1.6580123485494332e-06,
5194
+ "loss": 2.1555,
5195
+ "step": 355000
5196
+ },
5197
+ {
5198
+ "epoch": 27.551732155312717,
5199
+ "grad_norm": 6.52154016494751,
5200
+ "learning_rate": 1.6321785631248548e-06,
5201
+ "loss": 2.1487,
5202
+ "step": 355500
5203
+ },
5204
+ {
5205
+ "epoch": 27.590482833449585,
5206
+ "grad_norm": 6.613160610198975,
5207
+ "learning_rate": 1.6063447777002765e-06,
5208
+ "loss": 2.1599,
5209
+ "step": 356000
5210
+ },
5211
+ {
5212
+ "epoch": 27.629233511586452,
5213
+ "grad_norm": 7.148532390594482,
5214
+ "learning_rate": 1.5805109922756984e-06,
5215
+ "loss": 2.1731,
5216
+ "step": 356500
5217
+ },
5218
+ {
5219
+ "epoch": 27.66798418972332,
5220
+ "grad_norm": 6.29647159576416,
5221
+ "learning_rate": 1.5546772068511201e-06,
5222
+ "loss": 2.1641,
5223
+ "step": 357000
5224
+ },
5225
+ {
5226
+ "epoch": 27.706734867860188,
5227
+ "grad_norm": 6.647765636444092,
5228
+ "learning_rate": 1.5288434214265418e-06,
5229
+ "loss": 2.1756,
5230
+ "step": 357500
5231
+ },
5232
+ {
5233
+ "epoch": 27.745485545997056,
5234
+ "grad_norm": 6.541094779968262,
5235
+ "learning_rate": 1.5030096360019635e-06,
5236
+ "loss": 2.1584,
5237
+ "step": 358000
5238
+ },
5239
+ {
5240
+ "epoch": 27.784236224133924,
5241
+ "grad_norm": 7.08396053314209,
5242
+ "learning_rate": 1.4771758505773854e-06,
5243
+ "loss": 2.1551,
5244
+ "step": 358500
5245
+ },
5246
+ {
5247
+ "epoch": 27.822986902270788,
5248
+ "grad_norm": 6.8339643478393555,
5249
+ "learning_rate": 1.451342065152807e-06,
5250
+ "loss": 2.1575,
5251
+ "step": 359000
5252
+ },
5253
+ {
5254
+ "epoch": 27.861737580407656,
5255
+ "grad_norm": 6.175314903259277,
5256
+ "learning_rate": 1.4255082797282288e-06,
5257
+ "loss": 2.1312,
5258
+ "step": 359500
5259
+ },
5260
+ {
5261
+ "epoch": 27.900488258544524,
5262
+ "grad_norm": 6.25184965133667,
5263
+ "learning_rate": 1.3996744943036504e-06,
5264
+ "loss": 2.1509,
5265
+ "step": 360000
5266
+ },
5267
+ {
5268
+ "epoch": 27.93923893668139,
5269
+ "grad_norm": 7.08027982711792,
5270
+ "learning_rate": 1.3738407088790723e-06,
5271
+ "loss": 2.159,
5272
+ "step": 360500
5273
+ },
5274
+ {
5275
+ "epoch": 27.97798961481826,
5276
+ "grad_norm": 6.8008880615234375,
5277
+ "learning_rate": 1.348006923454494e-06,
5278
+ "loss": 2.1634,
5279
+ "step": 361000
5280
+ },
5281
+ {
5282
+ "epoch": 28.0,
5283
+ "eval_loss": 2.130622386932373,
5284
+ "eval_runtime": 269.6844,
5285
+ "eval_samples_per_second": 765.58,
5286
+ "eval_steps_per_second": 11.966,
5287
+ "step": 361284
5288
+ },
5289
+ {
5290
+ "epoch": 28.016740292955127,
5291
+ "grad_norm": 7.46795654296875,
5292
+ "learning_rate": 1.3221731380299157e-06,
5293
+ "loss": 2.1363,
5294
+ "step": 361500
5295
+ },
5296
+ {
5297
+ "epoch": 28.055490971091995,
5298
+ "grad_norm": 7.271740436553955,
5299
+ "learning_rate": 1.2963393526053374e-06,
5300
+ "loss": 2.1604,
5301
+ "step": 362000
5302
+ },
5303
+ {
5304
+ "epoch": 28.094241649228863,
5305
+ "grad_norm": 6.692265510559082,
5306
+ "learning_rate": 1.2705055671807593e-06,
5307
+ "loss": 2.1596,
5308
+ "step": 362500
5309
+ },
5310
+ {
5311
+ "epoch": 28.132992327365727,
5312
+ "grad_norm": 6.122591018676758,
5313
+ "learning_rate": 1.2446717817561808e-06,
5314
+ "loss": 2.1524,
5315
+ "step": 363000
5316
+ },
5317
+ {
5318
+ "epoch": 28.171743005502595,
5319
+ "grad_norm": 6.683858394622803,
5320
+ "learning_rate": 1.2188379963316027e-06,
5321
+ "loss": 2.1598,
5322
+ "step": 363500
5323
+ },
5324
+ {
5325
+ "epoch": 28.210493683639463,
5326
+ "grad_norm": 6.768929958343506,
5327
+ "learning_rate": 1.1930042109070243e-06,
5328
+ "loss": 2.1515,
5329
+ "step": 364000
5330
+ },
5331
+ {
5332
+ "epoch": 28.24924436177633,
5333
+ "grad_norm": 6.956704139709473,
5334
+ "learning_rate": 1.167170425482446e-06,
5335
+ "loss": 2.1552,
5336
+ "step": 364500
5337
+ },
5338
+ {
5339
+ "epoch": 28.2879950399132,
5340
+ "grad_norm": 6.655780792236328,
5341
+ "learning_rate": 1.1413366400578677e-06,
5342
+ "loss": 2.1551,
5343
+ "step": 365000
5344
+ },
5345
+ {
5346
+ "epoch": 28.326745718050066,
5347
+ "grad_norm": 7.394413471221924,
5348
+ "learning_rate": 1.1155028546332896e-06,
5349
+ "loss": 2.1465,
5350
+ "step": 365500
5351
+ },
5352
+ {
5353
+ "epoch": 28.365496396186934,
5354
+ "grad_norm": 7.250267505645752,
5355
+ "learning_rate": 1.0896690692087113e-06,
5356
+ "loss": 2.1729,
5357
+ "step": 366000
5358
+ },
5359
+ {
5360
+ "epoch": 28.404247074323802,
5361
+ "grad_norm": 6.102252960205078,
5362
+ "learning_rate": 1.063835283784133e-06,
5363
+ "loss": 2.1556,
5364
+ "step": 366500
5365
+ },
5366
+ {
5367
+ "epoch": 28.44299775246067,
5368
+ "grad_norm": 6.5598297119140625,
5369
+ "learning_rate": 1.0380014983595547e-06,
5370
+ "loss": 2.1473,
5371
+ "step": 367000
5372
+ },
5373
+ {
5374
+ "epoch": 28.481748430597534,
5375
+ "grad_norm": 7.368846416473389,
5376
+ "learning_rate": 1.0121677129349766e-06,
5377
+ "loss": 2.1552,
5378
+ "step": 367500
5379
+ },
5380
+ {
5381
+ "epoch": 28.520499108734402,
5382
+ "grad_norm": 6.635545253753662,
5383
+ "learning_rate": 9.863339275103983e-07,
5384
+ "loss": 2.1584,
5385
+ "step": 368000
5386
+ },
5387
+ {
5388
+ "epoch": 28.55924978687127,
5389
+ "grad_norm": 6.502518177032471,
5390
+ "learning_rate": 9.6050014208582e-07,
5391
+ "loss": 2.1669,
5392
+ "step": 368500
5393
+ },
5394
+ {
5395
+ "epoch": 28.598000465008138,
5396
+ "grad_norm": 7.150147914886475,
5397
+ "learning_rate": 9.346663566612417e-07,
5398
+ "loss": 2.158,
5399
+ "step": 369000
5400
+ },
5401
+ {
5402
+ "epoch": 28.636751143145005,
5403
+ "grad_norm": 6.391610622406006,
5404
+ "learning_rate": 9.088325712366634e-07,
5405
+ "loss": 2.1464,
5406
+ "step": 369500
5407
+ },
5408
+ {
5409
+ "epoch": 28.675501821281873,
5410
+ "grad_norm": 6.436591625213623,
5411
+ "learning_rate": 8.829987858120852e-07,
5412
+ "loss": 2.1438,
5413
+ "step": 370000
5414
+ },
5415
+ {
5416
+ "epoch": 28.71425249941874,
5417
+ "grad_norm": 6.646981716156006,
5418
+ "learning_rate": 8.571650003875069e-07,
5419
+ "loss": 2.1507,
5420
+ "step": 370500
5421
+ },
5422
+ {
5423
+ "epoch": 28.753003177555605,
5424
+ "grad_norm": 6.943175792694092,
5425
+ "learning_rate": 8.313312149629287e-07,
5426
+ "loss": 2.1483,
5427
+ "step": 371000
5428
+ },
5429
+ {
5430
+ "epoch": 28.791753855692473,
5431
+ "grad_norm": 6.345837116241455,
5432
+ "learning_rate": 8.054974295383504e-07,
5433
+ "loss": 2.1662,
5434
+ "step": 371500
5435
+ },
5436
+ {
5437
+ "epoch": 28.83050453382934,
5438
+ "grad_norm": 6.562370300292969,
5439
+ "learning_rate": 7.79663644113772e-07,
5440
+ "loss": 2.1554,
5441
+ "step": 372000
5442
+ },
5443
+ {
5444
+ "epoch": 28.86925521196621,
5445
+ "grad_norm": 6.556326866149902,
5446
+ "learning_rate": 7.538298586891937e-07,
5447
+ "loss": 2.1448,
5448
+ "step": 372500
5449
+ },
5450
+ {
5451
+ "epoch": 28.908005890103077,
5452
+ "grad_norm": 6.487407684326172,
5453
+ "learning_rate": 7.279960732646154e-07,
5454
+ "loss": 2.1425,
5455
+ "step": 373000
5456
+ },
5457
+ {
5458
+ "epoch": 28.946756568239945,
5459
+ "grad_norm": 7.614674091339111,
5460
+ "learning_rate": 7.021622878400372e-07,
5461
+ "loss": 2.1565,
5462
+ "step": 373500
5463
+ },
5464
+ {
5465
+ "epoch": 28.985507246376812,
5466
+ "grad_norm": 6.897189140319824,
5467
+ "learning_rate": 6.763285024154589e-07,
5468
+ "loss": 2.1438,
5469
+ "step": 374000
5470
+ },
5471
+ {
5472
+ "epoch": 29.0,
5473
+ "eval_loss": 2.1226651668548584,
5474
+ "eval_runtime": 266.9511,
5475
+ "eval_samples_per_second": 773.419,
5476
+ "eval_steps_per_second": 12.088,
5477
+ "step": 374187
5478
+ },
5479
+ {
5480
+ "epoch": 29.02425792451368,
5481
+ "grad_norm": 6.869750499725342,
5482
+ "learning_rate": 6.504947169908807e-07,
5483
+ "loss": 2.138,
5484
+ "step": 374500
5485
+ },
5486
+ {
5487
+ "epoch": 29.063008602650548,
5488
+ "grad_norm": 7.1249589920043945,
5489
+ "learning_rate": 6.246609315663025e-07,
5490
+ "loss": 2.1527,
5491
+ "step": 375000
5492
+ },
5493
+ {
5494
+ "epoch": 29.101759280787412,
5495
+ "grad_norm": 7.201192378997803,
5496
+ "learning_rate": 5.988271461417243e-07,
5497
+ "loss": 2.1517,
5498
+ "step": 375500
5499
+ },
5500
+ {
5501
+ "epoch": 29.14050995892428,
5502
+ "grad_norm": 6.720222473144531,
5503
+ "learning_rate": 5.729933607171459e-07,
5504
+ "loss": 2.1526,
5505
+ "step": 376000
5506
+ },
5507
+ {
5508
+ "epoch": 29.179260637061148,
5509
+ "grad_norm": 6.9030866622924805,
5510
+ "learning_rate": 5.471595752925676e-07,
5511
+ "loss": 2.1532,
5512
+ "step": 376500
5513
+ },
5514
+ {
5515
+ "epoch": 29.218011315198016,
5516
+ "grad_norm": 5.900801181793213,
5517
+ "learning_rate": 5.213257898679893e-07,
5518
+ "loss": 2.1534,
5519
+ "step": 377000
5520
+ },
5521
+ {
5522
+ "epoch": 29.256761993334884,
5523
+ "grad_norm": 6.259501934051514,
5524
+ "learning_rate": 4.954920044434111e-07,
5525
+ "loss": 2.1621,
5526
+ "step": 377500
5527
+ },
5528
+ {
5529
+ "epoch": 29.29551267147175,
5530
+ "grad_norm": 6.566405296325684,
5531
+ "learning_rate": 4.6965821901883286e-07,
5532
+ "loss": 2.1621,
5533
+ "step": 378000
5534
+ },
5535
+ {
5536
+ "epoch": 29.33426334960862,
5537
+ "grad_norm": 6.553793430328369,
5538
+ "learning_rate": 4.438244335942546e-07,
5539
+ "loss": 2.1631,
5540
+ "step": 378500
5541
+ },
5542
+ {
5543
+ "epoch": 29.373014027745487,
5544
+ "grad_norm": 6.773620128631592,
5545
+ "learning_rate": 4.1799064816967634e-07,
5546
+ "loss": 2.1556,
5547
+ "step": 379000
5548
+ },
5549
+ {
5550
+ "epoch": 29.41176470588235,
5551
+ "grad_norm": 6.494615077972412,
5552
+ "learning_rate": 3.921568627450981e-07,
5553
+ "loss": 2.1554,
5554
+ "step": 379500
5555
+ },
5556
+ {
5557
+ "epoch": 29.45051538401922,
5558
+ "grad_norm": 7.172949314117432,
5559
+ "learning_rate": 3.663230773205198e-07,
5560
+ "loss": 2.1494,
5561
+ "step": 380000
5562
+ },
5563
+ {
5564
+ "epoch": 29.489266062156087,
5565
+ "grad_norm": 6.8991241455078125,
5566
+ "learning_rate": 3.4048929189594155e-07,
5567
+ "loss": 2.1406,
5568
+ "step": 380500
5569
+ },
5570
+ {
5571
+ "epoch": 29.528016740292955,
5572
+ "grad_norm": 7.046799182891846,
5573
+ "learning_rate": 3.146555064713633e-07,
5574
+ "loss": 2.1493,
5575
+ "step": 381000
5576
+ },
5577
+ {
5578
+ "epoch": 29.566767418429823,
5579
+ "grad_norm": 6.826701641082764,
5580
+ "learning_rate": 2.8882172104678503e-07,
5581
+ "loss": 2.1459,
5582
+ "step": 381500
5583
+ },
5584
+ {
5585
+ "epoch": 29.60551809656669,
5586
+ "grad_norm": 6.649389743804932,
5587
+ "learning_rate": 2.6298793562220677e-07,
5588
+ "loss": 2.1593,
5589
+ "step": 382000
5590
+ },
5591
+ {
5592
+ "epoch": 29.64426877470356,
5593
+ "grad_norm": 6.10260009765625,
5594
+ "learning_rate": 2.3715415019762845e-07,
5595
+ "loss": 2.151,
5596
+ "step": 382500
5597
+ },
5598
+ {
5599
+ "epoch": 29.683019452840426,
5600
+ "grad_norm": 6.9101128578186035,
5601
+ "learning_rate": 2.113203647730502e-07,
5602
+ "loss": 2.1625,
5603
+ "step": 383000
5604
+ },
5605
+ {
5606
+ "epoch": 29.72177013097729,
5607
+ "grad_norm": 6.671387672424316,
5608
+ "learning_rate": 1.8548657934847193e-07,
5609
+ "loss": 2.1496,
5610
+ "step": 383500
5611
+ },
5612
+ {
5613
+ "epoch": 29.76052080911416,
5614
+ "grad_norm": 7.705864429473877,
5615
+ "learning_rate": 1.5965279392389367e-07,
5616
+ "loss": 2.1496,
5617
+ "step": 384000
5618
+ },
5619
+ {
5620
+ "epoch": 29.799271487251026,
5621
+ "grad_norm": 6.319827079772949,
5622
+ "learning_rate": 1.338190084993154e-07,
5623
+ "loss": 2.149,
5624
+ "step": 384500
5625
+ },
5626
+ {
5627
+ "epoch": 29.838022165387894,
5628
+ "grad_norm": 6.850592613220215,
5629
+ "learning_rate": 1.0798522307473716e-07,
5630
+ "loss": 2.1511,
5631
+ "step": 385000
5632
+ },
5633
+ {
5634
+ "epoch": 29.876772843524762,
5635
+ "grad_norm": 7.166690826416016,
5636
+ "learning_rate": 8.21514376501589e-08,
5637
+ "loss": 2.1492,
5638
+ "step": 385500
5639
+ },
5640
+ {
5641
+ "epoch": 29.91552352166163,
5642
+ "grad_norm": 6.324465274810791,
5643
+ "learning_rate": 5.631765222558062e-08,
5644
+ "loss": 2.1523,
5645
+ "step": 386000
5646
+ },
5647
+ {
5648
+ "epoch": 29.954274199798498,
5649
+ "grad_norm": 7.214087009429932,
5650
+ "learning_rate": 3.048386680100235e-08,
5651
+ "loss": 2.1628,
5652
+ "step": 386500
5653
+ },
5654
+ {
5655
+ "epoch": 29.993024877935365,
5656
+ "grad_norm": 6.88369607925415,
5657
+ "learning_rate": 4.6500813764240875e-09,
5658
+ "loss": 2.144,
5659
+ "step": 387000
5660
+ },
5661
+ {
5662
+ "epoch": 30.0,
5663
+ "eval_loss": 2.125218629837036,
5664
+ "eval_runtime": 267.0237,
5665
+ "eval_samples_per_second": 773.209,
5666
+ "eval_steps_per_second": 12.085,
5667
+ "step": 387090
5668
  }
5669
  ],
5670
  "logging_steps": 500,
5671
+ "max_steps": 387090,
5672
  "num_input_tokens_seen": 0,
5673
+ "num_train_epochs": 30,
5674
  "save_steps": 500,
5675
  "stateful_callbacks": {
5676
  "TrainerControl": {
 
5684
  "attributes": {}
5685
  }
5686
  },
5687
+ "total_flos": 9.706577784666885e+17,
5688
  "train_batch_size": 64,
5689
  "trial_name": null,
5690
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:74ba329b03609527987c8126060f1c2c7e67ac7fad6da2fce4fad8d3324d3a3b
3
  size 5048
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26bd31272e273002d078f94ae220a0225f0a03244f57837a66ef9002c2b6fb24
3
  size 5048