CocoRoF commited on
Commit
a7da077
·
verified ·
1 Parent(s): 2cb70e3

Training in progress, step 30000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd8fd0ff66e27efb1ed3a7e6f1973ec474b8b4a9f2bc4462fefbe4196ef0b8e6
3
  size 368988278
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:055db62e7afaf72547d8020ffa4c60d79b2df7d5d99747310e09d238a4ba1fa7
3
  size 368988278
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:40f46e4f09ecfddda081f3c817b19993ae436a048791221c20ca2ab6b4c612da
3
  size 1107079290
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c57ff1606838ae93b0606705e53592c3c93bfa3a777074b3409ef82ed78e848
3
  size 1107079290
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e735ed11597ed40a2b6854e0229902e1a21fedc0a0dbc608ca905fae57d5b06b
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1a2fbcd26bac3ea7dc02fc9ede5b8a1914ca51611473722a11a969e1f26ac0ee
3
  size 15984
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2ba3815fc0953b1b7f08cea092dfc0a62c4bbc2a2c68780d3f4dd0b5e22582a7
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66d97b511d2fdb8061e5bf72c139923941c148260fac1caedd654028da6986c1
3
  size 15984
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:647ac15563fcad903adbb616e9b2c36b237a3ed5939d088620212da969930f6c
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3839473129eb8c438ab312370daa55eb10a0790f33d38fc5eaa24859b54b0d1f
3
  size 15984
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:93e3733c5b180986b7efbec17b663bf5231343d187374d184768fcd913797167
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5088a0d34c7015afe60457fbb3f0a4740839369017a42ea4b3250322c2d63ceb
3
  size 15984
last-checkpoint/rng_state_4.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9820ea4fec1b01f3da091290c3e8b5ddb86a3a3fa17285c248b64910c2d0b4f0
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f9cac0eb25286b75549fa2030810940adf357064a83facaf5c58ebe37190b6ac
3
  size 15984
last-checkpoint/rng_state_5.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7413035def085e41776a629afc94fc24fe5a955f1ad83b32f9b370ab60f9a18d
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0a57d29811122d52bd53f81af680412b91dde1cd2a12fa885d8a54388be8e2d
3
  size 15984
last-checkpoint/rng_state_6.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:91e3953bcbf4089415abffbd914fbbe4580121f6c843eabbf70624c5ed144814
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2c90ab29b255eaf920ecc1cba0b586e426f8e2db67b44a65576693f84178a04f
3
  size 15984
last-checkpoint/rng_state_7.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:369fde7bff4dfc0d6b9cf773cf9b0352696083f84763999e05a631ee6d52c5e3
3
  size 15984
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d4efbfa3cfb1bb8fb9c3380e65959a8b4eaf3bceb0507a26ffba1a3e4636ddb1
3
  size 15984
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2d91b2e4d532624cd81aa3d0bf4043f84fcc3ffd3b07edd5e64ec534770339a6
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4460050461ccd15ef821d88f33ca8aec62edc9562663da8bad202acbfef43bd7
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.7224232967064722,
5
  "eval_steps": 3000,
6
- "global_step": 25000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3571,6 +3571,722 @@
3571
  "learning_rate": 1.1287904784265563e-07,
3572
  "loss": 15.5976,
3573
  "step": 25000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3574
  }
3575
  ],
3576
  "logging_steps": 50,
@@ -3590,7 +4306,7 @@
3590
  "attributes": {}
3591
  }
3592
  },
3593
- "total_flos": 4.365401538428928e+18,
3594
  "train_batch_size": 8,
3595
  "trial_name": null,
3596
  "trial_params": null
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 0.8669079560477666,
5
  "eval_steps": 3000,
6
+ "global_step": 30000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3571
  "learning_rate": 1.1287904784265563e-07,
3572
  "loss": 15.5976,
3573
  "step": 25000
3574
+ },
3575
+ {
3576
+ "epoch": 0.7238681432998851,
3577
+ "grad_norm": 20.765625,
3578
+ "learning_rate": 1.1310480593834094e-07,
3579
+ "loss": 15.651,
3580
+ "step": 25050
3581
+ },
3582
+ {
3583
+ "epoch": 0.7253129898932981,
3584
+ "grad_norm": 21.203125,
3585
+ "learning_rate": 1.1333056403402626e-07,
3586
+ "loss": 15.7333,
3587
+ "step": 25100
3588
+ },
3589
+ {
3590
+ "epoch": 0.726757836486711,
3591
+ "grad_norm": 22.25,
3592
+ "learning_rate": 1.1355632212971157e-07,
3593
+ "loss": 15.6802,
3594
+ "step": 25150
3595
+ },
3596
+ {
3597
+ "epoch": 0.7282026830801239,
3598
+ "grad_norm": 22.8125,
3599
+ "learning_rate": 1.1378208022539689e-07,
3600
+ "loss": 15.6639,
3601
+ "step": 25200
3602
+ },
3603
+ {
3604
+ "epoch": 0.729647529673537,
3605
+ "grad_norm": 21.140625,
3606
+ "learning_rate": 1.1400783832108218e-07,
3607
+ "loss": 15.6816,
3608
+ "step": 25250
3609
+ },
3610
+ {
3611
+ "epoch": 0.7310923762669499,
3612
+ "grad_norm": 23.0,
3613
+ "learning_rate": 1.1423359641676749e-07,
3614
+ "loss": 15.5984,
3615
+ "step": 25300
3616
+ },
3617
+ {
3618
+ "epoch": 0.7325372228603628,
3619
+ "grad_norm": 23.40625,
3620
+ "learning_rate": 1.1445935451245281e-07,
3621
+ "loss": 15.7119,
3622
+ "step": 25350
3623
+ },
3624
+ {
3625
+ "epoch": 0.7339820694537758,
3626
+ "grad_norm": 21.296875,
3627
+ "learning_rate": 1.1468511260813812e-07,
3628
+ "loss": 15.6212,
3629
+ "step": 25400
3630
+ },
3631
+ {
3632
+ "epoch": 0.7354269160471887,
3633
+ "grad_norm": 20.03125,
3634
+ "learning_rate": 1.1491087070382344e-07,
3635
+ "loss": 15.659,
3636
+ "step": 25450
3637
+ },
3638
+ {
3639
+ "epoch": 0.7368717626406016,
3640
+ "grad_norm": 22.984375,
3641
+ "learning_rate": 1.1513662879950876e-07,
3642
+ "loss": 15.771,
3643
+ "step": 25500
3644
+ },
3645
+ {
3646
+ "epoch": 0.7383166092340145,
3647
+ "grad_norm": 21.84375,
3648
+ "learning_rate": 1.1536238689519404e-07,
3649
+ "loss": 15.7338,
3650
+ "step": 25550
3651
+ },
3652
+ {
3653
+ "epoch": 0.7397614558274275,
3654
+ "grad_norm": 22.234375,
3655
+ "learning_rate": 1.1558814499087936e-07,
3656
+ "loss": 15.6507,
3657
+ "step": 25600
3658
+ },
3659
+ {
3660
+ "epoch": 0.7412063024208405,
3661
+ "grad_norm": 25.3125,
3662
+ "learning_rate": 1.1581390308656468e-07,
3663
+ "loss": 15.7084,
3664
+ "step": 25650
3665
+ },
3666
+ {
3667
+ "epoch": 0.7426511490142534,
3668
+ "grad_norm": 26.171875,
3669
+ "learning_rate": 1.1603966118224999e-07,
3670
+ "loss": 15.5601,
3671
+ "step": 25700
3672
+ },
3673
+ {
3674
+ "epoch": 0.7440959956076664,
3675
+ "grad_norm": 22.515625,
3676
+ "learning_rate": 1.1626541927793531e-07,
3677
+ "loss": 15.7191,
3678
+ "step": 25750
3679
+ },
3680
+ {
3681
+ "epoch": 0.7455408422010793,
3682
+ "grad_norm": 24.375,
3683
+ "learning_rate": 1.1649117737362062e-07,
3684
+ "loss": 15.6457,
3685
+ "step": 25800
3686
+ },
3687
+ {
3688
+ "epoch": 0.7469856887944922,
3689
+ "grad_norm": 23.640625,
3690
+ "learning_rate": 1.1671693546930592e-07,
3691
+ "loss": 15.572,
3692
+ "step": 25850
3693
+ },
3694
+ {
3695
+ "epoch": 0.7484305353879052,
3696
+ "grad_norm": 24.375,
3697
+ "learning_rate": 1.1694269356499123e-07,
3698
+ "loss": 15.6297,
3699
+ "step": 25900
3700
+ },
3701
+ {
3702
+ "epoch": 0.7498753819813181,
3703
+ "grad_norm": 23.8125,
3704
+ "learning_rate": 1.1716845166067654e-07,
3705
+ "loss": 15.6828,
3706
+ "step": 25950
3707
+ },
3708
+ {
3709
+ "epoch": 0.7513202285747311,
3710
+ "grad_norm": 23.953125,
3711
+ "learning_rate": 1.1739420975636186e-07,
3712
+ "loss": 15.5568,
3713
+ "step": 26000
3714
+ },
3715
+ {
3716
+ "epoch": 0.7527650751681441,
3717
+ "grad_norm": 25.421875,
3718
+ "learning_rate": 1.1761996785204717e-07,
3719
+ "loss": 15.6016,
3720
+ "step": 26050
3721
+ },
3722
+ {
3723
+ "epoch": 0.754209921761557,
3724
+ "grad_norm": 22.15625,
3725
+ "learning_rate": 1.1784572594773249e-07,
3726
+ "loss": 15.5887,
3727
+ "step": 26100
3728
+ },
3729
+ {
3730
+ "epoch": 0.7556547683549699,
3731
+ "grad_norm": 21.5625,
3732
+ "learning_rate": 1.1807148404341779e-07,
3733
+ "loss": 15.6077,
3734
+ "step": 26150
3735
+ },
3736
+ {
3737
+ "epoch": 0.7570996149483828,
3738
+ "grad_norm": 23.328125,
3739
+ "learning_rate": 1.1829724213910311e-07,
3740
+ "loss": 15.6592,
3741
+ "step": 26200
3742
+ },
3743
+ {
3744
+ "epoch": 0.7585444615417958,
3745
+ "grad_norm": 23.71875,
3746
+ "learning_rate": 1.1852300023478841e-07,
3747
+ "loss": 15.621,
3748
+ "step": 26250
3749
+ },
3750
+ {
3751
+ "epoch": 0.7599893081352087,
3752
+ "grad_norm": 24.5625,
3753
+ "learning_rate": 1.1874875833047373e-07,
3754
+ "loss": 15.5728,
3755
+ "step": 26300
3756
+ },
3757
+ {
3758
+ "epoch": 0.7614341547286216,
3759
+ "grad_norm": 23.21875,
3760
+ "learning_rate": 1.1897451642615904e-07,
3761
+ "loss": 15.6658,
3762
+ "step": 26350
3763
+ },
3764
+ {
3765
+ "epoch": 0.7628790013220347,
3766
+ "grad_norm": 24.96875,
3767
+ "learning_rate": 1.1920027452184434e-07,
3768
+ "loss": 15.4367,
3769
+ "step": 26400
3770
+ },
3771
+ {
3772
+ "epoch": 0.7643238479154476,
3773
+ "grad_norm": 23.296875,
3774
+ "learning_rate": 1.1942603261752967e-07,
3775
+ "loss": 15.6812,
3776
+ "step": 26450
3777
+ },
3778
+ {
3779
+ "epoch": 0.7657686945088605,
3780
+ "grad_norm": 21.21875,
3781
+ "learning_rate": 1.1965179071321497e-07,
3782
+ "loss": 15.4966,
3783
+ "step": 26500
3784
+ },
3785
+ {
3786
+ "epoch": 0.7672135411022735,
3787
+ "grad_norm": 21.859375,
3788
+ "learning_rate": 1.1987754880890028e-07,
3789
+ "loss": 15.6969,
3790
+ "step": 26550
3791
+ },
3792
+ {
3793
+ "epoch": 0.7686583876956864,
3794
+ "grad_norm": 21.234375,
3795
+ "learning_rate": 1.2010330690458558e-07,
3796
+ "loss": 15.5063,
3797
+ "step": 26600
3798
+ },
3799
+ {
3800
+ "epoch": 0.7701032342890993,
3801
+ "grad_norm": 30.65625,
3802
+ "learning_rate": 1.203290650002709e-07,
3803
+ "loss": 15.5682,
3804
+ "step": 26650
3805
+ },
3806
+ {
3807
+ "epoch": 0.7715480808825123,
3808
+ "grad_norm": 23.0625,
3809
+ "learning_rate": 1.205548230959562e-07,
3810
+ "loss": 15.5967,
3811
+ "step": 26700
3812
+ },
3813
+ {
3814
+ "epoch": 0.7729929274759253,
3815
+ "grad_norm": 22.171875,
3816
+ "learning_rate": 1.2078058119164154e-07,
3817
+ "loss": 15.5911,
3818
+ "step": 26750
3819
+ },
3820
+ {
3821
+ "epoch": 0.7744377740693382,
3822
+ "grad_norm": 25.8125,
3823
+ "learning_rate": 1.2100633928732684e-07,
3824
+ "loss": 15.6635,
3825
+ "step": 26800
3826
+ },
3827
+ {
3828
+ "epoch": 0.7758826206627512,
3829
+ "grad_norm": 23.40625,
3830
+ "learning_rate": 1.2123209738301214e-07,
3831
+ "loss": 15.5876,
3832
+ "step": 26850
3833
+ },
3834
+ {
3835
+ "epoch": 0.7773274672561641,
3836
+ "grad_norm": 21.15625,
3837
+ "learning_rate": 1.2145785547869745e-07,
3838
+ "loss": 15.5192,
3839
+ "step": 26900
3840
+ },
3841
+ {
3842
+ "epoch": 0.778772313849577,
3843
+ "grad_norm": 23.5625,
3844
+ "learning_rate": 1.2168361357438277e-07,
3845
+ "loss": 15.5746,
3846
+ "step": 26950
3847
+ },
3848
+ {
3849
+ "epoch": 0.7802171604429899,
3850
+ "grad_norm": 27.359375,
3851
+ "learning_rate": 1.2190937167006808e-07,
3852
+ "loss": 15.5407,
3853
+ "step": 27000
3854
+ },
3855
+ {
3856
+ "epoch": 0.7802171604429899,
3857
+ "eval_loss": 1.9437412023544312,
3858
+ "eval_runtime": 340.4,
3859
+ "eval_samples_per_second": 2739.524,
3860
+ "eval_steps_per_second": 42.806,
3861
+ "step": 27000
3862
+ },
3863
+ {
3864
+ "epoch": 0.7816620070364029,
3865
+ "grad_norm": 23.0625,
3866
+ "learning_rate": 1.221351297657534e-07,
3867
+ "loss": 15.609,
3868
+ "step": 27050
3869
+ },
3870
+ {
3871
+ "epoch": 0.7831068536298158,
3872
+ "grad_norm": 25.40625,
3873
+ "learning_rate": 1.223608878614387e-07,
3874
+ "loss": 15.6637,
3875
+ "step": 27100
3876
+ },
3877
+ {
3878
+ "epoch": 0.7845517002232288,
3879
+ "grad_norm": 23.90625,
3880
+ "learning_rate": 1.22586645957124e-07,
3881
+ "loss": 15.6405,
3882
+ "step": 27150
3883
+ },
3884
+ {
3885
+ "epoch": 0.7859965468166418,
3886
+ "grad_norm": 22.390625,
3887
+ "learning_rate": 1.2281240405280934e-07,
3888
+ "loss": 15.5515,
3889
+ "step": 27200
3890
+ },
3891
+ {
3892
+ "epoch": 0.7874413934100547,
3893
+ "grad_norm": 25.265625,
3894
+ "learning_rate": 1.2303816214849464e-07,
3895
+ "loss": 15.5254,
3896
+ "step": 27250
3897
+ },
3898
+ {
3899
+ "epoch": 0.7888862400034676,
3900
+ "grad_norm": 22.125,
3901
+ "learning_rate": 1.2326392024417994e-07,
3902
+ "loss": 15.5474,
3903
+ "step": 27300
3904
+ },
3905
+ {
3906
+ "epoch": 0.7903310865968806,
3907
+ "grad_norm": 23.03125,
3908
+ "learning_rate": 1.2348967833986527e-07,
3909
+ "loss": 15.554,
3910
+ "step": 27350
3911
+ },
3912
+ {
3913
+ "epoch": 0.7917759331902935,
3914
+ "grad_norm": 19.96875,
3915
+ "learning_rate": 1.2371543643555057e-07,
3916
+ "loss": 15.5717,
3917
+ "step": 27400
3918
+ },
3919
+ {
3920
+ "epoch": 0.7932207797837064,
3921
+ "grad_norm": 20.53125,
3922
+ "learning_rate": 1.2394119453123588e-07,
3923
+ "loss": 15.5454,
3924
+ "step": 27450
3925
+ },
3926
+ {
3927
+ "epoch": 0.7946656263771195,
3928
+ "grad_norm": 21.34375,
3929
+ "learning_rate": 1.241669526269212e-07,
3930
+ "loss": 15.5759,
3931
+ "step": 27500
3932
+ },
3933
+ {
3934
+ "epoch": 0.7961104729705324,
3935
+ "grad_norm": 23.9375,
3936
+ "learning_rate": 1.243927107226065e-07,
3937
+ "loss": 15.5199,
3938
+ "step": 27550
3939
+ },
3940
+ {
3941
+ "epoch": 0.7975553195639453,
3942
+ "grad_norm": 21.84375,
3943
+ "learning_rate": 1.246184688182918e-07,
3944
+ "loss": 15.4171,
3945
+ "step": 27600
3946
+ },
3947
+ {
3948
+ "epoch": 0.7990001661573582,
3949
+ "grad_norm": 22.234375,
3950
+ "learning_rate": 1.2484422691397714e-07,
3951
+ "loss": 15.5973,
3952
+ "step": 27650
3953
+ },
3954
+ {
3955
+ "epoch": 0.8004450127507712,
3956
+ "grad_norm": 22.421875,
3957
+ "learning_rate": 1.2506998500966244e-07,
3958
+ "loss": 15.4923,
3959
+ "step": 27700
3960
+ },
3961
+ {
3962
+ "epoch": 0.8018898593441841,
3963
+ "grad_norm": 21.34375,
3964
+ "learning_rate": 1.2529574310534774e-07,
3965
+ "loss": 15.509,
3966
+ "step": 27750
3967
+ },
3968
+ {
3969
+ "epoch": 0.803334705937597,
3970
+ "grad_norm": 24.359375,
3971
+ "learning_rate": 1.2552150120103305e-07,
3972
+ "loss": 15.5824,
3973
+ "step": 27800
3974
+ },
3975
+ {
3976
+ "epoch": 0.80477955253101,
3977
+ "grad_norm": 21.25,
3978
+ "learning_rate": 1.2574725929671838e-07,
3979
+ "loss": 15.5973,
3980
+ "step": 27850
3981
+ },
3982
+ {
3983
+ "epoch": 0.806224399124423,
3984
+ "grad_norm": 27.984375,
3985
+ "learning_rate": 1.2597301739240368e-07,
3986
+ "loss": 15.564,
3987
+ "step": 27900
3988
+ },
3989
+ {
3990
+ "epoch": 0.8076692457178359,
3991
+ "grad_norm": 33.71875,
3992
+ "learning_rate": 1.26198775488089e-07,
3993
+ "loss": 15.439,
3994
+ "step": 27950
3995
+ },
3996
+ {
3997
+ "epoch": 0.8091140923112489,
3998
+ "grad_norm": 24.09375,
3999
+ "learning_rate": 1.264245335837743e-07,
4000
+ "loss": 15.5574,
4001
+ "step": 28000
4002
+ },
4003
+ {
4004
+ "epoch": 0.8105589389046618,
4005
+ "grad_norm": 22.78125,
4006
+ "learning_rate": 1.266502916794596e-07,
4007
+ "loss": 15.5851,
4008
+ "step": 28050
4009
+ },
4010
+ {
4011
+ "epoch": 0.8120037854980747,
4012
+ "grad_norm": 21.125,
4013
+ "learning_rate": 1.2687604977514494e-07,
4014
+ "loss": 15.5334,
4015
+ "step": 28100
4016
+ },
4017
+ {
4018
+ "epoch": 0.8134486320914877,
4019
+ "grad_norm": 21.5625,
4020
+ "learning_rate": 1.2710180787083024e-07,
4021
+ "loss": 15.5789,
4022
+ "step": 28150
4023
+ },
4024
+ {
4025
+ "epoch": 0.8148934786849006,
4026
+ "grad_norm": 21.8125,
4027
+ "learning_rate": 1.2732756596651557e-07,
4028
+ "loss": 15.4793,
4029
+ "step": 28200
4030
+ },
4031
+ {
4032
+ "epoch": 0.8163383252783136,
4033
+ "grad_norm": 21.234375,
4034
+ "learning_rate": 1.2755332406220087e-07,
4035
+ "loss": 15.5015,
4036
+ "step": 28250
4037
+ },
4038
+ {
4039
+ "epoch": 0.8177831718717266,
4040
+ "grad_norm": 21.203125,
4041
+ "learning_rate": 1.2777908215788618e-07,
4042
+ "loss": 15.3711,
4043
+ "step": 28300
4044
+ },
4045
+ {
4046
+ "epoch": 0.8192280184651395,
4047
+ "grad_norm": 23.84375,
4048
+ "learning_rate": 1.280048402535715e-07,
4049
+ "loss": 15.4745,
4050
+ "step": 28350
4051
+ },
4052
+ {
4053
+ "epoch": 0.8206728650585524,
4054
+ "grad_norm": 22.03125,
4055
+ "learning_rate": 1.2823059834925678e-07,
4056
+ "loss": 15.5285,
4057
+ "step": 28400
4058
+ },
4059
+ {
4060
+ "epoch": 0.8221177116519653,
4061
+ "grad_norm": 20.9375,
4062
+ "learning_rate": 1.284563564449421e-07,
4063
+ "loss": 15.5214,
4064
+ "step": 28450
4065
+ },
4066
+ {
4067
+ "epoch": 0.8235625582453783,
4068
+ "grad_norm": 25.546875,
4069
+ "learning_rate": 1.286821145406274e-07,
4070
+ "loss": 15.4168,
4071
+ "step": 28500
4072
+ },
4073
+ {
4074
+ "epoch": 0.8250074048387912,
4075
+ "grad_norm": 24.265625,
4076
+ "learning_rate": 1.2890787263631271e-07,
4077
+ "loss": 15.5043,
4078
+ "step": 28550
4079
+ },
4080
+ {
4081
+ "epoch": 0.8264522514322041,
4082
+ "grad_norm": 23.265625,
4083
+ "learning_rate": 1.2913363073199804e-07,
4084
+ "loss": 15.4206,
4085
+ "step": 28600
4086
+ },
4087
+ {
4088
+ "epoch": 0.8278970980256172,
4089
+ "grad_norm": 22.0,
4090
+ "learning_rate": 1.2935938882768334e-07,
4091
+ "loss": 15.4444,
4092
+ "step": 28650
4093
+ },
4094
+ {
4095
+ "epoch": 0.8293419446190301,
4096
+ "grad_norm": 25.09375,
4097
+ "learning_rate": 1.2958514692336867e-07,
4098
+ "loss": 15.4043,
4099
+ "step": 28700
4100
+ },
4101
+ {
4102
+ "epoch": 0.830786791212443,
4103
+ "grad_norm": 21.046875,
4104
+ "learning_rate": 1.2981090501905398e-07,
4105
+ "loss": 15.5465,
4106
+ "step": 28750
4107
+ },
4108
+ {
4109
+ "epoch": 0.832231637805856,
4110
+ "grad_norm": 21.234375,
4111
+ "learning_rate": 1.300366631147393e-07,
4112
+ "loss": 15.4988,
4113
+ "step": 28800
4114
+ },
4115
+ {
4116
+ "epoch": 0.8336764843992689,
4117
+ "grad_norm": 21.046875,
4118
+ "learning_rate": 1.302624212104246e-07,
4119
+ "loss": 15.4368,
4120
+ "step": 28850
4121
+ },
4122
+ {
4123
+ "epoch": 0.8351213309926818,
4124
+ "grad_norm": 23.46875,
4125
+ "learning_rate": 1.304881793061099e-07,
4126
+ "loss": 15.4251,
4127
+ "step": 28900
4128
+ },
4129
+ {
4130
+ "epoch": 0.8365661775860948,
4131
+ "grad_norm": 23.046875,
4132
+ "learning_rate": 1.3071393740179524e-07,
4133
+ "loss": 15.4271,
4134
+ "step": 28950
4135
+ },
4136
+ {
4137
+ "epoch": 0.8380110241795078,
4138
+ "grad_norm": 21.0,
4139
+ "learning_rate": 1.3093969549748054e-07,
4140
+ "loss": 15.4439,
4141
+ "step": 29000
4142
+ },
4143
+ {
4144
+ "epoch": 0.8394558707729207,
4145
+ "grad_norm": 21.96875,
4146
+ "learning_rate": 1.3116545359316584e-07,
4147
+ "loss": 15.4197,
4148
+ "step": 29050
4149
+ },
4150
+ {
4151
+ "epoch": 0.8409007173663336,
4152
+ "grad_norm": 21.109375,
4153
+ "learning_rate": 1.3139121168885115e-07,
4154
+ "loss": 15.428,
4155
+ "step": 29100
4156
+ },
4157
+ {
4158
+ "epoch": 0.8423455639597466,
4159
+ "grad_norm": 21.984375,
4160
+ "learning_rate": 1.3161696978453645e-07,
4161
+ "loss": 15.3989,
4162
+ "step": 29150
4163
+ },
4164
+ {
4165
+ "epoch": 0.8437904105531595,
4166
+ "grad_norm": 21.921875,
4167
+ "learning_rate": 1.3184272788022178e-07,
4168
+ "loss": 15.4178,
4169
+ "step": 29200
4170
+ },
4171
+ {
4172
+ "epoch": 0.8452352571465724,
4173
+ "grad_norm": 20.765625,
4174
+ "learning_rate": 1.3206848597590708e-07,
4175
+ "loss": 15.3614,
4176
+ "step": 29250
4177
+ },
4178
+ {
4179
+ "epoch": 0.8466801037399854,
4180
+ "grad_norm": 21.390625,
4181
+ "learning_rate": 1.322942440715924e-07,
4182
+ "loss": 15.4306,
4183
+ "step": 29300
4184
+ },
4185
+ {
4186
+ "epoch": 0.8481249503333983,
4187
+ "grad_norm": 28.84375,
4188
+ "learning_rate": 1.325200021672777e-07,
4189
+ "loss": 15.4706,
4190
+ "step": 29350
4191
+ },
4192
+ {
4193
+ "epoch": 0.8495697969268113,
4194
+ "grad_norm": 22.921875,
4195
+ "learning_rate": 1.32745760262963e-07,
4196
+ "loss": 15.4703,
4197
+ "step": 29400
4198
+ },
4199
+ {
4200
+ "epoch": 0.8510146435202243,
4201
+ "grad_norm": 36.5625,
4202
+ "learning_rate": 1.3297151835864834e-07,
4203
+ "loss": 15.45,
4204
+ "step": 29450
4205
+ },
4206
+ {
4207
+ "epoch": 0.8524594901136372,
4208
+ "grad_norm": 24.25,
4209
+ "learning_rate": 1.3319727645433364e-07,
4210
+ "loss": 15.4233,
4211
+ "step": 29500
4212
+ },
4213
+ {
4214
+ "epoch": 0.8539043367070501,
4215
+ "grad_norm": 34.96875,
4216
+ "learning_rate": 1.3342303455001897e-07,
4217
+ "loss": 15.4693,
4218
+ "step": 29550
4219
+ },
4220
+ {
4221
+ "epoch": 0.8553491833004631,
4222
+ "grad_norm": 21.953125,
4223
+ "learning_rate": 1.3364879264570427e-07,
4224
+ "loss": 15.4402,
4225
+ "step": 29600
4226
+ },
4227
+ {
4228
+ "epoch": 0.856794029893876,
4229
+ "grad_norm": 22.40625,
4230
+ "learning_rate": 1.338745507413896e-07,
4231
+ "loss": 15.4761,
4232
+ "step": 29650
4233
+ },
4234
+ {
4235
+ "epoch": 0.8582388764872889,
4236
+ "grad_norm": 20.625,
4237
+ "learning_rate": 1.341003088370749e-07,
4238
+ "loss": 15.3796,
4239
+ "step": 29700
4240
+ },
4241
+ {
4242
+ "epoch": 0.859683723080702,
4243
+ "grad_norm": 21.328125,
4244
+ "learning_rate": 1.3432606693276018e-07,
4245
+ "loss": 15.3857,
4246
+ "step": 29750
4247
+ },
4248
+ {
4249
+ "epoch": 0.8611285696741149,
4250
+ "grad_norm": 23.375,
4251
+ "learning_rate": 1.345518250284455e-07,
4252
+ "loss": 15.414,
4253
+ "step": 29800
4254
+ },
4255
+ {
4256
+ "epoch": 0.8625734162675278,
4257
+ "grad_norm": 22.671875,
4258
+ "learning_rate": 1.347775831241308e-07,
4259
+ "loss": 15.3401,
4260
+ "step": 29850
4261
+ },
4262
+ {
4263
+ "epoch": 0.8640182628609407,
4264
+ "grad_norm": 22.65625,
4265
+ "learning_rate": 1.3500334121981614e-07,
4266
+ "loss": 15.346,
4267
+ "step": 29900
4268
+ },
4269
+ {
4270
+ "epoch": 0.8654631094543537,
4271
+ "grad_norm": 23.890625,
4272
+ "learning_rate": 1.3522909931550144e-07,
4273
+ "loss": 15.42,
4274
+ "step": 29950
4275
+ },
4276
+ {
4277
+ "epoch": 0.8669079560477666,
4278
+ "grad_norm": 20.515625,
4279
+ "learning_rate": 1.3545485741118675e-07,
4280
+ "loss": 15.389,
4281
+ "step": 30000
4282
+ },
4283
+ {
4284
+ "epoch": 0.8669079560477666,
4285
+ "eval_loss": 1.9219062328338623,
4286
+ "eval_runtime": 349.965,
4287
+ "eval_samples_per_second": 2664.65,
4288
+ "eval_steps_per_second": 41.636,
4289
+ "step": 30000
4290
  }
4291
  ],
4292
  "logging_steps": 50,
 
4306
  "attributes": {}
4307
  }
4308
  },
4309
+ "total_flos": 5.238481846114714e+18,
4310
  "train_batch_size": 8,
4311
  "trial_name": null,
4312
  "trial_params": null