Upload 8 files

Browse files

Files changed (8) hide show

README.md +200 -1
adapter_config.json +30 -0
adapter_model.safetensors +3 -0
optimizer.pt +3 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
trainer_state.json +1365 -0
training_args.bin +3 -0

README.md CHANGED Viewed

@@ -1,3 +1,202 @@
 ---
-license: mit
 ---

 ---
+library_name: peft
+base_model: unsloth/llama-3-8b
 ---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** LoRA adapter (PEFT) for causal language modeling
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** unsloth/llama-3-8b
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed before further recommendations can be made.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly. -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.11.1

adapter_config.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "unsloth/llama-3-8b",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 32,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "out_proj",
+    "up_proj"
+  ],
+  "task_type": "CAUSAL_LM",
+  "use_dora": false,
+  "use_rslora": false
+}

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3807584c29359449e0e43f01225588c64c314f597f783509bb53914009922c77
+size 75514264

optimizer.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b5aebf9d35ce79314e0918724462e94a45ffec39322df20455e1f695824451dc
+size 151103098

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2a977537baf79b08794f5b5d5eb700635c0534aff659cb33f857ba03c3c7bc89
+size 14244

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ff3df154d503558a7bf306c75c56049db853b80cdc6fa22e95bde8189c670a68
+size 1064

trainer_state.json ADDED Viewed

	@@ -0,0 +1,1365 @@

+{
+  "best_metric": 0.2809600234031677,
+  "best_model_checkpoint": "./lora_bn_resume/checkpoint-1800",
+  "epoch": 1.157556270096463,
+  "eval_steps": 200,
+  "global_step": 1800,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.006430868167202572,
+      "grad_norm": 0.3332834541797638,
+      "learning_rate": 2.9999999999999997e-05,
+      "loss": 0.643,
+      "step": 10
+    },
+    {
+      "epoch": 0.012861736334405145,
+      "grad_norm": 0.3670434355735779,
+      "learning_rate": 5.9999999999999995e-05,
+      "loss": 0.5391,
+      "step": 20
+    },
+    {
+      "epoch": 0.01929260450160772,
+      "grad_norm": 0.2655458152294159,
+      "learning_rate": 8.999999999999999e-05,
+      "loss": 0.4222,
+      "step": 30
+    },
+    {
+      "epoch": 0.02572347266881029,
+      "grad_norm": 0.18765214085578918,
+      "learning_rate": 0.00011999999999999999,
+      "loss": 0.38,
+      "step": 40
+    },
+    {
+      "epoch": 0.03215434083601286,
+      "grad_norm": 0.17279677093029022,
+      "learning_rate": 0.00015,
+      "loss": 0.3474,
+      "step": 50
+    },
+    {
+      "epoch": 0.03858520900321544,
+      "grad_norm": 0.19536016881465912,
+      "learning_rate": 0.00017999999999999998,
+      "loss": 0.3482,
+      "step": 60
+    },
+    {
+      "epoch": 0.04501607717041801,
+      "grad_norm": 0.24155691266059875,
+      "learning_rate": 0.00020999999999999998,
+      "loss": 0.3519,
+      "step": 70
+    },
+    {
+      "epoch": 0.05144694533762058,
+      "grad_norm": 0.15769515931606293,
+      "learning_rate": 0.00023999999999999998,
+      "loss": 0.3442,
+      "step": 80
+    },
+    {
+      "epoch": 0.05787781350482315,
+      "grad_norm": 0.15895752608776093,
+      "learning_rate": 0.00027,
+      "loss": 0.3419,
+      "step": 90
+    },
+    {
+      "epoch": 0.06430868167202572,
+      "grad_norm": 0.1517348736524582,
+      "learning_rate": 0.0003,
+      "loss": 0.3325,
+      "step": 100
+    },
+    {
+      "epoch": 0.0707395498392283,
+      "grad_norm": 0.1374281644821167,
+      "learning_rate": 0.00029934282584884994,
+      "loss": 0.3364,
+      "step": 110
+    },
+    {
+      "epoch": 0.07717041800643087,
+      "grad_norm": 0.14599083364009857,
+      "learning_rate": 0.00029868565169769985,
+      "loss": 0.3296,
+      "step": 120
+    },
+    {
+      "epoch": 0.08360128617363344,
+      "grad_norm": 0.1735353320837021,
+      "learning_rate": 0.0002980284775465498,
+      "loss": 0.3296,
+      "step": 130
+    },
+    {
+      "epoch": 0.09003215434083602,
+      "grad_norm": 0.16351208090782166,
+      "learning_rate": 0.00029737130339539973,
+      "loss": 0.3359,
+      "step": 140
+    },
+    {
+      "epoch": 0.09646302250803858,
+      "grad_norm": 0.15383951365947723,
+      "learning_rate": 0.0002967141292442497,
+      "loss": 0.3401,
+      "step": 150
+    },
+    {
+      "epoch": 0.10289389067524116,
+      "grad_norm": 0.16704361140727997,
+      "learning_rate": 0.00029605695509309966,
+      "loss": 0.3378,
+      "step": 160
+    },
+    {
+      "epoch": 0.10932475884244373,
+      "grad_norm": 0.14677385985851288,
+      "learning_rate": 0.00029539978094194957,
+      "loss": 0.3404,
+      "step": 170
+    },
+    {
+      "epoch": 0.1157556270096463,
+      "grad_norm": 0.2054668664932251,
+      "learning_rate": 0.00029474260679079954,
+      "loss": 0.3379,
+      "step": 180
+    },
+    {
+      "epoch": 0.12218649517684887,
+      "grad_norm": 0.15476278960704803,
+      "learning_rate": 0.00029408543263964945,
+      "loss": 0.3395,
+      "step": 190
+    },
+    {
+      "epoch": 0.12861736334405144,
+      "grad_norm": 0.1571033000946045,
+      "learning_rate": 0.0002934282584884994,
+      "loss": 0.3318,
+      "step": 200
+    },
+    {
+      "epoch": 0.12861736334405144,
+      "eval_loss": 0.32538482546806335,
+      "eval_runtime": 830.7418,
+      "eval_samples_per_second": 2.407,
+      "eval_steps_per_second": 2.407,
+      "step": 200
+    },
+    {
+      "epoch": 0.13504823151125403,
+      "grad_norm": 0.13506701588630676,
+      "learning_rate": 0.0002927710843373494,
+      "loss": 0.3316,
+      "step": 210
+    },
+    {
+      "epoch": 0.1414790996784566,
+      "grad_norm": 0.16322733461856842,
+      "learning_rate": 0.0002921139101861993,
+      "loss": 0.3415,
+      "step": 220
+    },
+    {
+      "epoch": 0.14790996784565916,
+      "grad_norm": 0.16979773342609406,
+      "learning_rate": 0.00029145673603504926,
+      "loss": 0.3272,
+      "step": 230
+    },
+    {
+      "epoch": 0.15434083601286175,
+      "grad_norm": 0.1605272740125656,
+      "learning_rate": 0.0002907995618838992,
+      "loss": 0.3292,
+      "step": 240
+    },
+    {
+      "epoch": 0.1607717041800643,
+      "grad_norm": 0.16240820288658142,
+      "learning_rate": 0.00029014238773274913,
+      "loss": 0.3256,
+      "step": 250
+    },
+    {
+      "epoch": 0.16720257234726688,
+      "grad_norm": 0.14710214734077454,
+      "learning_rate": 0.0002894852135815991,
+      "loss": 0.3234,
+      "step": 260
+    },
+    {
+      "epoch": 0.17363344051446947,
+      "grad_norm": 0.14770746231079102,
+      "learning_rate": 0.000288828039430449,
+      "loss": 0.3264,
+      "step": 270
+    },
+    {
+      "epoch": 0.18006430868167203,
+      "grad_norm": 0.14894965291023254,
+      "learning_rate": 0.000288170865279299,
+      "loss": 0.3211,
+      "step": 280
+    },
+    {
+      "epoch": 0.1864951768488746,
+      "grad_norm": 0.1738394796848297,
+      "learning_rate": 0.00028751369112814894,
+      "loss": 0.3107,
+      "step": 290
+    },
+    {
+      "epoch": 0.19292604501607716,
+      "grad_norm": 0.15344616770744324,
+      "learning_rate": 0.00028685651697699885,
+      "loss": 0.3126,
+      "step": 300
+    },
+    {
+      "epoch": 0.19935691318327975,
+      "grad_norm": 0.180983766913414,
+      "learning_rate": 0.0002861993428258488,
+      "loss": 0.3142,
+      "step": 310
+    },
+    {
+      "epoch": 0.2057877813504823,
+      "grad_norm": 0.17168308794498444,
+      "learning_rate": 0.00028554216867469873,
+      "loss": 0.3207,
+      "step": 320
+    },
+    {
+      "epoch": 0.21221864951768488,
+      "grad_norm": 0.17984597384929657,
+      "learning_rate": 0.0002848849945235487,
+      "loss": 0.3158,
+      "step": 330
+    },
+    {
+      "epoch": 0.21864951768488747,
+      "grad_norm": 0.15579424798488617,
+      "learning_rate": 0.00028422782037239866,
+      "loss": 0.3133,
+      "step": 340
+    },
+    {
+      "epoch": 0.22508038585209003,
+      "grad_norm": 0.15541236102581024,
+      "learning_rate": 0.0002835706462212486,
+      "loss": 0.3203,
+      "step": 350
+    },
+    {
+      "epoch": 0.2315112540192926,
+      "grad_norm": 0.14372068643569946,
+      "learning_rate": 0.00028291347207009854,
+      "loss": 0.3284,
+      "step": 360
+    },
+    {
+      "epoch": 0.2379421221864952,
+      "grad_norm": 0.16621021926403046,
+      "learning_rate": 0.0002822562979189485,
+      "loss": 0.3218,
+      "step": 370
+    },
+    {
+      "epoch": 0.24437299035369775,
+      "grad_norm": 0.1540420651435852,
+      "learning_rate": 0.0002815991237677984,
+      "loss": 0.3117,
+      "step": 380
+    },
+    {
+      "epoch": 0.2508038585209003,
+      "grad_norm": 0.1495533287525177,
+      "learning_rate": 0.0002809419496166484,
+      "loss": 0.318,
+      "step": 390
+    },
+    {
+      "epoch": 0.2572347266881029,
+      "grad_norm": 0.3620932400226593,
+      "learning_rate": 0.00028028477546549835,
+      "loss": 0.3241,
+      "step": 400
+    },
+    {
+      "epoch": 0.2572347266881029,
+      "eval_loss": 0.3119981288909912,
+      "eval_runtime": 842.192,
+      "eval_samples_per_second": 2.375,
+      "eval_steps_per_second": 2.375,
+      "step": 400
+    },
+    {
+      "epoch": 0.26366559485530544,
+      "grad_norm": 0.14988340437412262,
+      "learning_rate": 0.00027962760131434826,
+      "loss": 0.3132,
+      "step": 410
+    },
+    {
+      "epoch": 0.27009646302250806,
+      "grad_norm": 0.14897547662258148,
+      "learning_rate": 0.0002789704271631982,
+      "loss": 0.3159,
+      "step": 420
+    },
+    {
+      "epoch": 0.2765273311897106,
+      "grad_norm": 0.1518355906009674,
+      "learning_rate": 0.0002783132530120482,
+      "loss": 0.3131,
+      "step": 430
+    },
+    {
+      "epoch": 0.2829581993569132,
+      "grad_norm": 0.1667843610048294,
+      "learning_rate": 0.0002776560788608981,
+      "loss": 0.3176,
+      "step": 440
+    },
+    {
+      "epoch": 0.28938906752411575,
+      "grad_norm": 0.1573045551776886,
+      "learning_rate": 0.00027699890470974807,
+      "loss": 0.3063,
+      "step": 450
+    },
+    {
+      "epoch": 0.2958199356913183,
+      "grad_norm": 0.2083781659603119,
+      "learning_rate": 0.000276341730558598,
+      "loss": 0.3121,
+      "step": 460
+    },
+    {
+      "epoch": 0.3022508038585209,
+      "grad_norm": 0.15114179253578186,
+      "learning_rate": 0.00027568455640744795,
+      "loss": 0.3115,
+      "step": 470
+    },
+    {
+      "epoch": 0.3086816720257235,
+      "grad_norm": 0.1676609367132187,
+      "learning_rate": 0.0002750273822562979,
+      "loss": 0.3102,
+      "step": 480
+    },
+    {
+      "epoch": 0.31511254019292606,
+      "grad_norm": 0.1511840969324112,
+      "learning_rate": 0.0002743702081051478,
+      "loss": 0.2958,
+      "step": 490
+    },
+    {
+      "epoch": 0.3215434083601286,
+      "grad_norm": 0.14467953145503998,
+      "learning_rate": 0.0002737130339539978,
+      "loss": 0.3141,
+      "step": 500
+    },
+    {
+      "epoch": 0.3279742765273312,
+      "grad_norm": 0.14991098642349243,
+      "learning_rate": 0.00027305585980284776,
+      "loss": 0.2971,
+      "step": 510
+    },
+    {
+      "epoch": 0.33440514469453375,
+      "grad_norm": 0.15232595801353455,
+      "learning_rate": 0.00027239868565169767,
+      "loss": 0.309,
+      "step": 520
+    },
+    {
+      "epoch": 0.3408360128617363,
+      "grad_norm": 0.14672474563121796,
+      "learning_rate": 0.00027174151150054763,
+      "loss": 0.303,
+      "step": 530
+    },
+    {
+      "epoch": 0.34726688102893893,
+      "grad_norm": 0.1486695259809494,
+      "learning_rate": 0.0002710843373493976,
+      "loss": 0.3049,
+      "step": 540
+    },
+    {
+      "epoch": 0.3536977491961415,
+      "grad_norm": 0.14715538918972015,
+      "learning_rate": 0.0002704271631982475,
+      "loss": 0.2989,
+      "step": 550
+    },
+    {
+      "epoch": 0.36012861736334406,
+      "grad_norm": 0.16407349705696106,
+      "learning_rate": 0.0002697699890470975,
+      "loss": 0.3096,
+      "step": 560
+    },
+    {
+      "epoch": 0.3665594855305466,
+      "grad_norm": 0.17212547361850739,
+      "learning_rate": 0.00026911281489594744,
+      "loss": 0.3071,
+      "step": 570
+    },
+    {
+      "epoch": 0.3729903536977492,
+      "grad_norm": 0.17516419291496277,
+      "learning_rate": 0.00026845564074479735,
+      "loss": 0.3012,
+      "step": 580
+    },
+    {
+      "epoch": 0.37942122186495175,
+      "grad_norm": 0.16375690698623657,
+      "learning_rate": 0.0002677984665936473,
+      "loss": 0.3001,
+      "step": 590
+    },
+    {
+      "epoch": 0.3858520900321543,
+      "grad_norm": 0.16078205406665802,
+      "learning_rate": 0.00026714129244249723,
+      "loss": 0.3033,
+      "step": 600
+    },
+    {
+      "epoch": 0.3858520900321543,
+      "eval_loss": 0.30340561270713806,
+      "eval_runtime": 843.3627,
+      "eval_samples_per_second": 2.371,
+      "eval_steps_per_second": 2.371,
+      "step": 600
+    },
+    {
+      "epoch": 0.39228295819935693,
+      "grad_norm": 0.16159534454345703,
+      "learning_rate": 0.0002664841182913472,
+      "loss": 0.3094,
+      "step": 610
+    },
+    {
+      "epoch": 0.3987138263665595,
+      "grad_norm": 0.16596971452236176,
+      "learning_rate": 0.00026582694414019716,
+      "loss": 0.3006,
+      "step": 620
+    },
+    {
+      "epoch": 0.40514469453376206,
+      "grad_norm": 0.1850331723690033,
+      "learning_rate": 0.00026516976998904707,
+      "loss": 0.3071,
+      "step": 630
+    },
+    {
+      "epoch": 0.4115755627009646,
+      "grad_norm": 0.16145645081996918,
+      "learning_rate": 0.00026451259583789704,
+      "loss": 0.3045,
+      "step": 640
+    },
+    {
+      "epoch": 0.4180064308681672,
+      "grad_norm": 0.1462334543466568,
+      "learning_rate": 0.00026385542168674695,
+      "loss": 0.3127,
+      "step": 650
+    },
+    {
+      "epoch": 0.42443729903536975,
+      "grad_norm": 0.1508970707654953,
+      "learning_rate": 0.0002631982475355969,
+      "loss": 0.3047,
+      "step": 660
+    },
+    {
+      "epoch": 0.43086816720257237,
+      "grad_norm": 0.17199252545833588,
+      "learning_rate": 0.0002625410733844469,
+      "loss": 0.304,
+      "step": 670
+    },
+    {
+      "epoch": 0.43729903536977494,
+      "grad_norm": 0.15578508377075195,
+      "learning_rate": 0.0002618838992332968,
+      "loss": 0.3049,
+      "step": 680
+    },
+    {
+      "epoch": 0.4437299035369775,
+      "grad_norm": 0.17518927156925201,
+      "learning_rate": 0.00026122672508214676,
+      "loss": 0.2975,
+      "step": 690
+    },
+    {
+      "epoch": 0.45016077170418006,
+      "grad_norm": 0.16074487566947937,
+      "learning_rate": 0.0002605695509309967,
+      "loss": 0.2911,
+      "step": 700
+    },
+    {
+      "epoch": 0.4565916398713826,
+      "grad_norm": 0.13804423809051514,
+      "learning_rate": 0.00025991237677984664,
+      "loss": 0.3013,
+      "step": 710
+    },
+    {
+      "epoch": 0.4630225080385852,
+      "grad_norm": 0.15148524940013885,
+      "learning_rate": 0.0002592552026286966,
+      "loss": 0.302,
+      "step": 720
+    },
+    {
+      "epoch": 0.4694533762057878,
+      "grad_norm": 0.18637600541114807,
+      "learning_rate": 0.0002585980284775465,
+      "loss": 0.2945,
+      "step": 730
+    },
+    {
+      "epoch": 0.4758842443729904,
+      "grad_norm": 0.16151027381420135,
+      "learning_rate": 0.0002579408543263965,
+      "loss": 0.2991,
+      "step": 740
+    },
+    {
+      "epoch": 0.48231511254019294,
+      "grad_norm": 0.14255981147289276,
+      "learning_rate": 0.00025728368017524644,
+      "loss": 0.3166,
+      "step": 750
+    },
+    {
+      "epoch": 0.4887459807073955,
+      "grad_norm": 0.14939677715301514,
+      "learning_rate": 0.00025662650602409636,
+      "loss": 0.296,
+      "step": 760
+    },
+    {
+      "epoch": 0.49517684887459806,
+      "grad_norm": 0.15057405829429626,
+      "learning_rate": 0.0002559693318729463,
+      "loss": 0.2827,
+      "step": 770
+    },
+    {
+      "epoch": 0.5016077170418006,
+      "grad_norm": 0.162841796875,
+      "learning_rate": 0.00025531215772179623,
+      "loss": 0.301,
+      "step": 780
+    },
+    {
+      "epoch": 0.5080385852090032,
+      "grad_norm": 0.13532967865467072,
+      "learning_rate": 0.0002546549835706462,
+      "loss": 0.2915,
+      "step": 790
+    },
+    {
+      "epoch": 0.5144694533762058,
+      "grad_norm": 0.14414694905281067,
+      "learning_rate": 0.00025399780941949616,
+      "loss": 0.2963,
+      "step": 800
+    },
+    {
+      "epoch": 0.5144694533762058,
+      "eval_loss": 0.2967182695865631,
+      "eval_runtime": 836.6673,
+      "eval_samples_per_second": 2.39,
+      "eval_steps_per_second": 2.39,
+      "step": 800
+    },
+    {
+      "epoch": 0.5209003215434084,
+      "grad_norm": 0.1773417443037033,
+      "learning_rate": 0.0002533406352683461,
+      "loss": 0.2983,
+      "step": 810
+    },
+    {
+      "epoch": 0.5273311897106109,
+      "grad_norm": 0.15203996002674103,
+      "learning_rate": 0.00025268346111719604,
+      "loss": 0.3116,
+      "step": 820
+    },
+    {
+      "epoch": 0.5337620578778135,
+      "grad_norm": 0.14001955091953278,
+      "learning_rate": 0.00025202628696604595,
+      "loss": 0.2895,
+      "step": 830
+    },
+    {
+      "epoch": 0.5401929260450161,
+      "grad_norm": 0.14643649756908417,
+      "learning_rate": 0.0002513691128148959,
+      "loss": 0.2923,
+      "step": 840
+    },
+    {
+      "epoch": 0.5466237942122186,
+      "grad_norm": 0.1428484320640564,
+      "learning_rate": 0.0002507119386637459,
+      "loss": 0.3001,
+      "step": 850
+    },
+    {
+      "epoch": 0.5530546623794212,
+      "grad_norm": 0.1566481739282608,
+      "learning_rate": 0.0002500547645125958,
+      "loss": 0.3019,
+      "step": 860
+    },
+    {
+      "epoch": 0.5594855305466238,
+      "grad_norm": 0.15484847128391266,
+      "learning_rate": 0.00024939759036144576,
+      "loss": 0.2952,
+      "step": 870
+    },
+    {
+      "epoch": 0.5659163987138264,
+      "grad_norm": 0.16524390876293182,
+      "learning_rate": 0.00024874041621029573,
+      "loss": 0.2956,
+      "step": 880
+    },
+    {
+      "epoch": 0.572347266881029,
+      "grad_norm": 0.16674397885799408,
+      "learning_rate": 0.00024808324205914564,
+      "loss": 0.2917,
+      "step": 890
+    },
+    {
+      "epoch": 0.5787781350482315,
+      "grad_norm": 0.15178845822811127,
+      "learning_rate": 0.0002474260679079956,
+      "loss": 0.2999,
+      "step": 900
+    },
+    {
+      "epoch": 0.5852090032154341,
+      "grad_norm": 0.15306776762008667,
+      "learning_rate": 0.0002467688937568455,
+      "loss": 0.2909,
+      "step": 910
+    },
+    {
+      "epoch": 0.5916398713826366,
+      "grad_norm": 0.16369566321372986,
+      "learning_rate": 0.0002461117196056955,
+      "loss": 0.3041,
+      "step": 920
+    },
+    {
+      "epoch": 0.5980707395498392,
+      "grad_norm": 0.15131248533725739,
+      "learning_rate": 0.00024545454545454545,
+      "loss": 0.2948,
+      "step": 930
+    },
+    {
+      "epoch": 0.6045016077170418,
+      "grad_norm": 0.1472727656364441,
+      "learning_rate": 0.00024479737130339536,
+      "loss": 0.2926,
+      "step": 940
+    },
+    {
+      "epoch": 0.6109324758842444,
+      "grad_norm": 0.17377722263336182,
+      "learning_rate": 0.00024414019715224533,
+      "loss": 0.298,
+      "step": 950
+    },
+    {
+      "epoch": 0.617363344051447,
+      "grad_norm": 0.1630556285381317,
+      "learning_rate": 0.00024348302300109526,
+      "loss": 0.2921,
+      "step": 960
+    },
+    {
+      "epoch": 0.6237942122186495,
+      "grad_norm": 0.1495707631111145,
+      "learning_rate": 0.0002428258488499452,
+      "loss": 0.3057,
+      "step": 970
+    },
+    {
+      "epoch": 0.6302250803858521,
+      "grad_norm": 0.14349451661109924,
+      "learning_rate": 0.00024216867469879517,
+      "loss": 0.2978,
+      "step": 980
+    },
+    {
+      "epoch": 0.6366559485530546,
+      "grad_norm": 0.15608881413936615,
+      "learning_rate": 0.0002415115005476451,
+      "loss": 0.2888,
+      "step": 990
+    },
+    {
+      "epoch": 0.6430868167202572,
+      "grad_norm": 0.15335732698440552,
+      "learning_rate": 0.00024085432639649505,
+      "loss": 0.2968,
+      "step": 1000
+    },
+    {
+      "epoch": 0.6430868167202572,
+      "eval_loss": 0.2915716767311096,
+      "eval_runtime": 833.5453,
+      "eval_samples_per_second": 2.399,
+      "eval_steps_per_second": 2.399,
+      "step": 1000
+    },
+    {
+      "epoch": 0.6495176848874598,
+      "grad_norm": 0.17396800220012665,
+      "learning_rate": 0.00024019715224534498,
+      "loss": 0.285,
+      "step": 1010
+    },
+    {
+      "epoch": 0.6559485530546624,
+      "grad_norm": 0.15287892520427704,
+      "learning_rate": 0.00023953997809419495,
+      "loss": 0.2992,
+      "step": 1020
+    },
+    {
+      "epoch": 0.662379421221865,
+      "grad_norm": 0.15309476852416992,
+      "learning_rate": 0.0002388828039430449,
+      "loss": 0.2927,
+      "step": 1030
+    },
+    {
+      "epoch": 0.6688102893890675,
+      "grad_norm": 0.17866992950439453,
+      "learning_rate": 0.00023822562979189483,
+      "loss": 0.291,
+      "step": 1040
+    },
+    {
+      "epoch": 0.6752411575562701,
+      "grad_norm": 0.1827457696199417,
+      "learning_rate": 0.0002375684556407448,
+      "loss": 0.289,
+      "step": 1050
+    },
+    {
+      "epoch": 0.6816720257234726,
+      "grad_norm": 0.15376009047031403,
+      "learning_rate": 0.0002369112814895947,
+      "loss": 0.2916,
+      "step": 1060
+    },
+    {
+      "epoch": 0.6881028938906752,
+      "grad_norm": 0.16380611062049866,
+      "learning_rate": 0.00023625410733844467,
+      "loss": 0.2894,
+      "step": 1070
+    },
+    {
+      "epoch": 0.6945337620578779,
+      "grad_norm": 0.1561112254858017,
+      "learning_rate": 0.00023559693318729464,
+      "loss": 0.2895,
+      "step": 1080
+    },
+    {
+      "epoch": 0.7009646302250804,
+      "grad_norm": 0.14859697222709656,
+      "learning_rate": 0.00023493975903614455,
+      "loss": 0.2889,
+      "step": 1090
+    },
+    {
+      "epoch": 0.707395498392283,
+      "grad_norm": 0.15627552568912506,
+      "learning_rate": 0.0002342825848849945,
+      "loss": 0.2889,
+      "step": 1100
+    },
+    {
+      "epoch": 0.7138263665594855,
+      "grad_norm": 0.16106264293193817,
+      "learning_rate": 0.00023362541073384445,
+      "loss": 0.2858,
+      "step": 1110
+    },
+    {
+      "epoch": 0.7202572347266881,
+      "grad_norm": 0.1667865663766861,
+      "learning_rate": 0.0002329682365826944,
+      "loss": 0.289,
+      "step": 1120
+    },
+    {
+      "epoch": 0.7266881028938906,
+      "grad_norm": 0.17265114188194275,
+      "learning_rate": 0.00023231106243154436,
+      "loss": 0.2888,
+      "step": 1130
+    },
+    {
+      "epoch": 0.7331189710610932,
+      "grad_norm": 0.16795796155929565,
+      "learning_rate": 0.00023165388828039427,
+      "loss": 0.2824,
+      "step": 1140
+    },
+    {
+      "epoch": 0.7395498392282959,
+      "grad_norm": 0.144576758146286,
+      "learning_rate": 0.00023099671412924423,
+      "loss": 0.2896,
+      "step": 1150
+    },
+    {
+      "epoch": 0.7459807073954984,
+      "grad_norm": 0.15994594991207123,
+      "learning_rate": 0.0002303395399780942,
+      "loss": 0.2969,
+      "step": 1160
+    },
+    {
+      "epoch": 0.752411575562701,
+      "grad_norm": 0.1481965035200119,
+      "learning_rate": 0.0002296823658269441,
+      "loss": 0.2832,
+      "step": 1170
+    },
+    {
+      "epoch": 0.7588424437299035,
+      "grad_norm": 0.143247589468956,
+      "learning_rate": 0.00022902519167579408,
+      "loss": 0.293,
+      "step": 1180
+    },
+    {
+      "epoch": 0.7652733118971061,
+      "grad_norm": 0.17525409162044525,
+      "learning_rate": 0.000228368017524644,
+      "loss": 0.282,
+      "step": 1190
+    },
+    {
+      "epoch": 0.7717041800643086,
+      "grad_norm": 0.16322872042655945,
+      "learning_rate": 0.00022771084337349395,
+      "loss": 0.2888,
+      "step": 1200
+    },
+    {
+      "epoch": 0.7717041800643086,
+      "eval_loss": 0.2870965301990509,
+      "eval_runtime": 837.2115,
+      "eval_samples_per_second": 2.389,
+      "eval_steps_per_second": 2.389,
+      "step": 1200
+    },
+    {
+      "epoch": 0.7781350482315113,
+      "grad_norm": 0.1486148089170456,
+      "learning_rate": 0.00022705366922234392,
+      "loss": 0.2648,
+      "step": 1210
+    },
+    {
+      "epoch": 0.7845659163987139,
+      "grad_norm": 0.1699255108833313,
+      "learning_rate": 0.00022639649507119383,
+      "loss": 0.2781,
+      "step": 1220
+    },
+    {
+      "epoch": 0.7909967845659164,
+      "grad_norm": 0.17535263299942017,
+      "learning_rate": 0.0002257393209200438,
+      "loss": 0.258,
+      "step": 1230
+    },
+    {
+      "epoch": 0.797427652733119,
+      "grad_norm": 0.1789318472146988,
+      "learning_rate": 0.00022508214676889373,
+      "loss": 0.2704,
+      "step": 1240
+    },
+    {
+      "epoch": 0.8038585209003215,
+      "grad_norm": 0.1452336609363556,
+      "learning_rate": 0.00022442497261774367,
+      "loss": 0.2644,
+      "step": 1250
+    },
+    {
+      "epoch": 0.8102893890675241,
+      "grad_norm": 0.14961348474025726,
+      "learning_rate": 0.00022376779846659364,
+      "loss": 0.2761,
+      "step": 1260
+    },
+    {
+      "epoch": 0.8167202572347267,
+      "grad_norm": 0.14857113361358643,
+      "learning_rate": 0.00022311062431544358,
+      "loss": 0.2693,
+      "step": 1270
+    },
+    {
+      "epoch": 0.8231511254019293,
+      "grad_norm": 0.172617107629776,
+      "learning_rate": 0.00022245345016429352,
+      "loss": 0.2668,
+      "step": 1280
+    },
+    {
+      "epoch": 0.8295819935691319,
+      "grad_norm": 0.15187814831733704,
+      "learning_rate": 0.00022179627601314345,
+      "loss": 0.2612,
+      "step": 1290
+    },
+    {
+      "epoch": 0.8360128617363344,
+      "grad_norm": 0.15238122642040253,
+      "learning_rate": 0.0002211391018619934,
+      "loss": 0.281,
+      "step": 1300
+    },
+    {
+      "epoch": 0.842443729903537,
+      "grad_norm": 0.15372508764266968,
+      "learning_rate": 0.00022048192771084336,
+      "loss": 0.268,
+      "step": 1310
+    },
+    {
+      "epoch": 0.8488745980707395,
+      "grad_norm": 0.17595216631889343,
+      "learning_rate": 0.0002198247535596933,
+      "loss": 0.2708,
+      "step": 1320
+    },
+    {
+      "epoch": 0.8553054662379421,
+      "grad_norm": 0.17156840860843658,
+      "learning_rate": 0.00021916757940854324,
+      "loss": 0.2781,
+      "step": 1330
+    },
+    {
+      "epoch": 0.8617363344051447,
+      "grad_norm": 0.15553158521652222,
+      "learning_rate": 0.0002185104052573932,
+      "loss": 0.2691,
+      "step": 1340
+    },
+    {
+      "epoch": 0.8681672025723473,
+      "grad_norm": 0.15347564220428467,
+      "learning_rate": 0.00021785323110624314,
+      "loss": 0.2677,
+      "step": 1350
+    },
+    {
+      "epoch": 0.8745980707395499,
+      "grad_norm": 0.15675435960292816,
+      "learning_rate": 0.00021719605695509308,
+      "loss": 0.2745,
+      "step": 1360
+    },
+    {
+      "epoch": 0.8810289389067524,
+      "grad_norm": 0.17261667549610138,
+      "learning_rate": 0.00021653888280394302,
+      "loss": 0.2738,
+      "step": 1370
+    },
+    {
+      "epoch": 0.887459807073955,
+      "grad_norm": 0.17966938018798828,
+      "learning_rate": 0.00021588170865279298,
+      "loss": 0.2626,
+      "step": 1380
+    },
+    {
+      "epoch": 0.8938906752411575,
+      "grad_norm": 0.1592021882534027,
+      "learning_rate": 0.00021522453450164292,
+      "loss": 0.2714,
+      "step": 1390
+    },
+    {
+      "epoch": 0.9003215434083601,
+      "grad_norm": 0.203449547290802,
+      "learning_rate": 0.00021456736035049286,
+      "loss": 0.2677,
+      "step": 1400
+    },
+    {
+      "epoch": 0.9003215434083601,
+      "eval_loss": 0.28547272086143494,
+      "eval_runtime": 841.1547,
+      "eval_samples_per_second": 2.378,
+      "eval_steps_per_second": 2.378,
+      "step": 1400
+    },
+    {
+      "epoch": 0.9067524115755627,
+      "grad_norm": 0.15531207621097565,
+      "learning_rate": 0.00021391018619934283,
+      "loss": 0.27,
+      "step": 1410
+    },
+    {
+      "epoch": 0.9131832797427653,
+      "grad_norm": 0.33703845739364624,
+      "learning_rate": 0.00021325301204819274,
+      "loss": 0.2748,
+      "step": 1420
+    },
+    {
+      "epoch": 0.9196141479099679,
+      "grad_norm": 0.20089037716388702,
+      "learning_rate": 0.0002125958378970427,
+      "loss": 0.2685,
+      "step": 1430
+    },
+    {
+      "epoch": 0.9260450160771704,
+      "grad_norm": 0.1531943380832672,
+      "learning_rate": 0.00021193866374589267,
+      "loss": 0.2729,
+      "step": 1440
+    },
+    {
+      "epoch": 0.932475884244373,
+      "grad_norm": 0.15565109252929688,
+      "learning_rate": 0.00021128148959474258,
+      "loss": 0.2681,
+      "step": 1450
+    },
+    {
+      "epoch": 0.9389067524115756,
+      "grad_norm": 0.18514017760753632,
+      "learning_rate": 0.00021062431544359255,
+      "loss": 0.2761,
+      "step": 1460
+    },
+    {
+      "epoch": 0.9453376205787781,
+      "grad_norm": 0.16498805582523346,
+      "learning_rate": 0.00020996714129244246,
+      "loss": 0.2723,
+      "step": 1470
+    },
+    {
+      "epoch": 0.9517684887459807,
+      "grad_norm": 0.16682648658752441,
+      "learning_rate": 0.00020930996714129242,
+      "loss": 0.2679,
+      "step": 1480
+    },
+    {
+      "epoch": 0.9581993569131833,
+      "grad_norm": 0.16485853493213654,
+      "learning_rate": 0.0002086527929901424,
+      "loss": 0.2649,
+      "step": 1490
+    },
+    {
+      "epoch": 0.9646302250803859,
+      "grad_norm": 0.28759464621543884,
+      "learning_rate": 0.0002079956188389923,
+      "loss": 0.2761,
+      "step": 1500
+    },
+    {
+      "epoch": 0.9710610932475884,
+      "grad_norm": 0.16720062494277954,
+      "learning_rate": 0.00020733844468784227,
+      "loss": 0.2622,
+      "step": 1510
+    },
+    {
+      "epoch": 0.977491961414791,
+      "grad_norm": 0.15708176791667938,
+      "learning_rate": 0.00020668127053669218,
+      "loss": 0.2785,
+      "step": 1520
+    },
+    {
+      "epoch": 0.9839228295819936,
+      "grad_norm": 0.14977172017097473,
+      "learning_rate": 0.00020602409638554214,
+      "loss": 0.2647,
+      "step": 1530
+    },
+    {
+      "epoch": 0.9903536977491961,
+      "grad_norm": 0.15259595215320587,
+      "learning_rate": 0.0002053669222343921,
+      "loss": 0.2612,
+      "step": 1540
+    },
+    {
+      "epoch": 0.9967845659163987,
+      "grad_norm": 0.16366101801395416,
+      "learning_rate": 0.00020470974808324202,
+      "loss": 0.2717,
+      "step": 1550
+    },
+    {
+      "epoch": 1.0032154340836013,
+      "grad_norm": 0.1795603334903717,
+      "learning_rate": 0.000204052573932092,
+      "loss": 0.267,
+      "step": 1560
+    },
+    {
+      "epoch": 1.0096463022508038,
+      "grad_norm": 0.17623689770698547,
+      "learning_rate": 0.00020339539978094195,
+      "loss": 0.252,
+      "step": 1570
+    },
+    {
+      "epoch": 1.0160771704180065,
+      "grad_norm": 0.15394379198551178,
+      "learning_rate": 0.00020273822562979186,
+      "loss": 0.2685,
+      "step": 1580
+    },
+    {
+      "epoch": 1.022508038585209,
+      "grad_norm": 0.16083110868930817,
+      "learning_rate": 0.00020208105147864183,
+      "loss": 0.2573,
+      "step": 1590
+    },
+    {
+      "epoch": 1.0289389067524115,
+      "grad_norm": 0.1885363757610321,
+      "learning_rate": 0.00020142387732749177,
+      "loss": 0.2592,
+      "step": 1600
+    },
+    {
+      "epoch": 1.0289389067524115,
+      "eval_loss": 0.2842000424861908,
+      "eval_runtime": 845.1823,
+      "eval_samples_per_second": 2.366,
+      "eval_steps_per_second": 2.366,
+      "step": 1600
+    },
+    {
+      "epoch": 1.0353697749196142,
+      "grad_norm": 0.1810334175825119,
+      "learning_rate": 0.0002007667031763417,
+      "loss": 0.272,
+      "step": 1610
+    },
+    {
+      "epoch": 1.0418006430868167,
+      "grad_norm": 0.17008474469184875,
+      "learning_rate": 0.00020010952902519167,
+      "loss": 0.2516,
+      "step": 1620
+    },
+    {
+      "epoch": 1.0482315112540193,
+      "grad_norm": 0.1848730593919754,
+      "learning_rate": 0.00019945235487404158,
+      "loss": 0.2627,
+      "step": 1630
+    },
+    {
+      "epoch": 1.0546623794212218,
+      "grad_norm": 0.16504080593585968,
+      "learning_rate": 0.00019879518072289155,
+      "loss": 0.256,
+      "step": 1640
+    },
+    {
+      "epoch": 1.0610932475884245,
+      "grad_norm": 0.166968435049057,
+      "learning_rate": 0.0001981380065717415,
+      "loss": 0.269,
+      "step": 1650
+    },
+    {
+      "epoch": 1.067524115755627,
+      "grad_norm": 0.20997032523155212,
+      "learning_rate": 0.00019748083242059143,
+      "loss": 0.2641,
+      "step": 1660
+    },
+    {
+      "epoch": 1.0739549839228295,
+      "grad_norm": 0.18889103829860687,
+      "learning_rate": 0.0001968236582694414,
+      "loss": 0.2499,
+      "step": 1670
+    },
+    {
+      "epoch": 1.0803858520900322,
+      "grad_norm": 0.15986306965351105,
+      "learning_rate": 0.00019616648411829133,
+      "loss": 0.2659,
+      "step": 1680
+    },
+    {
+      "epoch": 1.0868167202572347,
+      "grad_norm": 0.15300147235393524,
+      "learning_rate": 0.00019550930996714127,
+      "loss": 0.2649,
+      "step": 1690
+    },
+    {
+      "epoch": 1.0932475884244373,
+      "grad_norm": 0.16770875453948975,
+      "learning_rate": 0.0001948521358159912,
+      "loss": 0.265,
+      "step": 1700
+    },
+    {
+      "epoch": 1.09967845659164,
+      "grad_norm": 0.15347127616405487,
+      "learning_rate": 0.00019419496166484117,
+      "loss": 0.2629,
+      "step": 1710
+    },
+    {
+      "epoch": 1.1061093247588425,
+      "grad_norm": 0.1624882072210312,
+      "learning_rate": 0.0001935377875136911,
+      "loss": 0.2651,
+      "step": 1720
+    },
+    {
+      "epoch": 1.112540192926045,
+      "grad_norm": 0.17137973010540009,
+      "learning_rate": 0.00019288061336254105,
+      "loss": 0.2564,
+      "step": 1730
+    },
+    {
+      "epoch": 1.1189710610932475,
+      "grad_norm": 0.16637948155403137,
+      "learning_rate": 0.00019222343921139102,
+      "loss": 0.2642,
+      "step": 1740
+    },
+    {
+      "epoch": 1.1254019292604502,
+      "grad_norm": 0.1559745818376541,
+      "learning_rate": 0.00019156626506024093,
+      "loss": 0.2679,
+      "step": 1750
+    },
+    {
+      "epoch": 1.1318327974276527,
+      "grad_norm": 0.1712270975112915,
+      "learning_rate": 0.0001909090909090909,
+      "loss": 0.2611,
+      "step": 1760
+    },
+    {
+      "epoch": 1.1382636655948553,
+      "grad_norm": 0.18316815793514252,
+      "learning_rate": 0.00019025191675794086,
+      "loss": 0.2661,
+      "step": 1770
+    },
+    {
+      "epoch": 1.144694533762058,
+      "grad_norm": 0.16947688162326813,
+      "learning_rate": 0.00018959474260679077,
+      "loss": 0.2635,
+      "step": 1780
+    },
+    {
+      "epoch": 1.1511254019292605,
+      "grad_norm": 0.16996344923973083,
+      "learning_rate": 0.00018893756845564074,
+      "loss": 0.2612,
+      "step": 1790
+    },
+    {
+      "epoch": 1.157556270096463,
+      "grad_norm": 0.16218321025371552,
+      "learning_rate": 0.00018828039430449068,
+      "loss": 0.2614,
+      "step": 1800
+    },
+    {
+      "epoch": 1.157556270096463,
+      "eval_loss": 0.2809600234031677,
+      "eval_runtime": 829.6019,
+      "eval_samples_per_second": 2.411,
+      "eval_steps_per_second": 2.411,
+      "step": 1800
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 4665,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 3,
+  "save_steps": 200,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": false
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 2.64707486237098e+18,
+  "train_batch_size": 1,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:29b7e03fe0a8c61c30c12490f2820039fd437f04fc4a9e0f5deb6ad1472e680f
+size 5048