theekshana committed
Commit 8c47343 · verified · 1 Parent(s): 29d22f6

End of training

README.md ADDED
@@ -0,0 +1,58 @@
+ ---
+ base_model: led-large-annual-report-QLoRA-fine-tuned-v0.9-merged
+ library_name: peft
+ tags:
+ - generated_from_trainer
+ model-index:
+ - name: led-large-annual-report-QLoRA-fine-tuned-v0.9.7-openai
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # led-large-annual-report-QLoRA-fine-tuned-v0.9.7-openai
+
+ This model is a fine-tuned version of led-large-annual-report-QLoRA-fine-tuned-v0.9-merged on an unknown dataset.
+ It achieves the following results on the evaluation set:
+ - eval_loss: 1.5254
+ - eval_runtime: 168.5711
+ - eval_samples_per_second: 0.308
+ - eval_steps_per_second: 0.154
+ - epoch: 2.028
+ - step: 1014
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 0.0002
+ - train_batch_size: 2
+ - eval_batch_size: 2
+ - seed: 42
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: linear
+ - lr_scheduler_warmup_steps: 2
+ - num_epochs: 3
+ - mixed_precision_training: Native AMP
+
+ ### Framework versions
+
+ - PEFT 0.10.0
+ - Transformers 4.44.2
+ - Pytorch 2.4.1+cu124
+ - Datasets 2.14.5
+ - Tokenizers 0.19.1
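
The card above names the merged base model and the PEFT/Transformers versions used for training. As a minimal sketch (not part of this commit), the adapter in this repository could be attached to that base roughly as follows; both identifiers are taken from the YAML header and may need to be replaced with local paths:

```python
# Hedged sketch: load the merged base model and apply this LoRA adapter on top.
# The two identifiers below come from the model card; adjust them to wherever
# the artifacts actually live.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from peft import PeftModel

base_id = "led-large-annual-report-QLoRA-fine-tuned-v0.9-merged"       # base_model in the card
adapter_id = "led-large-annual-report-QLoRA-fine-tuned-v0.9.7-openai"  # this repository

tokenizer = AutoTokenizer.from_pretrained(adapter_id)
model = AutoModelForSeq2SeqLM.from_pretrained(base_id)
model = PeftModel.from_pretrained(model, adapter_id)  # loads adapter_model.safetensors
model.eval()
```
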
adapter_config.json ADDED
@@ -0,0 +1,31 @@
+ {
+   "alpha_pattern": {},
+   "auto_mapping": null,
+   "base_model_name_or_path": "led-large-annual-report-QLoRA-fine-tuned-v0.9-merged",
+   "bias": "none",
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_lora_weights": true,
+   "layer_replication": null,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "loftq_config": {},
+   "lora_alpha": 128,
+   "lora_dropout": 0.05,
+   "megatron_config": null,
+   "megatron_core": "megatron.core",
+   "modules_to_save": null,
+   "peft_type": "LORA",
+   "r": 256,
+   "rank_pattern": {},
+   "revision": null,
+   "target_modules": [
+     "q_proj",
+     "k_proj",
+     "v_proj",
+     "dense"
+   ],
+   "task_type": "SEQ_2_SEQ_LM",
+   "use_dora": false,
+   "use_rslora": false
+ }
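
For readers who prefer code to JSON, the same adapter settings can be expressed as a `LoraConfig` in PEFT 0.10.0; this is purely illustrative, with every value copied from the file above:

```python
# Illustrative equivalent of adapter_config.json (values copied verbatim).
from peft import LoraConfig

lora_config = LoraConfig(
    r=256,
    lora_alpha=128,
    lora_dropout=0.05,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "dense"],
    task_type="SEQ_2_SEQ_LM",
)
```
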
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:32550fb50b1c5fab82c2886a4bbf30da70fde235986865bb1ab1caa8222ed14f
+ size 151015600
added_tokens.json ADDED
@@ -0,0 +1,4 @@
+ {
+   "<|im_end|>": 50266,
+   "<|im_start|>": 50265
+ }
logs/events.out.tfevents.1726084170.ANUVATHAN-IO.25620.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:7a8909d585741a2da9ca9a530cf1b2ce4338fc40d4cd2185dd11c26beff30314
+ size 60177
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
special_tokens_map.json ADDED
@@ -0,0 +1,49 @@
+ {
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>"
+   ],
+   "bos_token": {
+     "content": "<|im_start|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "cls_token": {
+     "content": "<s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "eos_token": {
+     "content": "<|im_end|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "<mask>",
+     "lstrip": true,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": "<|im_end|>",
+   "sep_token": {
+     "content": "</s>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "<unk>",
+     "lstrip": false,
+     "normalized": true,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
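
Together with added_tokens.json above, this map redefines the ChatML-style markers as the tokenizer's bos/eos/pad tokens. A quick, hedged sanity check (the repository path is a placeholder) would look like:

```python
# Check that the special tokens resolve to the IDs declared in added_tokens.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("led-large-annual-report-QLoRA-fine-tuned-v0.9.7-openai")  # placeholder path
print(tok.convert_tokens_to_ids("<|im_start|>"))    # expected: 50265
print(tok.convert_tokens_to_ids("<|im_end|>"))      # expected: 50266
print(tok.bos_token, tok.eos_token, tok.pad_token)  # <|im_start|> <|im_end|> <|im_end|>
```
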
tokenizer_config.json ADDED
@@ -0,0 +1,87 @@
+ {
+   "add_bos_token": true,
+   "add_eos_token": true,
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<pad>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "</s>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<unk>",
+       "lstrip": false,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50264": {
+       "content": "<mask>",
+       "lstrip": true,
+       "normalized": true,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50265": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "50266": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>"
+   ],
+   "bos_token": "<|im_start|>",
+   "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "<s>",
+   "eos_token": "<|im_end|>",
+   "errors": "replace",
+   "mask_token": "<mask>",
+   "max_length": 1024,
+   "model_max_length": 16384,
+   "pad_to_multiple_of": null,
+   "pad_token": "<|im_end|>",
+   "pad_token_type_id": 0,
+   "padding_side": "right",
+   "sep_token": "</s>",
+   "stride": 0,
+   "tokenizer_class": "LEDTokenizer",
+   "trim_offsets": true,
+   "truncation_side": "right",
+   "truncation_strategy": "longest_first",
+   "unk_token": "<unk>"
+ }
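
The `chat_template` above is a ChatML-style Jinja template, so prompts are expected to be built with `apply_chat_template` rather than by hand. A rough usage sketch (the message content and repository path are made up for illustration):

```python
# Render a prompt with the ChatML-style template defined in tokenizer_config.json.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("led-large-annual-report-QLoRA-fine-tuned-v0.9.7-openai")
messages = [{"role": "user", "content": "Summarise the risk factors section."}]
prompt = tok.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(prompt)
# <|im_start|>user
# Summarise the risk factors section.<|im_end|>
# <|im_start|>assistant
```
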
trainer_state.json ADDED
@@ -0,0 +1,1775 @@
1
+ {
2
+ "best_metric": 1.5205299854278564,
3
+ "best_model_checkpoint": "led-large-annual-report-QLoRA-fine-tuned-v0.9.5-openai\\checkpoint-975",
4
+ "epoch": 2.028,
5
+ "eval_steps": 25,
6
+ "global_step": 1014,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.01,
13
+ "grad_norm": 0.5776566863059998,
14
+ "learning_rate": 0.00019959946595460615,
15
+ "loss": 2.9761,
16
+ "step": 5
17
+ },
18
+ {
19
+ "epoch": 0.02,
20
+ "grad_norm": 0.7108611464500427,
21
+ "learning_rate": 0.00019893190921228305,
22
+ "loss": 2.5028,
23
+ "step": 10
24
+ },
25
+ {
26
+ "epoch": 0.03,
27
+ "grad_norm": 0.6665928959846497,
28
+ "learning_rate": 0.0001983978638184246,
29
+ "loss": 2.7385,
30
+ "step": 15
31
+ },
32
+ {
33
+ "epoch": 0.04,
34
+ "grad_norm": 0.6933648586273193,
35
+ "learning_rate": 0.00019773030707610146,
36
+ "loss": 2.5519,
37
+ "step": 20
38
+ },
39
+ {
40
+ "epoch": 0.05,
41
+ "grad_norm": 0.6314701437950134,
42
+ "learning_rate": 0.0001970627503337784,
43
+ "loss": 2.2413,
44
+ "step": 25
45
+ },
46
+ {
47
+ "epoch": 0.05,
48
+ "eval_loss": 2.0085484981536865,
49
+ "eval_runtime": 157.7525,
50
+ "eval_samples_per_second": 0.33,
51
+ "eval_steps_per_second": 0.165,
52
+ "step": 25
53
+ },
54
+ {
55
+ "epoch": 0.06,
56
+ "grad_norm": 2.063861131668091,
57
+ "learning_rate": 0.0001963951935914553,
58
+ "loss": 2.6807,
59
+ "step": 30
60
+ },
61
+ {
62
+ "epoch": 0.07,
63
+ "grad_norm": 0.7584943175315857,
64
+ "learning_rate": 0.0001957276368491322,
65
+ "loss": 2.1435,
66
+ "step": 35
67
+ },
68
+ {
69
+ "epoch": 0.08,
70
+ "grad_norm": 0.650688886642456,
71
+ "learning_rate": 0.0001950600801068091,
72
+ "loss": 2.0311,
73
+ "step": 40
74
+ },
75
+ {
76
+ "epoch": 0.09,
77
+ "grad_norm": 0.6848713159561157,
78
+ "learning_rate": 0.000194392523364486,
79
+ "loss": 2.0928,
80
+ "step": 45
81
+ },
82
+ {
83
+ "epoch": 0.1,
84
+ "grad_norm": 0.915006697177887,
85
+ "learning_rate": 0.0001937249666221629,
86
+ "loss": 2.2454,
87
+ "step": 50
88
+ },
89
+ {
90
+ "epoch": 0.1,
91
+ "eval_loss": 1.8175970315933228,
92
+ "eval_runtime": 160.1948,
93
+ "eval_samples_per_second": 0.325,
94
+ "eval_steps_per_second": 0.162,
95
+ "step": 50
96
+ },
97
+ {
98
+ "epoch": 0.11,
99
+ "grad_norm": 0.63294917345047,
100
+ "learning_rate": 0.0001930574098798398,
101
+ "loss": 1.8694,
102
+ "step": 55
103
+ },
104
+ {
105
+ "epoch": 0.12,
106
+ "grad_norm": 0.6055558323860168,
107
+ "learning_rate": 0.0001923898531375167,
108
+ "loss": 2.0452,
109
+ "step": 60
110
+ },
111
+ {
112
+ "epoch": 0.13,
113
+ "grad_norm": 0.7194655537605286,
114
+ "learning_rate": 0.0001917222963951936,
115
+ "loss": 1.9801,
116
+ "step": 65
117
+ },
118
+ {
119
+ "epoch": 0.14,
120
+ "grad_norm": 0.7426390051841736,
121
+ "learning_rate": 0.00019105473965287052,
122
+ "loss": 1.9021,
123
+ "step": 70
124
+ },
125
+ {
126
+ "epoch": 0.15,
127
+ "grad_norm": 1.0124174356460571,
128
+ "learning_rate": 0.0001903871829105474,
129
+ "loss": 1.8887,
130
+ "step": 75
131
+ },
132
+ {
133
+ "epoch": 0.15,
134
+ "eval_loss": 1.7658432722091675,
135
+ "eval_runtime": 157.6865,
136
+ "eval_samples_per_second": 0.33,
137
+ "eval_steps_per_second": 0.165,
138
+ "step": 75
139
+ },
140
+ {
141
+ "epoch": 0.16,
142
+ "grad_norm": 0.6850053071975708,
143
+ "learning_rate": 0.00018971962616822432,
144
+ "loss": 1.9158,
145
+ "step": 80
146
+ },
147
+ {
148
+ "epoch": 0.17,
149
+ "grad_norm": 0.9443833827972412,
150
+ "learning_rate": 0.00018905206942590122,
151
+ "loss": 1.9566,
152
+ "step": 85
153
+ },
154
+ {
155
+ "epoch": 0.18,
156
+ "grad_norm": 0.9081319570541382,
157
+ "learning_rate": 0.00018838451268357812,
158
+ "loss": 1.6911,
159
+ "step": 90
160
+ },
161
+ {
162
+ "epoch": 0.19,
163
+ "grad_norm": 1.0233992338180542,
164
+ "learning_rate": 0.00018771695594125502,
165
+ "loss": 1.9792,
166
+ "step": 95
167
+ },
168
+ {
169
+ "epoch": 0.2,
170
+ "grad_norm": 0.8909695148468018,
171
+ "learning_rate": 0.00018704939919893192,
172
+ "loss": 2.096,
173
+ "step": 100
174
+ },
175
+ {
176
+ "epoch": 0.2,
177
+ "eval_loss": 1.723463773727417,
178
+ "eval_runtime": 155.3531,
179
+ "eval_samples_per_second": 0.335,
180
+ "eval_steps_per_second": 0.167,
181
+ "step": 100
182
+ },
183
+ {
184
+ "epoch": 0.21,
185
+ "grad_norm": 0.6819004416465759,
186
+ "learning_rate": 0.00018638184245660882,
187
+ "loss": 1.9699,
188
+ "step": 105
189
+ },
190
+ {
191
+ "epoch": 0.22,
192
+ "grad_norm": 1.0499932765960693,
193
+ "learning_rate": 0.00018571428571428572,
194
+ "loss": 1.4622,
195
+ "step": 110
196
+ },
197
+ {
198
+ "epoch": 0.23,
199
+ "grad_norm": 1.1806385517120361,
200
+ "learning_rate": 0.00018504672897196262,
201
+ "loss": 2.0254,
202
+ "step": 115
203
+ },
204
+ {
205
+ "epoch": 0.24,
206
+ "grad_norm": 0.8019367456436157,
207
+ "learning_rate": 0.00018437917222963952,
208
+ "loss": 1.7028,
209
+ "step": 120
210
+ },
211
+ {
212
+ "epoch": 0.25,
213
+ "grad_norm": 0.5862939953804016,
214
+ "learning_rate": 0.00018371161548731644,
215
+ "loss": 1.8961,
216
+ "step": 125
217
+ },
218
+ {
219
+ "epoch": 0.25,
220
+ "eval_loss": 1.6857578754425049,
221
+ "eval_runtime": 155.2512,
222
+ "eval_samples_per_second": 0.335,
223
+ "eval_steps_per_second": 0.167,
224
+ "step": 125
225
+ },
226
+ {
227
+ "epoch": 0.26,
228
+ "grad_norm": 0.8203465342521667,
229
+ "learning_rate": 0.00018304405874499332,
230
+ "loss": 1.695,
231
+ "step": 130
232
+ },
233
+ {
234
+ "epoch": 0.27,
235
+ "grad_norm": 0.9199301600456238,
236
+ "learning_rate": 0.00018237650200267024,
237
+ "loss": 1.6875,
238
+ "step": 135
239
+ },
240
+ {
241
+ "epoch": 0.28,
242
+ "grad_norm": 1.0262317657470703,
243
+ "learning_rate": 0.00018170894526034714,
244
+ "loss": 1.9771,
245
+ "step": 140
246
+ },
247
+ {
248
+ "epoch": 0.29,
249
+ "grad_norm": 0.6566195487976074,
250
+ "learning_rate": 0.00018104138851802404,
251
+ "loss": 1.794,
252
+ "step": 145
253
+ },
254
+ {
255
+ "epoch": 0.3,
256
+ "grad_norm": 0.6329429745674133,
257
+ "learning_rate": 0.00018037383177570094,
258
+ "loss": 1.4936,
259
+ "step": 150
260
+ },
261
+ {
262
+ "epoch": 0.3,
263
+ "eval_loss": 1.6933786869049072,
264
+ "eval_runtime": 154.938,
265
+ "eval_samples_per_second": 0.336,
266
+ "eval_steps_per_second": 0.168,
267
+ "step": 150
268
+ },
269
+ {
270
+ "epoch": 0.31,
271
+ "grad_norm": 1.1340160369873047,
272
+ "learning_rate": 0.00017970627503337784,
273
+ "loss": 1.9533,
274
+ "step": 155
275
+ },
276
+ {
277
+ "epoch": 0.32,
278
+ "grad_norm": 0.8708927631378174,
279
+ "learning_rate": 0.00017903871829105474,
280
+ "loss": 1.8068,
281
+ "step": 160
282
+ },
283
+ {
284
+ "epoch": 0.33,
285
+ "grad_norm": 0.7619530558586121,
286
+ "learning_rate": 0.00017837116154873167,
287
+ "loss": 1.6662,
288
+ "step": 165
289
+ },
290
+ {
291
+ "epoch": 0.34,
292
+ "grad_norm": 0.6751542687416077,
293
+ "learning_rate": 0.00017770360480640854,
294
+ "loss": 1.7322,
295
+ "step": 170
296
+ },
297
+ {
298
+ "epoch": 0.35,
299
+ "grad_norm": 1.106812596321106,
300
+ "learning_rate": 0.00017703604806408544,
301
+ "loss": 1.9186,
302
+ "step": 175
303
+ },
304
+ {
305
+ "epoch": 0.35,
306
+ "eval_loss": 1.7012258768081665,
307
+ "eval_runtime": 155.0194,
308
+ "eval_samples_per_second": 0.335,
309
+ "eval_steps_per_second": 0.168,
310
+ "step": 175
311
+ },
312
+ {
313
+ "epoch": 0.36,
314
+ "grad_norm": 0.7777801752090454,
315
+ "learning_rate": 0.00017636849132176237,
316
+ "loss": 1.7517,
317
+ "step": 180
318
+ },
319
+ {
320
+ "epoch": 0.37,
321
+ "grad_norm": 2.1003479957580566,
322
+ "learning_rate": 0.00017570093457943927,
323
+ "loss": 1.8307,
324
+ "step": 185
325
+ },
326
+ {
327
+ "epoch": 0.38,
328
+ "grad_norm": 0.6046275496482849,
329
+ "learning_rate": 0.00017503337783711614,
330
+ "loss": 1.5722,
331
+ "step": 190
332
+ },
333
+ {
334
+ "epoch": 0.39,
335
+ "grad_norm": 0.6874721050262451,
336
+ "learning_rate": 0.00017436582109479307,
337
+ "loss": 1.5634,
338
+ "step": 195
339
+ },
340
+ {
341
+ "epoch": 0.4,
342
+ "grad_norm": 0.6428207159042358,
343
+ "learning_rate": 0.00017369826435246997,
344
+ "loss": 1.8532,
345
+ "step": 200
346
+ },
347
+ {
348
+ "epoch": 0.4,
349
+ "eval_loss": 1.669812560081482,
350
+ "eval_runtime": 154.6164,
351
+ "eval_samples_per_second": 0.336,
352
+ "eval_steps_per_second": 0.168,
353
+ "step": 200
354
+ },
355
+ {
356
+ "epoch": 0.41,
357
+ "grad_norm": 0.7773897051811218,
358
+ "learning_rate": 0.00017303070761014687,
359
+ "loss": 1.591,
360
+ "step": 205
361
+ },
362
+ {
363
+ "epoch": 0.42,
364
+ "grad_norm": 0.6522992849349976,
365
+ "learning_rate": 0.00017236315086782377,
366
+ "loss": 1.6156,
367
+ "step": 210
368
+ },
369
+ {
370
+ "epoch": 0.43,
371
+ "grad_norm": 0.6386374831199646,
372
+ "learning_rate": 0.00017169559412550067,
373
+ "loss": 1.5089,
374
+ "step": 215
375
+ },
376
+ {
377
+ "epoch": 0.44,
378
+ "grad_norm": 0.5616068840026855,
379
+ "learning_rate": 0.0001710280373831776,
380
+ "loss": 1.3961,
381
+ "step": 220
382
+ },
383
+ {
384
+ "epoch": 0.45,
385
+ "grad_norm": 0.6595784425735474,
386
+ "learning_rate": 0.00017036048064085447,
387
+ "loss": 1.6172,
388
+ "step": 225
389
+ },
390
+ {
391
+ "epoch": 0.45,
392
+ "eval_loss": 1.6555695533752441,
393
+ "eval_runtime": 154.8952,
394
+ "eval_samples_per_second": 0.336,
395
+ "eval_steps_per_second": 0.168,
396
+ "step": 225
397
+ },
398
+ {
399
+ "epoch": 0.46,
400
+ "grad_norm": 0.5686245560646057,
401
+ "learning_rate": 0.00016969292389853137,
402
+ "loss": 1.7398,
403
+ "step": 230
404
+ },
405
+ {
406
+ "epoch": 0.47,
407
+ "grad_norm": 0.5429339408874512,
408
+ "learning_rate": 0.0001690253671562083,
409
+ "loss": 1.7083,
410
+ "step": 235
411
+ },
412
+ {
413
+ "epoch": 0.48,
414
+ "grad_norm": 0.5100717544555664,
415
+ "learning_rate": 0.0001683578104138852,
416
+ "loss": 1.803,
417
+ "step": 240
418
+ },
419
+ {
420
+ "epoch": 0.49,
421
+ "grad_norm": 0.8772192597389221,
422
+ "learning_rate": 0.00016769025367156207,
423
+ "loss": 1.7454,
424
+ "step": 245
425
+ },
426
+ {
427
+ "epoch": 0.5,
428
+ "grad_norm": 0.696287989616394,
429
+ "learning_rate": 0.000167022696929239,
430
+ "loss": 1.7838,
431
+ "step": 250
432
+ },
433
+ {
434
+ "epoch": 0.5,
435
+ "eval_loss": 1.6382060050964355,
436
+ "eval_runtime": 154.9708,
437
+ "eval_samples_per_second": 0.336,
438
+ "eval_steps_per_second": 0.168,
439
+ "step": 250
440
+ },
441
+ {
442
+ "epoch": 0.51,
443
+ "grad_norm": 0.7773283123970032,
444
+ "learning_rate": 0.0001663551401869159,
445
+ "loss": 1.7747,
446
+ "step": 255
447
+ },
448
+ {
449
+ "epoch": 0.52,
450
+ "grad_norm": 0.6426253318786621,
451
+ "learning_rate": 0.0001656875834445928,
452
+ "loss": 1.6671,
453
+ "step": 260
454
+ },
455
+ {
456
+ "epoch": 0.53,
457
+ "grad_norm": 0.5842812061309814,
458
+ "learning_rate": 0.0001650200267022697,
459
+ "loss": 1.8503,
460
+ "step": 265
461
+ },
462
+ {
463
+ "epoch": 0.54,
464
+ "grad_norm": 0.5060431361198425,
465
+ "learning_rate": 0.0001643524699599466,
466
+ "loss": 1.9078,
467
+ "step": 270
468
+ },
469
+ {
470
+ "epoch": 0.55,
471
+ "grad_norm": 0.7145740985870361,
472
+ "learning_rate": 0.00016368491321762352,
473
+ "loss": 2.0422,
474
+ "step": 275
475
+ },
476
+ {
477
+ "epoch": 0.55,
478
+ "eval_loss": 1.6412526369094849,
479
+ "eval_runtime": 154.8473,
480
+ "eval_samples_per_second": 0.336,
481
+ "eval_steps_per_second": 0.168,
482
+ "step": 275
483
+ },
484
+ {
485
+ "epoch": 0.56,
486
+ "grad_norm": 0.6752652525901794,
487
+ "learning_rate": 0.0001630173564753004,
488
+ "loss": 1.6336,
489
+ "step": 280
490
+ },
491
+ {
492
+ "epoch": 0.57,
493
+ "grad_norm": 0.7111806869506836,
494
+ "learning_rate": 0.0001623497997329773,
495
+ "loss": 1.6739,
496
+ "step": 285
497
+ },
498
+ {
499
+ "epoch": 0.58,
500
+ "grad_norm": 0.6902967691421509,
501
+ "learning_rate": 0.00016168224299065422,
502
+ "loss": 1.7994,
503
+ "step": 290
504
+ },
505
+ {
506
+ "epoch": 0.59,
507
+ "grad_norm": 0.5196375250816345,
508
+ "learning_rate": 0.00016101468624833112,
509
+ "loss": 1.5847,
510
+ "step": 295
511
+ },
512
+ {
513
+ "epoch": 0.6,
514
+ "grad_norm": 0.9794080853462219,
515
+ "learning_rate": 0.000160347129506008,
516
+ "loss": 1.8875,
517
+ "step": 300
518
+ },
519
+ {
520
+ "epoch": 0.6,
521
+ "eval_loss": 1.6254016160964966,
522
+ "eval_runtime": 154.7578,
523
+ "eval_samples_per_second": 0.336,
524
+ "eval_steps_per_second": 0.168,
525
+ "step": 300
526
+ },
527
+ {
528
+ "epoch": 0.61,
529
+ "grad_norm": 0.5826653242111206,
530
+ "learning_rate": 0.00015967957276368492,
531
+ "loss": 1.6773,
532
+ "step": 305
533
+ },
534
+ {
535
+ "epoch": 0.62,
536
+ "grad_norm": 0.5551023483276367,
537
+ "learning_rate": 0.00015901201602136182,
538
+ "loss": 1.685,
539
+ "step": 310
540
+ },
541
+ {
542
+ "epoch": 0.63,
543
+ "grad_norm": 0.6069286465644836,
544
+ "learning_rate": 0.00015834445927903872,
545
+ "loss": 1.7293,
546
+ "step": 315
547
+ },
548
+ {
549
+ "epoch": 0.64,
550
+ "grad_norm": 0.8002230525016785,
551
+ "learning_rate": 0.00015767690253671562,
552
+ "loss": 1.8484,
553
+ "step": 320
554
+ },
555
+ {
556
+ "epoch": 0.65,
557
+ "grad_norm": 0.7304033041000366,
558
+ "learning_rate": 0.00015700934579439252,
559
+ "loss": 1.687,
560
+ "step": 325
561
+ },
562
+ {
563
+ "epoch": 0.65,
564
+ "eval_loss": 1.605177402496338,
565
+ "eval_runtime": 154.868,
566
+ "eval_samples_per_second": 0.336,
567
+ "eval_steps_per_second": 0.168,
568
+ "step": 325
569
+ },
570
+ {
571
+ "epoch": 0.66,
572
+ "grad_norm": 0.46438586711883545,
573
+ "learning_rate": 0.00015634178905206945,
574
+ "loss": 1.5457,
575
+ "step": 330
576
+ },
577
+ {
578
+ "epoch": 0.67,
579
+ "grad_norm": 1.024244785308838,
580
+ "learning_rate": 0.00015567423230974635,
581
+ "loss": 1.6972,
582
+ "step": 335
583
+ },
584
+ {
585
+ "epoch": 0.68,
586
+ "grad_norm": 0.506373941898346,
587
+ "learning_rate": 0.00015500667556742322,
588
+ "loss": 1.4549,
589
+ "step": 340
590
+ },
591
+ {
592
+ "epoch": 0.69,
593
+ "grad_norm": 0.7029207944869995,
594
+ "learning_rate": 0.00015433911882510015,
595
+ "loss": 1.6963,
596
+ "step": 345
597
+ },
598
+ {
599
+ "epoch": 0.7,
600
+ "grad_norm": 0.5588846802711487,
601
+ "learning_rate": 0.00015367156208277705,
602
+ "loss": 1.6778,
603
+ "step": 350
604
+ },
605
+ {
606
+ "epoch": 0.7,
607
+ "eval_loss": 1.6053338050842285,
608
+ "eval_runtime": 154.828,
609
+ "eval_samples_per_second": 0.336,
610
+ "eval_steps_per_second": 0.168,
611
+ "step": 350
612
+ },
613
+ {
614
+ "epoch": 0.71,
615
+ "grad_norm": 0.5997027158737183,
616
+ "learning_rate": 0.00015300400534045395,
617
+ "loss": 1.5144,
618
+ "step": 355
619
+ },
620
+ {
621
+ "epoch": 0.72,
622
+ "grad_norm": 0.5491402745246887,
623
+ "learning_rate": 0.00015233644859813085,
624
+ "loss": 1.6086,
625
+ "step": 360
626
+ },
627
+ {
628
+ "epoch": 0.73,
629
+ "grad_norm": 0.6617192625999451,
630
+ "learning_rate": 0.00015166889185580775,
631
+ "loss": 1.5582,
632
+ "step": 365
633
+ },
634
+ {
635
+ "epoch": 0.74,
636
+ "grad_norm": 0.6810077428817749,
637
+ "learning_rate": 0.00015100133511348465,
638
+ "loss": 1.7498,
639
+ "step": 370
640
+ },
641
+ {
642
+ "epoch": 0.75,
643
+ "grad_norm": 0.6304115653038025,
644
+ "learning_rate": 0.00015033377837116155,
645
+ "loss": 1.5276,
646
+ "step": 375
647
+ },
648
+ {
649
+ "epoch": 0.75,
650
+ "eval_loss": 1.5960314273834229,
651
+ "eval_runtime": 154.8615,
652
+ "eval_samples_per_second": 0.336,
653
+ "eval_steps_per_second": 0.168,
654
+ "step": 375
655
+ },
656
+ {
657
+ "epoch": 0.76,
658
+ "grad_norm": 0.6610834002494812,
659
+ "learning_rate": 0.00014966622162883845,
660
+ "loss": 1.5969,
661
+ "step": 380
662
+ },
663
+ {
664
+ "epoch": 0.77,
665
+ "grad_norm": 0.8158746957778931,
666
+ "learning_rate": 0.00014899866488651538,
667
+ "loss": 1.5529,
668
+ "step": 385
669
+ },
670
+ {
671
+ "epoch": 0.78,
672
+ "grad_norm": 0.519496500492096,
673
+ "learning_rate": 0.00014833110814419228,
674
+ "loss": 1.6754,
675
+ "step": 390
676
+ },
677
+ {
678
+ "epoch": 0.79,
679
+ "grad_norm": 0.6644089221954346,
680
+ "learning_rate": 0.00014766355140186915,
681
+ "loss": 1.7254,
682
+ "step": 395
683
+ },
684
+ {
685
+ "epoch": 0.8,
686
+ "grad_norm": 0.7224368453025818,
687
+ "learning_rate": 0.00014699599465954608,
688
+ "loss": 1.4391,
689
+ "step": 400
690
+ },
691
+ {
692
+ "epoch": 0.8,
693
+ "eval_loss": 1.5978156328201294,
694
+ "eval_runtime": 153.9476,
695
+ "eval_samples_per_second": 0.338,
696
+ "eval_steps_per_second": 0.169,
697
+ "step": 400
698
+ },
699
+ {
700
+ "epoch": 0.81,
701
+ "grad_norm": 0.6566179394721985,
702
+ "learning_rate": 0.00014632843791722298,
703
+ "loss": 1.5994,
704
+ "step": 405
705
+ },
706
+ {
707
+ "epoch": 0.82,
708
+ "grad_norm": 1.0222816467285156,
709
+ "learning_rate": 0.00014566088117489988,
710
+ "loss": 1.4374,
711
+ "step": 410
712
+ },
713
+ {
714
+ "epoch": 0.83,
715
+ "grad_norm": 0.5965130925178528,
716
+ "learning_rate": 0.00014499332443257678,
717
+ "loss": 1.5863,
718
+ "step": 415
719
+ },
720
+ {
721
+ "epoch": 0.84,
722
+ "grad_norm": 0.5560926795005798,
723
+ "learning_rate": 0.00014432576769025368,
724
+ "loss": 1.5581,
725
+ "step": 420
726
+ },
727
+ {
728
+ "epoch": 0.85,
729
+ "grad_norm": 0.7057550549507141,
730
+ "learning_rate": 0.00014365821094793058,
731
+ "loss": 1.7263,
732
+ "step": 425
733
+ },
734
+ {
735
+ "epoch": 0.85,
736
+ "eval_loss": 1.6051459312438965,
737
+ "eval_runtime": 153.7143,
738
+ "eval_samples_per_second": 0.338,
739
+ "eval_steps_per_second": 0.169,
740
+ "step": 425
741
+ },
742
+ {
743
+ "epoch": 0.86,
744
+ "grad_norm": 1.132186770439148,
745
+ "learning_rate": 0.00014299065420560748,
746
+ "loss": 1.6684,
747
+ "step": 430
748
+ },
749
+ {
750
+ "epoch": 0.87,
751
+ "grad_norm": 0.5418440103530884,
752
+ "learning_rate": 0.00014232309746328438,
753
+ "loss": 1.6537,
754
+ "step": 435
755
+ },
756
+ {
757
+ "epoch": 0.88,
758
+ "grad_norm": 0.7945486903190613,
759
+ "learning_rate": 0.0001416555407209613,
760
+ "loss": 1.5685,
761
+ "step": 440
762
+ },
763
+ {
764
+ "epoch": 0.89,
765
+ "grad_norm": 0.6951822638511658,
766
+ "learning_rate": 0.0001409879839786382,
767
+ "loss": 1.2469,
768
+ "step": 445
769
+ },
770
+ {
771
+ "epoch": 0.9,
772
+ "grad_norm": 0.6232516765594482,
773
+ "learning_rate": 0.00014032042723631508,
774
+ "loss": 1.6128,
775
+ "step": 450
776
+ },
777
+ {
778
+ "epoch": 0.9,
779
+ "eval_loss": 1.5824800729751587,
780
+ "eval_runtime": 153.4919,
781
+ "eval_samples_per_second": 0.339,
782
+ "eval_steps_per_second": 0.169,
783
+ "step": 450
784
+ },
785
+ {
786
+ "epoch": 0.91,
787
+ "grad_norm": 0.5635793209075928,
788
+ "learning_rate": 0.000139652870493992,
789
+ "loss": 1.4892,
790
+ "step": 455
791
+ },
792
+ {
793
+ "epoch": 0.92,
794
+ "grad_norm": 0.7134138941764832,
795
+ "learning_rate": 0.0001389853137516689,
796
+ "loss": 1.6564,
797
+ "step": 460
798
+ },
799
+ {
800
+ "epoch": 0.93,
801
+ "grad_norm": 1.0067154169082642,
802
+ "learning_rate": 0.0001383177570093458,
803
+ "loss": 1.8738,
804
+ "step": 465
805
+ },
806
+ {
807
+ "epoch": 0.94,
808
+ "grad_norm": 0.7665852904319763,
809
+ "learning_rate": 0.0001376502002670227,
810
+ "loss": 1.7151,
811
+ "step": 470
812
+ },
813
+ {
814
+ "epoch": 0.95,
815
+ "grad_norm": 0.5606282949447632,
816
+ "learning_rate": 0.0001369826435246996,
817
+ "loss": 1.7477,
818
+ "step": 475
819
+ },
820
+ {
821
+ "epoch": 0.95,
822
+ "eval_loss": 1.5813050270080566,
823
+ "eval_runtime": 153.7414,
824
+ "eval_samples_per_second": 0.338,
825
+ "eval_steps_per_second": 0.169,
826
+ "step": 475
827
+ },
828
+ {
829
+ "epoch": 0.96,
830
+ "grad_norm": 0.48252391815185547,
831
+ "learning_rate": 0.0001363150867823765,
832
+ "loss": 1.6785,
833
+ "step": 480
834
+ },
835
+ {
836
+ "epoch": 0.97,
837
+ "grad_norm": 0.7147005200386047,
838
+ "learning_rate": 0.0001356475300400534,
839
+ "loss": 1.5719,
840
+ "step": 485
841
+ },
842
+ {
843
+ "epoch": 0.98,
844
+ "grad_norm": 0.5606821775436401,
845
+ "learning_rate": 0.0001349799732977303,
846
+ "loss": 1.6884,
847
+ "step": 490
848
+ },
849
+ {
850
+ "epoch": 0.99,
851
+ "grad_norm": 0.6102766394615173,
852
+ "learning_rate": 0.0001343124165554072,
853
+ "loss": 1.5512,
854
+ "step": 495
855
+ },
856
+ {
857
+ "epoch": 1.0,
858
+ "grad_norm": 0.7451562285423279,
859
+ "learning_rate": 0.00013364485981308413,
860
+ "loss": 1.5853,
861
+ "step": 500
862
+ },
863
+ {
864
+ "epoch": 1.0,
865
+ "eval_loss": 1.5768074989318848,
866
+ "eval_runtime": 153.3623,
867
+ "eval_samples_per_second": 0.339,
868
+ "eval_steps_per_second": 0.17,
869
+ "step": 500
870
+ },
871
+ {
872
+ "epoch": 1.01,
873
+ "grad_norm": 0.6127219200134277,
874
+ "learning_rate": 0.000132977303070761,
875
+ "loss": 1.5375,
876
+ "step": 505
877
+ },
878
+ {
879
+ "epoch": 1.02,
880
+ "grad_norm": 0.6121686100959778,
881
+ "learning_rate": 0.00013230974632843793,
882
+ "loss": 1.4151,
883
+ "step": 510
884
+ },
885
+ {
886
+ "epoch": 1.03,
887
+ "grad_norm": 0.8484262824058533,
888
+ "learning_rate": 0.00013164218958611483,
889
+ "loss": 1.6334,
890
+ "step": 515
891
+ },
892
+ {
893
+ "epoch": 1.04,
894
+ "grad_norm": 1.115262508392334,
895
+ "learning_rate": 0.00013097463284379173,
896
+ "loss": 1.5492,
897
+ "step": 520
898
+ },
899
+ {
900
+ "epoch": 1.05,
901
+ "grad_norm": 0.6433502435684204,
902
+ "learning_rate": 0.00013030707610146863,
903
+ "loss": 1.54,
904
+ "step": 525
905
+ },
906
+ {
907
+ "epoch": 1.05,
908
+ "eval_loss": 1.5698615312576294,
909
+ "eval_runtime": 154.4611,
910
+ "eval_samples_per_second": 0.337,
911
+ "eval_steps_per_second": 0.168,
912
+ "step": 525
913
+ },
914
+ {
915
+ "epoch": 1.06,
916
+ "grad_norm": 0.6583875417709351,
917
+ "learning_rate": 0.00012963951935914553,
918
+ "loss": 1.5267,
919
+ "step": 530
920
+ },
921
+ {
922
+ "epoch": 1.07,
923
+ "grad_norm": 0.7508483529090881,
924
+ "learning_rate": 0.00012897196261682243,
925
+ "loss": 1.7802,
926
+ "step": 535
927
+ },
928
+ {
929
+ "epoch": 1.08,
930
+ "grad_norm": 0.9875255227088928,
931
+ "learning_rate": 0.00012830440587449936,
932
+ "loss": 1.4288,
933
+ "step": 540
934
+ },
935
+ {
936
+ "epoch": 1.09,
937
+ "grad_norm": 0.5857439637184143,
938
+ "learning_rate": 0.00012763684913217623,
939
+ "loss": 1.4952,
940
+ "step": 545
941
+ },
942
+ {
943
+ "epoch": 1.1,
944
+ "grad_norm": 0.4867228865623474,
945
+ "learning_rate": 0.00012696929238985313,
946
+ "loss": 1.3511,
947
+ "step": 550
948
+ },
949
+ {
950
+ "epoch": 1.1,
951
+ "eval_loss": 1.5703368186950684,
952
+ "eval_runtime": 155.0663,
953
+ "eval_samples_per_second": 0.335,
954
+ "eval_steps_per_second": 0.168,
955
+ "step": 550
956
+ },
957
+ {
958
+ "epoch": 1.11,
959
+ "grad_norm": 1.0664533376693726,
960
+ "learning_rate": 0.00012630173564753006,
961
+ "loss": 1.5299,
962
+ "step": 555
963
+ },
964
+ {
965
+ "epoch": 1.12,
966
+ "grad_norm": 0.5459120869636536,
967
+ "learning_rate": 0.00012563417890520696,
968
+ "loss": 1.6358,
969
+ "step": 560
970
+ },
971
+ {
972
+ "epoch": 1.13,
973
+ "grad_norm": 0.7361356616020203,
974
+ "learning_rate": 0.00012496662216288386,
975
+ "loss": 1.1664,
976
+ "step": 565
977
+ },
978
+ {
979
+ "epoch": 1.1400000000000001,
980
+ "grad_norm": 0.6312634348869324,
981
+ "learning_rate": 0.00012429906542056076,
982
+ "loss": 1.4422,
983
+ "step": 570
984
+ },
985
+ {
986
+ "epoch": 1.15,
987
+ "grad_norm": 0.6251769065856934,
988
+ "learning_rate": 0.00012363150867823766,
989
+ "loss": 1.4539,
990
+ "step": 575
991
+ },
992
+ {
993
+ "epoch": 1.15,
994
+ "eval_loss": 1.582599401473999,
995
+ "eval_runtime": 161.8119,
996
+ "eval_samples_per_second": 0.321,
997
+ "eval_steps_per_second": 0.161,
998
+ "step": 575
999
+ },
1000
+ {
1001
+ "epoch": 1.16,
1002
+ "grad_norm": 0.7663933634757996,
1003
+ "learning_rate": 0.00012296395193591456,
1004
+ "loss": 1.4286,
1005
+ "step": 580
1006
+ },
1007
+ {
1008
+ "epoch": 1.17,
1009
+ "grad_norm": 0.5115808844566345,
1010
+ "learning_rate": 0.00012229639519359146,
1011
+ "loss": 1.5661,
1012
+ "step": 585
1013
+ },
1014
+ {
1015
+ "epoch": 1.18,
1016
+ "grad_norm": 0.5785268545150757,
1017
+ "learning_rate": 0.00012162883845126836,
1018
+ "loss": 1.6004,
1019
+ "step": 590
1020
+ },
1021
+ {
1022
+ "epoch": 1.19,
1023
+ "grad_norm": 1.1874048709869385,
1024
+ "learning_rate": 0.00012096128170894527,
1025
+ "loss": 1.4923,
1026
+ "step": 595
1027
+ },
1028
+ {
1029
+ "epoch": 1.2,
1030
+ "grad_norm": 0.708734393119812,
1031
+ "learning_rate": 0.00012029372496662217,
1032
+ "loss": 1.6751,
1033
+ "step": 600
1034
+ },
1035
+ {
1036
+ "epoch": 1.2,
1037
+ "eval_loss": 1.568034291267395,
1038
+ "eval_runtime": 157.401,
1039
+ "eval_samples_per_second": 0.33,
1040
+ "eval_steps_per_second": 0.165,
1041
+ "step": 600
1042
+ },
1043
+ {
1044
+ "epoch": 1.21,
1045
+ "grad_norm": 0.9826223850250244,
1046
+ "learning_rate": 0.00011962616822429906,
1047
+ "loss": 1.5187,
1048
+ "step": 605
1049
+ },
1050
+ {
1051
+ "epoch": 1.22,
1052
+ "grad_norm": 0.5815021991729736,
1053
+ "learning_rate": 0.00011895861148197598,
1054
+ "loss": 1.3991,
1055
+ "step": 610
1056
+ },
1057
+ {
1058
+ "epoch": 1.23,
1059
+ "grad_norm": 0.7048662304878235,
1060
+ "learning_rate": 0.00011829105473965287,
1061
+ "loss": 1.316,
1062
+ "step": 615
1063
+ },
1064
+ {
1065
+ "epoch": 1.24,
1066
+ "grad_norm": 0.667458176612854,
1067
+ "learning_rate": 0.00011762349799732977,
1068
+ "loss": 1.6166,
1069
+ "step": 620
1070
+ },
1071
+ {
1072
+ "epoch": 1.25,
1073
+ "grad_norm": 0.7350333333015442,
1074
+ "learning_rate": 0.00011695594125500668,
1075
+ "loss": 1.5397,
1076
+ "step": 625
1077
+ },
1078
+ {
1079
+ "epoch": 1.25,
1080
+ "eval_loss": 1.5718026161193848,
1081
+ "eval_runtime": 156.5658,
1082
+ "eval_samples_per_second": 0.332,
1083
+ "eval_steps_per_second": 0.166,
1084
+ "step": 625
1085
+ },
1086
+ {
1087
+ "epoch": 1.26,
1088
+ "grad_norm": 0.6255879402160645,
1089
+ "learning_rate": 0.00011628838451268358,
1090
+ "loss": 1.3613,
1091
+ "step": 630
1092
+ },
1093
+ {
1094
+ "epoch": 1.27,
1095
+ "grad_norm": 0.6177966594696045,
1096
+ "learning_rate": 0.0001156208277703605,
1097
+ "loss": 1.2452,
1098
+ "step": 635
1099
+ },
1100
+ {
1101
+ "epoch": 1.28,
1102
+ "grad_norm": 0.5699389576911926,
1103
+ "learning_rate": 0.0001149532710280374,
1104
+ "loss": 1.2252,
1105
+ "step": 640
1106
+ },
1107
+ {
1108
+ "epoch": 1.29,
1109
+ "grad_norm": 0.8304943442344666,
1110
+ "learning_rate": 0.00011428571428571428,
1111
+ "loss": 1.371,
1112
+ "step": 645
1113
+ },
1114
+ {
1115
+ "epoch": 1.3,
1116
+ "grad_norm": 0.8348840475082397,
1117
+ "learning_rate": 0.0001136181575433912,
1118
+ "loss": 1.5561,
1119
+ "step": 650
1120
+ },
1121
+ {
1122
+ "epoch": 1.3,
1123
+ "eval_loss": 1.5528948307037354,
1124
+ "eval_runtime": 155.2207,
1125
+ "eval_samples_per_second": 0.335,
1126
+ "eval_steps_per_second": 0.168,
1127
+ "step": 650
1128
+ },
1129
+ {
1130
+ "epoch": 1.31,
1131
+ "grad_norm": 0.6180741786956787,
1132
+ "learning_rate": 0.0001129506008010681,
1133
+ "loss": 1.3885,
1134
+ "step": 655
1135
+ },
1136
+ {
1137
+ "epoch": 1.32,
1138
+ "grad_norm": 0.9591236114501953,
1139
+ "learning_rate": 0.000112283044058745,
1140
+ "loss": 1.8704,
1141
+ "step": 660
1142
+ },
1143
+ {
1144
+ "epoch": 1.33,
1145
+ "grad_norm": 0.5324267148971558,
1146
+ "learning_rate": 0.00011161548731642191,
1147
+ "loss": 1.3346,
1148
+ "step": 665
1149
+ },
1150
+ {
1151
+ "epoch": 1.34,
1152
+ "grad_norm": 0.519713819026947,
1153
+ "learning_rate": 0.0001109479305740988,
1154
+ "loss": 1.553,
1155
+ "step": 670
1156
+ },
1157
+ {
1158
+ "epoch": 1.35,
1159
+ "grad_norm": 0.7792118191719055,
1160
+ "learning_rate": 0.0001102803738317757,
1161
+ "loss": 1.4646,
1162
+ "step": 675
1163
+ },
1164
+ {
1165
+ "epoch": 1.35,
1166
+ "eval_loss": 1.5499815940856934,
1167
+ "eval_runtime": 157.3901,
1168
+ "eval_samples_per_second": 0.33,
1169
+ "eval_steps_per_second": 0.165,
1170
+ "step": 675
1171
+ },
1172
+ {
1173
+ "epoch": 1.3599999999999999,
1174
+ "grad_norm": 0.7332251667976379,
1175
+ "learning_rate": 0.00010961281708945261,
1176
+ "loss": 1.5366,
1177
+ "step": 680
1178
+ },
1179
+ {
1180
+ "epoch": 1.37,
1181
+ "grad_norm": 1.0482912063598633,
1182
+ "learning_rate": 0.00010894526034712951,
1183
+ "loss": 1.7866,
1184
+ "step": 685
1185
+ },
1186
+ {
1187
+ "epoch": 1.38,
1188
+ "grad_norm": 0.6060436964035034,
1189
+ "learning_rate": 0.00010827770360480642,
1190
+ "loss": 1.3853,
1191
+ "step": 690
1192
+ },
1193
+ {
1194
+ "epoch": 1.3900000000000001,
1195
+ "grad_norm": 0.6550512313842773,
1196
+ "learning_rate": 0.00010761014686248332,
1197
+ "loss": 1.765,
1198
+ "step": 695
1199
+ },
1200
+ {
1201
+ "epoch": 1.4,
1202
+ "grad_norm": 0.8347830176353455,
1203
+ "learning_rate": 0.00010694259012016021,
1204
+ "loss": 1.4687,
1205
+ "step": 700
1206
+ },
1207
+ {
1208
+ "epoch": 1.4,
1209
+ "eval_loss": 1.5612692832946777,
1210
+ "eval_runtime": 166.6597,
1211
+ "eval_samples_per_second": 0.312,
1212
+ "eval_steps_per_second": 0.156,
1213
+ "step": 700
1214
+ },
1215
+ {
1216
+ "epoch": 1.41,
1217
+ "grad_norm": 0.6563287973403931,
1218
+ "learning_rate": 0.00010627503337783712,
1219
+ "loss": 1.3366,
1220
+ "step": 705
1221
+ },
1222
+ {
1223
+ "epoch": 1.42,
1224
+ "grad_norm": 0.6529929637908936,
1225
+ "learning_rate": 0.00010560747663551402,
1226
+ "loss": 1.6132,
1227
+ "step": 710
1228
+ },
1229
+ {
1230
+ "epoch": 1.43,
1231
+ "grad_norm": 0.504224956035614,
1232
+ "learning_rate": 0.00010493991989319092,
1233
+ "loss": 1.4062,
1234
+ "step": 715
1235
+ },
1236
+ {
1237
+ "epoch": 1.44,
1238
+ "grad_norm": 0.5341633558273315,
1239
+ "learning_rate": 0.00010427236315086784,
1240
+ "loss": 1.3092,
1241
+ "step": 720
1242
+ },
1243
+ {
1244
+ "epoch": 1.45,
1245
+ "grad_norm": 0.8322250843048096,
1246
+ "learning_rate": 0.00010360480640854472,
1247
+ "loss": 1.4273,
1248
+ "step": 725
1249
+ },
1250
+ {
1251
+ "epoch": 1.45,
1252
+ "eval_loss": 1.5545457601547241,
1253
+ "eval_runtime": 161.785,
1254
+ "eval_samples_per_second": 0.321,
1255
+ "eval_steps_per_second": 0.161,
1256
+ "step": 725
1257
+ },
1258
+ {
1259
+ "epoch": 1.46,
1260
+ "grad_norm": 0.694214940071106,
1261
+ "learning_rate": 0.00010293724966622162,
1262
+ "loss": 1.3338,
1263
+ "step": 730
1264
+ },
1265
+ {
1266
+ "epoch": 1.47,
1267
+ "grad_norm": 0.83758944272995,
1268
+ "learning_rate": 0.00010226969292389854,
1269
+ "loss": 1.4813,
1270
+ "step": 735
1271
+ },
1272
+ {
1273
+ "epoch": 1.48,
1274
+ "grad_norm": 0.7225409150123596,
1275
+ "learning_rate": 0.00010160213618157544,
1276
+ "loss": 1.4857,
1277
+ "step": 740
1278
+ },
1279
+ {
1280
+ "epoch": 1.49,
1281
+ "grad_norm": 0.6755008697509766,
1282
+ "learning_rate": 0.00010093457943925234,
1283
+ "loss": 1.452,
1284
+ "step": 745
1285
+ },
1286
+ {
1287
+ "epoch": 1.5,
1288
+ "grad_norm": 0.8779104351997375,
1289
+ "learning_rate": 0.00010026702269692925,
1290
+ "loss": 1.5225,
1291
+ "step": 750
1292
+ },
1293
+ {
1294
+ "epoch": 1.5,
1295
+ "eval_loss": 1.5489907264709473,
1296
+ "eval_runtime": 157.1309,
1297
+ "eval_samples_per_second": 0.331,
1298
+ "eval_steps_per_second": 0.165,
1299
+ "step": 750
1300
+ },
1301
+ {
1302
+ "epoch": 1.51,
1303
+ "grad_norm": 0.7593478560447693,
1304
+ "learning_rate": 9.959946595460614e-05,
1305
+ "loss": 1.7504,
1306
+ "step": 755
1307
+ },
1308
+ {
1309
+ "epoch": 1.52,
1310
+ "grad_norm": 0.7336363792419434,
1311
+ "learning_rate": 9.893190921228305e-05,
1312
+ "loss": 1.7467,
1313
+ "step": 760
1314
+ },
1315
+ {
1316
+ "epoch": 1.53,
1317
+ "grad_norm": 0.6226593852043152,
1318
+ "learning_rate": 9.826435246995995e-05,
1319
+ "loss": 1.2953,
1320
+ "step": 765
1321
+ },
1322
+ {
1323
+ "epoch": 1.54,
1324
+ "grad_norm": 0.652702808380127,
1325
+ "learning_rate": 9.759679572763686e-05,
1326
+ "loss": 1.6913,
1327
+ "step": 770
1328
+ },
1329
+ {
1330
+ "epoch": 1.55,
1331
+ "grad_norm": 0.6243285536766052,
1332
+ "learning_rate": 9.692923898531375e-05,
1333
+ "loss": 1.5129,
1334
+ "step": 775
1335
+ },
1336
+ {
1337
+ "epoch": 1.55,
1338
+ "eval_loss": 1.5401288270950317,
1339
+ "eval_runtime": 159.6754,
1340
+ "eval_samples_per_second": 0.326,
1341
+ "eval_steps_per_second": 0.163,
1342
+ "step": 775
1343
+ },
1344
+ {
1345
+ "epoch": 1.56,
1346
+ "grad_norm": 0.5181707739830017,
1347
+ "learning_rate": 9.626168224299066e-05,
1348
+ "loss": 1.4,
1349
+ "step": 780
1350
+ },
1351
+ {
1352
+ "epoch": 1.5699999999999998,
1353
+ "grad_norm": 0.4900369346141815,
1354
+ "learning_rate": 9.559412550066756e-05,
1355
+ "loss": 1.4891,
1356
+ "step": 785
1357
+ },
1358
+ {
1359
+ "epoch": 1.58,
1360
+ "grad_norm": 0.7415319085121155,
1361
+ "learning_rate": 9.492656875834446e-05,
1362
+ "loss": 1.2679,
1363
+ "step": 790
1364
+ },
1365
+ {
1366
+ "epoch": 1.5899999999999999,
1367
+ "grad_norm": 0.6447709798812866,
1368
+ "learning_rate": 9.425901201602136e-05,
1369
+ "loss": 1.4957,
1370
+ "step": 795
1371
+ },
1372
+ {
1373
+ "epoch": 1.6,
1374
+ "grad_norm": 0.6303768754005432,
1375
+ "learning_rate": 9.359145527369826e-05,
1376
+ "loss": 1.4617,
1377
+ "step": 800
1378
+ },
1379
+ {
1380
+ "epoch": 1.6,
1381
+ "eval_loss": 1.5425684452056885,
1382
+ "eval_runtime": 157.5599,
1383
+ "eval_samples_per_second": 0.33,
1384
+ "eval_steps_per_second": 0.165,
1385
+ "step": 800
1386
+ },
1387
+ {
1388
+ "epoch": 1.6099999999999999,
1389
+ "grad_norm": 0.6443194150924683,
1390
+ "learning_rate": 9.292389853137518e-05,
1391
+ "loss": 1.4427,
1392
+ "step": 805
1393
+ },
1394
+ {
1395
+ "epoch": 1.62,
1396
+ "grad_norm": 0.6477059125900269,
1397
+ "learning_rate": 9.225634178905206e-05,
1398
+ "loss": 1.3323,
1399
+ "step": 810
1400
+ },
1401
+ {
1402
+ "epoch": 1.63,
1403
+ "grad_norm": 1.2460426092147827,
1404
+ "learning_rate": 9.158878504672898e-05,
1405
+ "loss": 1.6938,
1406
+ "step": 815
1407
+ },
1408
+ {
1409
+ "epoch": 1.6400000000000001,
1410
+ "grad_norm": 0.5094366073608398,
1411
+ "learning_rate": 9.092122830440588e-05,
1412
+ "loss": 1.3493,
1413
+ "step": 820
1414
+ },
1415
+ {
1416
+ "epoch": 1.65,
1417
+ "grad_norm": 0.6189996004104614,
1418
+ "learning_rate": 9.025367156208279e-05,
1419
+ "loss": 1.5123,
1420
+ "step": 825
1421
+ },
1422
+ {
1423
+ "epoch": 1.65,
1424
+ "eval_loss": 1.5375796556472778,
1425
+ "eval_runtime": 156.9638,
1426
+ "eval_samples_per_second": 0.331,
1427
+ "eval_steps_per_second": 0.166,
1428
+ "step": 825
1429
+ },
1430
+ {
1431
+ "epoch": 1.6600000000000001,
1432
+ "grad_norm": 0.8235365748405457,
1433
+ "learning_rate": 8.958611481975968e-05,
1434
+ "loss": 1.3337,
1435
+ "step": 830
1436
+ },
1437
+ {
1438
+ "epoch": 1.67,
1439
+ "grad_norm": 0.6116329431533813,
1440
+ "learning_rate": 8.891855807743659e-05,
1441
+ "loss": 1.2533,
1442
+ "step": 835
1443
+ },
1444
+ {
1445
+ "epoch": 1.6800000000000002,
1446
+ "grad_norm": 0.597701370716095,
1447
+ "learning_rate": 8.825100133511349e-05,
1448
+ "loss": 1.4036,
1449
+ "step": 840
1450
+ },
1451
+ {
1452
+ "epoch": 1.69,
1453
+ "grad_norm": 0.5337836742401123,
1454
+ "learning_rate": 8.758344459279039e-05,
1455
+ "loss": 1.2977,
1456
+ "step": 845
1457
+ },
1458
+ {
1459
+ "epoch": 1.7,
1460
+ "grad_norm": 0.5650852918624878,
1461
+ "learning_rate": 8.691588785046729e-05,
1462
+ "loss": 1.4909,
1463
+ "step": 850
1464
+ },
1465
+ {
1466
+ "epoch": 1.7,
1467
+ "eval_loss": 1.5355829000473022,
1468
+ "eval_runtime": 156.0274,
1469
+ "eval_samples_per_second": 0.333,
1470
+ "eval_steps_per_second": 0.167,
1471
+ "step": 850
1472
+ },
1473
+ {
1474
+ "epoch": 1.71,
1475
+ "grad_norm": 0.7378519773483276,
1476
+ "learning_rate": 8.62483311081442e-05,
1477
+ "loss": 1.6229,
1478
+ "step": 855
1479
+ },
1480
+ {
1481
+ "epoch": 1.72,
1482
+ "grad_norm": 0.6510607600212097,
1483
+ "learning_rate": 8.55807743658211e-05,
1484
+ "loss": 1.3392,
1485
+ "step": 860
1486
+ },
1487
+ {
1488
+ "epoch": 1.73,
1489
+ "grad_norm": 0.6023868918418884,
1490
+ "learning_rate": 8.4913217623498e-05,
1491
+ "loss": 1.5289,
1492
+ "step": 865
1493
+ },
1494
+ {
1495
+ "epoch": 1.74,
1496
+ "grad_norm": 0.7228203415870667,
1497
+ "learning_rate": 8.42456608811749e-05,
1498
+ "loss": 1.5144,
1499
+ "step": 870
1500
+ },
1501
+ {
1502
+ "epoch": 1.75,
1503
+ "grad_norm": 0.7221407294273376,
1504
+ "learning_rate": 8.35781041388518e-05,
1505
+ "loss": 1.2201,
1506
+ "step": 875
1507
+ },
1508
+ {
1509
+ "epoch": 1.75,
1510
+ "eval_loss": 1.5325042009353638,
1511
+ "eval_runtime": 157.2229,
1512
+ "eval_samples_per_second": 0.331,
1513
+ "eval_steps_per_second": 0.165,
1514
+ "step": 875
1515
+ },
1516
+ {
1517
+ "epoch": 1.76,
1518
+ "grad_norm": 0.48412030935287476,
1519
+ "learning_rate": 8.29105473965287e-05,
1520
+ "loss": 1.5154,
1521
+ "step": 880
1522
+ },
1523
+ {
1524
+ "epoch": 1.77,
1525
+ "grad_norm": 0.7840531468391418,
1526
+ "learning_rate": 8.22429906542056e-05,
1527
+ "loss": 1.7819,
1528
+ "step": 885
1529
+ },
1530
+ {
1531
+ "epoch": 1.78,
1532
+ "grad_norm": 0.6077267527580261,
1533
+ "learning_rate": 8.157543391188252e-05,
1534
+ "loss": 1.4317,
1535
+ "step": 890
1536
+ },
1537
+ {
1538
+ "epoch": 1.79,
1539
+ "grad_norm": 0.6933810114860535,
1540
+ "learning_rate": 8.090787716955942e-05,
1541
+ "loss": 1.7069,
1542
+ "step": 895
1543
+ },
1544
+ {
1545
+ "epoch": 1.8,
1546
+ "grad_norm": 0.7501831650733948,
1547
+ "learning_rate": 8.024032042723632e-05,
1548
+ "loss": 1.2877,
1549
+ "step": 900
1550
+ },
1551
+ {
1552
+ "epoch": 1.8,
1553
+ "eval_loss": 1.5253818035125732,
1554
+ "eval_runtime": 156.1433,
1555
+ "eval_samples_per_second": 0.333,
1556
+ "eval_steps_per_second": 0.167,
1557
+ "step": 900
1558
+ },
1559
+ {
1560
+ "epoch": 1.81,
1561
+ "grad_norm": 0.6081863045692444,
1562
+ "learning_rate": 7.957276368491322e-05,
1563
+ "loss": 1.2527,
1564
+ "step": 905
1565
+ },
1566
+ {
1567
+ "epoch": 1.8199999999999998,
1568
+ "grad_norm": 0.6956006288528442,
1569
+ "learning_rate": 7.890520694259013e-05,
1570
+ "loss": 1.4184,
1571
+ "step": 910
1572
+ },
1573
+ {
1574
+ "epoch": 1.83,
1575
+ "grad_norm": 0.5345895886421204,
1576
+ "learning_rate": 7.823765020026703e-05,
1577
+ "loss": 1.6186,
1578
+ "step": 915
1579
+ },
1580
+ {
1581
+ "epoch": 1.8399999999999999,
1582
+ "grad_norm": 0.682368814945221,
1583
+ "learning_rate": 7.757009345794393e-05,
1584
+ "loss": 1.3062,
1585
+ "step": 920
1586
+ },
1587
+ {
1588
+ "epoch": 1.85,
1589
+ "grad_norm": 0.7534874677658081,
1590
+ "learning_rate": 7.690253671562083e-05,
1591
+ "loss": 1.3955,
1592
+ "step": 925
1593
+ },
1594
+ {
1595
+ "epoch": 1.85,
1596
+ "eval_loss": 1.5264804363250732,
1597
+ "eval_runtime": 157.3189,
1598
+ "eval_samples_per_second": 0.331,
1599
+ "eval_steps_per_second": 0.165,
1600
+ "step": 925
1601
+ },
1602
+ {
1603
+ "epoch": 1.8599999999999999,
1604
+ "grad_norm": 0.8998626470565796,
1605
+ "learning_rate": 7.623497997329774e-05,
1606
+ "loss": 1.6625,
1607
+ "step": 930
1608
+ },
1609
+ {
1610
+ "epoch": 1.87,
1611
+ "grad_norm": 0.5620250105857849,
1612
+ "learning_rate": 7.556742323097463e-05,
1613
+ "loss": 1.6022,
1614
+ "step": 935
1615
+ },
1616
+ {
1617
+ "epoch": 1.88,
1618
+ "grad_norm": 0.656494677066803,
1619
+ "learning_rate": 7.489986648865154e-05,
1620
+ "loss": 1.3404,
1621
+ "step": 940
1622
+ },
1623
+ {
1624
+ "epoch": 1.8900000000000001,
1625
+ "grad_norm": 1.0331978797912598,
1626
+ "learning_rate": 7.423230974632844e-05,
1627
+ "loss": 1.6219,
1628
+ "step": 945
1629
+ },
1630
+ {
1631
+ "epoch": 1.9,
1632
+ "grad_norm": 0.7671311497688293,
1633
+ "learning_rate": 7.356475300400534e-05,
1634
+ "loss": 1.5102,
1635
+ "step": 950
1636
+ },
1637
+ {
1638
+ "epoch": 1.9,
1639
+ "eval_loss": 1.5233145952224731,
1640
+ "eval_runtime": 156.2243,
1641
+ "eval_samples_per_second": 0.333,
1642
+ "eval_steps_per_second": 0.166,
1643
+ "step": 950
1644
+ },
1645
+ {
1646
+ "epoch": 1.9100000000000001,
1647
+ "grad_norm": 0.514800488948822,
1648
+ "learning_rate": 7.289719626168224e-05,
1649
+ "loss": 1.4231,
1650
+ "step": 955
1651
+ },
1652
+ {
1653
+ "epoch": 1.92,
1654
+ "grad_norm": 1.1229217052459717,
1655
+ "learning_rate": 7.222963951935914e-05,
1656
+ "loss": 1.3732,
1657
+ "step": 960
1658
+ },
1659
+ {
1660
+ "epoch": 1.9300000000000002,
1661
+ "grad_norm": 0.7795677185058594,
1662
+ "learning_rate": 7.156208277703606e-05,
1663
+ "loss": 1.242,
1664
+ "step": 965
1665
+ },
1666
+ {
1667
+ "epoch": 1.94,
1668
+ "grad_norm": 0.6444641351699829,
1669
+ "learning_rate": 7.089452603471294e-05,
1670
+ "loss": 1.437,
1671
+ "step": 970
1672
+ },
1673
+ {
1674
+ "epoch": 1.95,
1675
+ "grad_norm": 0.698694109916687,
1676
+ "learning_rate": 7.022696929238986e-05,
1677
+ "loss": 1.4972,
1678
+ "step": 975
1679
+ },
1680
+ {
1681
+ "epoch": 1.95,
1682
+ "eval_loss": 1.5205299854278564,
1683
+ "eval_runtime": 157.1769,
1684
+ "eval_samples_per_second": 0.331,
1685
+ "eval_steps_per_second": 0.165,
1686
+ "step": 975
1687
+ },
1688
+ {
1689
+ "epoch": 1.96,
1690
+ "grad_norm": 0.7859231233596802,
1691
+ "learning_rate": 6.955941255006676e-05,
1692
+ "loss": 1.4448,
1693
+ "step": 980
1694
+ },
1695
+ {
1696
+ "epoch": 1.97,
1697
+ "grad_norm": 0.6304051876068115,
1698
+ "learning_rate": 6.889185580774367e-05,
1699
+ "loss": 1.5132,
1700
+ "step": 985
1701
+ },
1702
+ {
1703
+ "epoch": 1.98,
1704
+ "grad_norm": 0.6905663013458252,
1705
+ "learning_rate": 6.822429906542056e-05,
1706
+ "loss": 1.5612,
1707
+ "step": 990
1708
+ },
1709
+ {
1710
+ "epoch": 1.99,
1711
+ "grad_norm": 0.6258041262626648,
1712
+ "learning_rate": 6.755674232309747e-05,
1713
+ "loss": 1.4587,
1714
+ "step": 995
1715
+ },
1716
+ {
1717
+ "epoch": 2.0,
1718
+ "grad_norm": 0.7253485918045044,
1719
+ "learning_rate": 6.688918558077437e-05,
1720
+ "loss": 1.4498,
1721
+ "step": 1000
1722
+ },
1723
+ {
1724
+ "epoch": 2.0,
1725
+ "eval_loss": 1.5251318216323853,
1726
+ "eval_runtime": 156.2874,
1727
+ "eval_samples_per_second": 0.333,
1728
+ "eval_steps_per_second": 0.166,
1729
+ "step": 1000
1730
+ },
1731
+ {
1732
+ "epoch": 2.01,
1733
+ "grad_norm": 0.5841794013977051,
1734
+ "learning_rate": 6.622162883845127e-05,
1735
+ "loss": 1.5441,
1736
+ "step": 1005
1737
+ },
1738
+ {
1739
+ "epoch": 2.02,
1740
+ "grad_norm": 0.7779578566551208,
1741
+ "learning_rate": 6.555407209612817e-05,
1742
+ "loss": 1.4619,
1743
+ "step": 1010
1744
+ },
1745
+ {
1746
+ "epoch": 2.028,
1747
+ "eval_loss": 1.5253530740737915,
1748
+ "eval_runtime": 168.5711,
1749
+ "eval_samples_per_second": 0.308,
1750
+ "eval_steps_per_second": 0.154,
1751
+ "step": 1014
1752
+ }
1753
+ ],
1754
+ "logging_steps": 5,
1755
+ "max_steps": 1500,
1756
+ "num_input_tokens_seen": 0,
1757
+ "num_train_epochs": 3,
1758
+ "save_steps": 25,
1759
+ "stateful_callbacks": {
1760
+ "TrainerControl": {
1761
+ "args": {
1762
+ "should_epoch_stop": false,
1763
+ "should_evaluate": false,
1764
+ "should_log": false,
1765
+ "should_save": true,
1766
+ "should_training_stop": false
1767
+ },
1768
+ "attributes": {}
1769
+ }
1770
+ },
1771
+ "total_flos": 8.503998201987072e+16,
1772
+ "train_batch_size": 2,
1773
+ "trial_name": null,
1774
+ "trial_params": null
1775
+ }
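
trainer_state.json is plain JSON, so the evaluation-loss curve and the best checkpoint recorded above can be pulled out with a few lines; this is only an illustrative helper, not something shipped in the commit:

```python
# Extract the evaluation-loss curve and best checkpoint from trainer_state.json.
import json

with open("trainer_state.json") as f:
    state = json.load(f)

evals = [(e["step"], e["eval_loss"]) for e in state["log_history"] if "eval_loss" in e]
print(evals[-1])                       # last eval entry: (1014, 1.5253...)
print(state["best_metric"])            # 1.5205299854278564
print(state["best_model_checkpoint"])  # ...checkpoint-975
```
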
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:efdf0602db8b813b15ad61fa70a59474f91c589cd332f4b34932102b391be55f
+ size 5432
vocab.json ADDED
The diff for this file is too large to render. See raw diff