lesso committed
Commit 5cd23e6 · verified · 1 Parent(s): 646835a

End of training

README.md CHANGED
@@ -36,7 +36,7 @@ early_stopping_patience: null
  eval_max_new_tokens: 128
  eval_table_size: null
  evals_per_epoch: 4
- flash_attention: false
+ flash_attention: true
  fp16: true
  fsdp: null
  fsdp_config: null
@@ -96,7 +96,7 @@ xformers_attention: null
 
  This model is a fine-tuned version of [NousResearch/Hermes-2-Theta-Llama-3-8B](https://huggingface.co/NousResearch/Hermes-2-Theta-Llama-3-8B) on the None dataset.
  It achieves the following results on the evaluation set:
- - Loss: 3.3314
+ - Loss: 3.2872
 
  ## Model description
 
@@ -119,8 +119,11 @@ The following hyperparameters were used during training:
  - train_batch_size: 1
  - eval_batch_size: 1
  - seed: 42
+ - distributed_type: multi-GPU
+ - num_devices: 2
  - gradient_accumulation_steps: 4
- - total_train_batch_size: 4
+ - total_train_batch_size: 8
+ - total_eval_batch_size: 2
  - optimizer: Use OptimizerNames.ADAMW_HF with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
  - lr_scheduler_type: cosine
  - lr_scheduler_warmup_steps: 10
@@ -131,10 +134,10 @@ The following hyperparameters were used during training:
 
  | Training Loss | Epoch | Step | Validation Loss |
  |:-------------:|:------:|:----:|:---------------:|
- | 2.8212 | 0.0000 | 1 | 3.4763 |
- | 2.0631 | 0.0001 | 3 | 3.4705 |
- | 2.8174 | 0.0002 | 6 | 3.4303 |
- | 2.2349 | 0.0003 | 9 | 3.3314 |
+ | 2.8063 | 0.0001 | 1 | 3.4762 |
+ | 2.565 | 0.0002 | 3 | 3.4688 |
+ | 3.1689 | 0.0004 | 6 | 3.4170 |
+ | 3.0218 | 0.0006 | 9 | 3.2872 |
 
 
  ### Framework versions
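The jump from total_train_batch_size: 4 to 8 follows directly from the new multi-GPU entries: with a per-device batch of 1 and 4 gradient-accumulation steps, adding a second device doubles the effective batch. A minimal sketch of the arithmetic, using only the values shown in the diff above:

```python
# Sketch of the batch-size bookkeeping behind the updated README values.
# All numbers are taken from the hyperparameter list in the diff above.
train_batch_size = 1             # per-device train batch size
eval_batch_size = 1              # per-device eval batch size
gradient_accumulation_steps = 4
num_devices = 2                  # distributed_type: multi-GPU

# Effective train batch = per-device batch * accumulation steps * devices.
total_train_batch_size = train_batch_size * gradient_accumulation_steps * num_devices
assert total_train_batch_size == 8

# Eval does no gradient accumulation, so only the device count multiplies in.
total_eval_batch_size = eval_batch_size * num_devices
assert total_eval_batch_size == 2
```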
adapter_config.json CHANGED
@@ -20,11 +20,11 @@
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
- "q_proj",
- "gate_proj",
  "down_proj",
  "k_proj",
+ "q_proj",
  "o_proj",
+ "gate_proj",
  "up_proj",
  "v_proj"
  ],
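This hunk only reorders target_modules ("q_proj" and "gate_proj" move within the list; nothing is added or removed), so the same seven Llama-3 projection layers carry LoRA adapters before and after. A hedged sketch of how such a list is typically passed to PEFT's LoraConfig; the rank, alpha, and dropout below are placeholders, not values from this repo:

```python
from peft import LoraConfig

# Sketch only: r, lora_alpha, and lora_dropout are hypothetical placeholders.
# The diff shows just the target_modules list, and its order does not matter.
lora_config = LoraConfig(
    r=16,               # placeholder rank, not from this repo
    lora_alpha=32,      # placeholder scaling, not from this repo
    lora_dropout=0.05,  # placeholder dropout, not from this repo
    target_modules=[
        "down_proj", "k_proj", "q_proj", "o_proj",
        "gate_proj", "up_proj", "v_proj",
    ],
    task_type="CAUSAL_LM",
)
```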
adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:54c9d0f0f450aa4c0f0988c63997aeb4c797013d3d69f54d0c13e80251fce3e4
+ oid sha256:4c105bbd194dd6d97065e1ede060883fa3a33cb7ae3d4bf106390b606e08b078
  size 2185410770
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:1ad7a9c938ab5b00cf94f66c349574d56756741215e58fcd56abab09d577b915
+ oid sha256:bddebe6170f1f0aaa1c00b855be5e6c0057303bfc9b95d4017dd4ab8f0bac0a0
  size 2185308208
training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:4cb41fba4264376a3ccfb8c093be5867221b5fa61290459b0e68d032b827a444
+ oid sha256:2d638e8d3c0cc1e7757dcd32106c2914b72f1485e5fa91782585f316d44ddeb6
  size 6776
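The three binary diffs above touch Git LFS pointer files: the repo stores only a sha256 oid and a byte size, while the actual weights live in LFS storage, so a new checkpoint shows up as a changed oid with an unchanged size. A minimal sketch, assuming a locally downloaded copy of the file, of checking it against its pointer:

```python
import hashlib
import os

def verify_lfs_pointer(path: str, expected_oid: str, expected_size: int) -> bool:
    """Check a downloaded file against its Git LFS pointer (sha256 oid + size)."""
    if os.path.getsize(path) != expected_size:
        return False
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):  # hash in 1 MiB chunks
            h.update(chunk)
    return h.hexdigest() == expected_oid

# Values from the new training_args.bin pointer above.
ok = verify_lfs_pointer(
    "training_args.bin",
    "2d638e8d3c0cc1e7757dcd32106c2914b72f1485e5fa91782585f316d44ddeb6",
    6776,
)
```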