dimasik87 committed
Commit 81c0629 · verified · 1 Parent(s): 202452b

End of training

Files changed (2)
  1. README.md +23 -19
  2. adapter_model.bin +2 -2
README.md CHANGED
@@ -44,11 +44,11 @@ eval_max_new_tokens: 128
  eval_table_size: null
  evals_per_epoch: 4
  flash_attention: true
- fp16: true
+ fp16: null
  fsdp: null
  fsdp_config: null
  gradient_accumulation_steps: 4
- gradient_checkpointing: true
+ gradient_checkpointing: false
  group_by_length: false
  hub_model_id: dimasik87/de33f694-50e1-4884-8566-140c2e9ee8b8
  hub_repo: null
@@ -59,30 +59,30 @@ load_in_4bit: false
  load_in_8bit: false
  local_rank: null
  logging_steps: 1
- lora_alpha: 16
- lora_dropout: 0.1
+ lora_alpha: 32
+ lora_dropout: 0.05
  lora_fan_in_fan_out: null
  lora_model_dir: null
- lora_r: 8
+ lora_r: 16
  lora_target_linear: true
  lr_scheduler: cosine
  max_memory:
    0: 70GiB
- max_steps: 25
+ max_steps: 50
  micro_batch_size: 1
  mlflow_experiment_name: /tmp/823d37b8588de3b9_train_data.json
  model_type: AutoModelForCausalLM
- num_epochs: 3
+ num_epochs: 4
  optimizer: adamw_torch
  output_dir: miner_id_24
  pad_to_sequence_len: true
  resume_from_checkpoint: null
  s2_attention: null
  sample_packing: false
- saves_per_epoch: 3
+ saves_per_epoch: 4
  sequence_len: 2028
  strict: false
- tf32: false
+ tf32: true
  tokenizer_type: AutoTokenizer
  train_on_inputs: false
  trust_remote_code: true
@@ -105,7 +105,7 @@ xformers_attention: null
 
  This model is a fine-tuned version of [NousResearch/Hermes-2-Pro-Llama-3-8B](https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B) on the None dataset.
  It achieves the following results on the evaluation set:
- - Loss: 0.2460
+ - Loss: 0.2001
 
  ## Model description
 
@@ -133,21 +133,25 @@ The following hyperparameters were used during training:
  - optimizer: Use OptimizerNames.ADAMW_TORCH with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
  - lr_scheduler_type: cosine
  - lr_scheduler_warmup_steps: 10
- - training_steps: 25
+ - training_steps: 50
 
  ### Training results
 
  | Training Loss | Epoch | Step | Validation Loss |
  |:-------------:|:------:|:----:|:---------------:|
  | 0.9466 | 0.0001 | 1 | 1.1514 |
- | 0.9876 | 0.0003 | 3 | 1.1419 |
- | 1.311 | 0.0005 | 6 | 0.9818 |
- | 0.8303 | 0.0008 | 9 | 0.6860 |
- | 0.4477 | 0.0011 | 12 | 0.4301 |
- | 0.2958 | 0.0014 | 15 | 0.2870 |
- | 0.2894 | 0.0016 | 18 | 0.2600 |
- | 0.1838 | 0.0019 | 21 | 0.2497 |
- | 0.2865 | 0.0022 | 24 | 0.2460 |
+ | 0.886 | 0.0004 | 4 | 1.0636 |
+ | 0.6905 | 0.0007 | 8 | 0.6157 |
+ | 0.3233 | 0.0011 | 12 | 0.2960 |
+ | 0.1823 | 0.0015 | 16 | 0.2611 |
+ | 0.1479 | 0.0018 | 20 | 0.2346 |
+ | 0.2695 | 0.0022 | 24 | 0.2255 |
+ | 0.1645 | 0.0025 | 28 | 0.2141 |
+ | 0.1804 | 0.0029 | 32 | 0.2088 |
+ | 0.2318 | 0.0033 | 36 | 0.2063 |
+ | 0.1563 | 0.0036 | 40 | 0.2020 |
+ | 0.1512 | 0.0040 | 44 | 0.2004 |
+ | 0.1529 | 0.0044 | 48 | 0.2001 |
 
 
  ### Framework versions
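
The LoRA changes in this commit (`lora_r` 8 → 16, `lora_alpha` 16 → 32, `lora_dropout` 0.1 → 0.05, with `lora_target_linear: true` unchanged) map roughly onto the following `peft` configuration. This is a sketch under assumptions, not the config Axolotl builds internally; the `target_modules` list is an assumed expansion of `lora_target_linear: true` for a Llama-3-style model.

```python
# Sketch under assumptions: approximates this commit's LoRA settings with plain peft.
# The target_modules list is an assumed expansion of Axolotl's `lora_target_linear: true`
# for a Llama-3-style architecture; Axolotl resolves the linear layers automatically.
from peft import LoraConfig, TaskType

lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,               # lora_r, raised from 8 in this commit
    lora_alpha=32,      # raised from 16
    lora_dropout=0.05,  # lowered from 0.1
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)
```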
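
The optimizer and schedule lines above (AdamW with betas=(0.9,0.999) and epsilon=1e-08, cosine decay, 10 warmup steps, now 50 training steps) can be approximated outside Axolotl as shown below. The learning rate and the parameters handed to the optimizer are placeholders, since the unchanged parts of the card are not visible in this diff.

```python
# Sketch under assumptions: rebuilds the schedule described in the card
# (AdamW, betas=(0.9, 0.999), eps=1e-08, cosine decay, 10 warmup steps,
# 50 training steps). The learning rate and dummy parameter are placeholders.
import torch
from transformers import get_cosine_schedule_with_warmup

params = [torch.nn.Parameter(torch.zeros(1))]  # stand-in for the trainable adapter weights
optimizer = torch.optim.AdamW(params, lr=1e-4, betas=(0.9, 0.999), eps=1e-8)  # lr is an assumption
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=10,    # lr_scheduler_warmup_steps
    num_training_steps=50,  # training_steps after this commit (was 25)
)

for _ in range(50):  # one scheduler step per optimizer step
    optimizer.step()
    scheduler.step()
```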
adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:bf89dd606ee553bc0bfa09d864ad3b6eb8a44bb971145c53c61722e26da412f3
- size 84047370
+ oid sha256:cc348215c8f42868c0b42c19ff8cb067fb3846b0e10cf94d0cc54cdf760d4a64
+ size 167934026
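
The new LFS pointer shows `adapter_model.bin` roughly doubling (84,047,370 → 167,934,026 bytes), consistent with `lora_r` doubling from 8 to 16 in the same commit. A minimal sketch for loading the updated adapter onto the base model named in the card, assuming a plain `transformers` + `peft` setup (dtype and device placement are assumptions):

```python
# Sketch under assumptions: loads the updated adapter on top of the base model
# named in the card. Repo ids come from the config above; the dtype is an assumption.
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base_id = "NousResearch/Hermes-2-Pro-Llama-3-8B"
adapter_id = "dimasik87/de33f694-50e1-4884-8566-140c2e9ee8b8"

tokenizer = AutoTokenizer.from_pretrained(base_id)
model = AutoModelForCausalLM.from_pretrained(base_id, torch_dtype=torch.bfloat16)
model = PeftModel.from_pretrained(model, adapter_id)
model.eval()
```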