g4rg committed · Commit 386094d · verified · Parent(s): d83e0ab

Model save

Files changed (1): README.md (+191, -0)

---
base_model: unsloth/Mistral-Small-Instruct-2409
library_name: peft
tags:
- axolotl
- generated_from_trainer
model-index:
- name: mistral-small-dampf-qlora
  results: []
---

<!-- This model card has been generated automatically according to the information the Trainer had access to. You
should probably proofread and complete it, then remove this comment. -->

[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
<details><summary>See axolotl config</summary>

axolotl version: `0.4.1`
```yaml
# huggingface-cli login --token $hf_key && wandb login $wandb_key
# python -m axolotl.cli.preprocess ms-creative.yml
# accelerate launch -m axolotl.cli.train ms-creative.yml
# python -m axolotl.cli.merge_lora ms-creative.yml
# huggingface-cli upload Columbidae/ms-type2-creative train-workspace/merged . --private

# Model
base_model: unsloth/Mistral-Small-Instruct-2409
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer

load_in_8bit: false
load_in_4bit: true
strict: false
bf16: true
fp16:
tf32: false
flash_attention: true
special_tokens:

# Output
output_dir: ./ms-creative
hub_model_id: BeaverAI/mistral-small-dampf-qlora
hub_strategy: "checkpoint"
resume_from_checkpoint:
saves_per_epoch: 5

# Data
sequence_len: 16384 # fits
min_sample_len: 128
dataset_prepared_path: last_run_prepared
datasets:
  - path: Dampfinchen/Creative_Writing_Multiturn
    type: custommistralv3
warmup_steps: 20
shuffle_merged_datasets: true
sample_packing: true
pad_to_sequence_len: true

# Batching
num_epochs: 1
gradient_accumulation_steps: 1
micro_batch_size: 5
eval_batch_size: 5

# Evaluation
val_set_size: 100
evals_per_epoch: 5
eval_table_size:
eval_max_new_tokens: 256
eval_sample_packing: false

save_safetensors: true

mlflow_tracking_uri: http://127.0.0.1:7860
mlflow_experiment_name: Default
# WandB
#wandb_project: Mistral-Small-Creative-Multiturn
#wandb_entity:

gradient_checkpointing: 'unsloth'
gradient_checkpointing_kwargs:
  use_reentrant: true

unsloth_cross_entropy_loss: true
#unsloth_lora_mlp: true
#unsloth_lora_qkv: true
#unsloth_lora_o: true

# LoRA
adapter: qlora
lora_model_dir:
lora_r: 64
lora_alpha: 128
lora_dropout: 0.125
lora_target_linear:
lora_fan_in_fan_out:
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj
lora_modules_to_save:

# Optimizer
optimizer: paged_adamw_8bit # adamw_8bit
lr_scheduler: cosine
learning_rate: 0.00005
cosine_min_lr_ratio: 0.1
weight_decay: 0.01
max_grad_norm: 1.0

# Misc
train_on_inputs: false
group_by_length: false
early_stopping_patience:
local_rank:
logging_steps: 1
xformers_attention:
debug:
deepspeed: deepspeed_configs/zero3.json # previously blank
fsdp:
fsdp_config:

plugins:
  - axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_swiglu: true
liger_fused_linear_cross_entropy: true

```

</details><br>

# mistral-small-dampf-qlora

This model is a QLoRA fine-tune of [unsloth/Mistral-Small-Instruct-2409](https://huggingface.co/unsloth/Mistral-Small-Instruct-2409) on the Dampfinchen/Creative_Writing_Multiturn dataset.
It achieves the following results on the evaluation set:
- Loss: 1.0232

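For reference, if the reported loss is mean token cross-entropy, this corresponds to a validation perplexity of roughly exp(1.0232) ≈ 2.78.
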
## Model description

This is a QLoRA adapter (rank 64, alpha 128, dropout 0.125, targeting the attention and MLP projection layers) for Mistral-Small-Instruct-2409, trained with axolotl on a multi-turn creative-writing dataset; see the config above for full details.

## Intended uses & limitations

More information needed

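As a minimal usage sketch (not part of the original card), the adapter can be loaded on top of the base model with `transformers` and `peft`. The 4-bit settings below mirror the QLoRA training config, and the hand-written instruct prompt is an assumption:

```python
# Minimal inference sketch (assumed usage, not from the original card).
# Loads the base model in 4-bit and applies this QLoRA adapter with peft.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

base_id = "unsloth/Mistral-Small-Instruct-2409"
adapter_id = "BeaverAI/mistral-small-dampf-qlora"  # hub_model_id from the config above

bnb = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(base_id)
model = AutoModelForCausalLM.from_pretrained(
    base_id, quantization_config=bnb, device_map="auto"
)
model = PeftModel.from_pretrained(model, adapter_id)

# Assumed Mistral instruct-style prompt; adjust to the template used in training.
prompt = "[INST] Write a short scene set in a rainy harbor town. [/INST]"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=256)
print(tokenizer.decode(out[0], skip_special_tokens=True))
```

Using the tokenizer's bundled chat template (`tokenizer.apply_chat_template`) may be preferable to the hand-written prompt above.
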
## Training and evaluation data

Per the axolotl config above, training used the Dampfinchen/Creative_Writing_Multiturn dataset with a held-out validation split of 100 samples (`val_set_size: 100`).

## Training procedure

### Training hyperparameters

The following hyperparameters were used during training:
- learning_rate: 5e-05
- train_batch_size: 5
- eval_batch_size: 5
- seed: 42
- distributed_type: multi-GPU
- num_devices: 6
- total_train_batch_size: 30
- total_eval_batch_size: 30
- optimizer: Paged AdamW (8-bit) with betas=(0.9,0.999) and epsilon=1e-08
- lr_scheduler_type: cosine
- lr_scheduler_warmup_steps: 20
- num_epochs: 1

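The total train batch size of 30 follows from micro_batch_size (5) × num_devices (6) × gradient_accumulation_steps (1).
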
### Training results

| Training Loss | Epoch  | Step | Validation Loss |
|:-------------:|:------:|:----:|:---------------:|
| 1.477         | 0.0065 | 1    | 1.3211          |
| 1.2338        | 0.2065 | 32   | 1.1156          |
| 1.1973        | 0.4129 | 64   | 1.0707          |
| 1.301         | 0.6194 | 96   | 1.0402          |
| 1.1063        | 0.8258 | 128  | 1.0232          |

### Framework versions

- PEFT 0.13.0
- Transformers 4.45.1
- Pytorch 2.3.1
- Datasets 2.21.0
- Tokenizers 0.20.0