jimboHsueh committed
Commit 4108e31
Parent: e8efc25

End of training

Files changed (41)
  1. checkpoint-1000/README.md +19 -0
  2. checkpoint-1000/adapter_config.json +3 -3
  3. checkpoint-1000/adapter_model.safetensors +2 -2
  4. checkpoint-1000/optimizer.pt +1 -1
  5. checkpoint-1000/rng_state.pth +1 -1
  6. checkpoint-1000/scheduler.pt +1 -1
  7. checkpoint-1000/trainer_state.json +33 -1173
  8. checkpoint-1000/training_args.bin +1 -1
  9. checkpoint-1500/README.md +220 -0
  10. checkpoint-1500/adapter_config.json +23 -0
  11. checkpoint-1500/adapter_model.safetensors +3 -0
  12. checkpoint-1500/optimizer.pt +3 -0
  13. checkpoint-1500/rng_state.pth +3 -0
  14. checkpoint-1500/scheduler.pt +3 -0
  15. checkpoint-1500/trainer_state.json +109 -0
  16. checkpoint-1500/training_args.bin +3 -0
  17. checkpoint-2000/README.md +220 -0
  18. checkpoint-2000/adapter_config.json +23 -0
  19. checkpoint-2000/adapter_model.safetensors +3 -0
  20. checkpoint-2000/optimizer.pt +3 -0
  21. checkpoint-2000/rng_state.pth +3 -0
  22. checkpoint-2000/scheduler.pt +3 -0
  23. checkpoint-2000/trainer_state.json +139 -0
  24. checkpoint-2000/training_args.bin +3 -0
  25. checkpoint-2500/README.md +220 -0
  26. checkpoint-2500/adapter_config.json +23 -0
  27. checkpoint-2500/adapter_model.safetensors +3 -0
  28. checkpoint-2500/optimizer.pt +3 -0
  29. checkpoint-2500/rng_state.pth +3 -0
  30. checkpoint-2500/scheduler.pt +3 -0
  31. checkpoint-2500/trainer_state.json +169 -0
  32. checkpoint-2500/training_args.bin +3 -0
  33. checkpoint-500/README.md +19 -0
  34. checkpoint-500/adapter_config.json +3 -3
  35. checkpoint-500/adapter_model.safetensors +2 -2
  36. checkpoint-500/optimizer.pt +1 -1
  37. checkpoint-500/rng_state.pth +1 -1
  38. checkpoint-500/scheduler.pt +1 -1
  39. checkpoint-500/trainer_state.json +13 -583
  40. checkpoint-500/training_args.bin +1 -1
  41. runs/Nov22_05-17-44_3ed78d352be3/events.out.tfevents.1700630265.3ed78d352be3.3950.0 +3 -0
checkpoint-1000/README.md CHANGED
@@ -236,4 +236,23 @@ The following `bitsandbytes` quantization config was used during training:
 ### Framework versions


+- PEFT 0.6.2
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- quant_method: bitsandbytes
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+
+### Framework versions
+
+
 - PEFT 0.6.2
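For reference, the quantization settings recorded above correspond to a `transformers` `BitsAndBytesConfig` along these lines — a minimal sketch assembled from the listed values, not code taken from this commit:

```python
import torch
from transformers import BitsAndBytesConfig

# 4-bit NF4 quantization config mirroring the model card's bullet list.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    load_in_8bit=False,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    llm_int8_threshold=6.0,
    llm_int8_has_fp16_weight=False,
)
```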
checkpoint-1000/adapter_config.json CHANGED
@@ -12,12 +12,12 @@
   "lora_dropout": 0.1,
   "modules_to_save": null,
   "peft_type": "LORA",
-  "r": 16,
+  "r": 32,
   "rank_pattern": {},
   "revision": null,
   "target_modules": [
-    "q_proj",
-    "v_proj"
+    "v_proj",
+    "q_proj"
   ],
   "task_type": "CAUSAL_LM"
 }
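The config change reads more clearly as the equivalent `peft` `LoraConfig` — a sketch built from the JSON above (the commit itself stores only the JSON; `lora_alpha` is taken from the full adapter_config listed further down):

```python
from peft import LoraConfig

# New adapter settings: LoRA rank doubled from 16 to 32, still targeting
# the attention query/value projections of the causal LM.
lora_config = LoraConfig(
    r=32,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["v_proj", "q_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)
```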
checkpoint-1000/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:90d6b42ca9aab52aa9058699448f12cbfcaf4d7c059b132441ea740b33a4e61d
-size 33571624
+oid sha256:fccc42f58dd6336fe96ad118dab74bd05fc22a9ec9e489b7c256c1d765119072
+size 67126104
checkpoint-1000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6b5914e8a93a3cec1246ecb07f6955a8013d19382d7c7b2b998021a762720631
+oid sha256:e69ae342b4b17dffa0a39133642cb4bbf35a75e266c539638c16718cb4101dbc
 size 1384
checkpoint-1000/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9e1040efdbf1a463e1563467f69831bd595036bc8ae487c51029986464b6bb93
+oid sha256:9916ccdf4a2c96f9abbaa93c24f261f87e3221b0d797f7d306707640c909c82d
 size 14244
checkpoint-1000/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:219c62af0a99d0a3db4c57e4189ef3b142499e30571ad5cc800b8b67c9ce1583
+oid sha256:78da2ad4b9ca31898b45b1311417fe63cc13ba94ce9477e40d7f195675677450
 size 1064
checkpoint-1000/trainer_state.json CHANGED
@@ -9,1211 +9,71 @@
   "is_world_process_zero": true,
   "log_history": [
     {
-      "epoch": 0.0,
-      "learning_rate": 0.0,
-      "loss": 0.0,
-      "step": 5
-    },
[… 198 further removed log entries elided: steps 10–995 at every 5 steps, all with "learning_rate": 0.0 …]
-    {
-      "epoch": 0.8,
-      "learning_rate": 0.0,
-      "loss": 23.323,
-      "step": 1000
-    }
+      "epoch": 0.08,
+      "learning_rate": 0.0002,
+      "loss": 12.1569,
+      "step": 100
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 0.0002,
+      "loss": 0.0485,
+      "step": 200
+    },
+    {
+      "epoch": 0.24,
+      "learning_rate": 0.0002,
+      "loss": 5.1593,
+      "step": 300
+    },
+    {
+      "epoch": 0.32,
+      "learning_rate": 0.0002,
+      "loss": 0.0,
+      "step": 400
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 0.0002,
+      "loss": 0.116,
+      "step": 500
+    },
+    {
+      "epoch": 0.48,
+      "learning_rate": 0.0002,
+      "loss": 0.2684,
+      "step": 600
+    },
+    {
+      "epoch": 0.56,
+      "learning_rate": 0.0002,
+      "loss": 0.0943,
+      "step": 700
+    },
+    {
+      "epoch": 0.64,
+      "learning_rate": 0.0002,
+      "loss": 29.8504,
+      "step": 800
+    },
+    {
+      "epoch": 0.72,
+      "learning_rate": 0.0002,
+      "loss": 0.0173,
+      "step": 900
+    },
+    {
+      "epoch": 0.8,
+      "learning_rate": 0.0002,
+      "loss": 2.6532,
+      "step": 1000
+    }
   ],
-  "logging_steps": 5,
-  "max_steps": 1250,
-  "num_train_epochs": 1,
+  "logging_steps": 100,
+  "max_steps": 2500,
+  "num_train_epochs": 2,
   "save_steps": 500,
-  "total_flos": 8.1294117568512e+16,
+  "total_flos": 9.53873399808e+16,
   "trial_name": null,
   "trial_params": null
 }
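The trainer-state change implies the run was relaunched with different `TrainingArguments`. A hedged sketch of the corresponding fields, mirroring only the values visible in the JSON above (`output_dir` and anything not shown there are assumptions):

```python
from transformers import TrainingArguments

# Sketch reconstructed from trainer_state.json; only the mirrored values
# are grounded in this commit.
args = TrainingArguments(
    output_dir="outputs",  # hypothetical
    learning_rate=2e-4,    # matches "learning_rate": 0.0002 in the logs
    logging_steps=100,     # was 5 in the previous run
    max_steps=2500,        # was 1250; when set, max_steps overrides num_train_epochs
    num_train_epochs=2,    # was 1
    save_steps=500,
)
```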
checkpoint-1000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:35524fda833c38ef50faf45f12fec306c6d60c0132ab7af4aa9b0d8e7122f576
+oid sha256:d1c3a137e083a7c8273d2d727c33c529173121393f53a8a5be425d8a9c723d90
 size 4536
checkpoint-1500/README.md ADDED
@@ -0,0 +1,220 @@
+---
+library_name: peft
+base_model: jimboHsueh/save_hw3
+---
+
+# Model Card for Model ID
+
+<!-- Provide a quick summary of what the model is/does. -->
+
+
+
+## Model Details
+
+### Model Description
+
+<!-- Provide a longer summary of what this model is. -->
+
+
+
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+
+### Model Sources [optional]
+
+<!-- Provide the basic links for the model. -->
+
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+
+## Uses
+
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+### Direct Use
+
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+[More Information Needed]
+
+### Downstream Use [optional]
+
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+[More Information Needed]
+
+### Out-of-Scope Use
+
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+[More Information Needed]
+
+## Bias, Risks, and Limitations
+
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+[More Information Needed]
+
+### Recommendations
+
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+## How to Get Started with the Model
+
+Use the code below to get started with the model.
+
+[More Information Needed]
+
+## Training Details
+
+### Training Data
+
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+[More Information Needed]
+
+### Training Procedure
+
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+#### Preprocessing [optional]
+
+[More Information Needed]
+
+
+#### Training Hyperparameters
+
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+#### Speeds, Sizes, Times [optional]
+
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+[More Information Needed]
+
+## Evaluation
+
+<!-- This section describes the evaluation protocols and provides the results. -->
+
+### Testing Data, Factors & Metrics
+
+#### Testing Data
+
+<!-- This should link to a Dataset Card if possible. -->
+
+[More Information Needed]
+
+#### Factors
+
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+[More Information Needed]
+
+#### Metrics
+
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+[More Information Needed]
+
+### Results
+
+[More Information Needed]
+
+#### Summary
+
+
+
+## Model Examination [optional]
+
+<!-- Relevant interpretability work for the model goes here -->
+
+[More Information Needed]
+
+## Environmental Impact
+
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+
+## Technical Specifications [optional]
+
+### Model Architecture and Objective
+
+[More Information Needed]
+
+### Compute Infrastructure
+
+[More Information Needed]
+
+#### Hardware
+
+[More Information Needed]
+
+#### Software
+
+[More Information Needed]
+
+## Citation [optional]
+
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+**BibTeX:**
+
+[More Information Needed]
+
+**APA:**
+
+[More Information Needed]
+
+## Glossary [optional]
+
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+[More Information Needed]
+
+## More Information [optional]
+
+[More Information Needed]
+
+## Model Card Authors [optional]
+
+[More Information Needed]
+
+## Model Card Contact
+
+[More Information Needed]
+
+
+## Training procedure
+
+
+The following `bitsandbytes` quantization config was used during training:
+- quant_method: bitsandbytes
+- load_in_8bit: False
+- load_in_4bit: True
+- llm_int8_threshold: 6.0
+- llm_int8_skip_modules: None
+- llm_int8_enable_fp32_cpu_offload: False
+- llm_int8_has_fp16_weight: False
+- bnb_4bit_quant_type: nf4
+- bnb_4bit_use_double_quant: True
+- bnb_4bit_compute_dtype: bfloat16
+
+### Framework versions
+
+
+- PEFT 0.6.2
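The card's "How to Get Started" section is still a placeholder. As a hedged sketch (not code from this repo), a checkpoint like this one is typically loaded by pairing the recorded base model and quantization config with `peft`; the local adapter path and device settings below are assumptions:

```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# Recreate the 4-bit NF4 setup recorded in the card, then attach the
# saved LoRA adapter from the checkpoint directory.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
base = AutoModelForCausalLM.from_pretrained(
    "jimboHsueh/save_hw3",  # base_model from the card metadata
    quantization_config=bnb_config,
    device_map="auto",
)
model = PeftModel.from_pretrained(base, "checkpoint-1500")  # hypothetical local path
model.eval()
```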
checkpoint-1500/adapter_config.json ADDED
@@ -0,0 +1,23 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "jimboHsueh/save_hw3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "lora_alpha": 32,
+  "lora_dropout": 0.1,
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM"
+}
checkpoint-1500/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fccc42f58dd6336fe96ad118dab74bd05fc22a9ec9e489b7c256c1d765119072
+size 67126104
checkpoint-1500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e69ae342b4b17dffa0a39133642cb4bbf35a75e266c539638c16718cb4101dbc
+size 1384
checkpoint-1500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7c16774f406d14427ecacf04f255340d4fed60303892856cafcb492a79b0bbe6
+size 14244
checkpoint-1500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78da2ad4b9ca31898b45b1311417fe63cc13ba94ce9477e40d7f195675677450
+size 1064
checkpoint-1500/trainer_state.json ADDED
@@ -0,0 +1,109 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.2,
+  "eval_steps": 500,
+  "global_step": 1500,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.08,
+      "learning_rate": 0.0002,
+      "loss": 12.1569,
+      "step": 100
+    },
+    {
+      "epoch": 0.16,
+      "learning_rate": 0.0002,
+      "loss": 0.0485,
+      "step": 200
+    },
+    {
+      "epoch": 0.24,
+      "learning_rate": 0.0002,
+      "loss": 5.1593,
+      "step": 300
+    },
+    {
+      "epoch": 0.32,
+      "learning_rate": 0.0002,
+      "loss": 0.0,
+      "step": 400
+    },
+    {
+      "epoch": 0.4,
+      "learning_rate": 0.0002,
+      "loss": 0.116,
+      "step": 500
+    },
+    {
+      "epoch": 0.48,
+      "learning_rate": 0.0002,
+      "loss": 0.2684,
+      "step": 600
+    },
+    {
+      "epoch": 0.56,
+      "learning_rate": 0.0002,
+      "loss": 0.0943,
+      "step": 700
+    },
+    {
+      "epoch": 0.64,
+      "learning_rate": 0.0002,
+      "loss": 29.8504,
+      "step": 800
+    },
+    {
+      "epoch": 0.72,
+      "learning_rate": 0.0002,
+      "loss": 0.0173,
+      "step": 900
+    },
+    {
+      "epoch": 0.8,
+      "learning_rate": 0.0002,
+      "loss": 2.6532,
+      "step": 1000
+    },
+    {
+      "epoch": 0.88,
+      "learning_rate": 0.0002,
+      "loss": 0.6314,
+      "step": 1100
+    },
+    {
+      "epoch": 0.96,
+      "learning_rate": 0.0002,
+      "loss": 0.0475,
+      "step": 1200
+    },
+    {
+      "epoch": 1.04,
+      "learning_rate": 0.0002,
+      "loss": 0.0,
+      "step": 1300
+    },
+    {
+      "epoch": 1.12,
+      "learning_rate": 0.0002,
+      "loss": 0.9073,
+      "step": 1400
+    },
+    {
+      "epoch": 1.2,
+      "learning_rate": 0.0002,
+      "loss": 0.0704,
+      "step": 1500
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 2500,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "total_flos": 1.430810099712e+17,
+  "trial_name": null,
+  "trial_params": null
+}
checkpoint-1500/training_args.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d1c3a137e083a7c8273d2d727c33c529173121393f53a8a5be425d8a9c723d90
+size 4536
checkpoint-2000/README.md ADDED
@@ -0,0 +1,220 @@
+[220 lines, verbatim identical to checkpoint-1500/README.md above]
checkpoint-2000/adapter_config.json ADDED
@@ -0,0 +1,23 @@
+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "jimboHsueh/save_hw3",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "lora_alpha": 32,
+  "lora_dropout": 0.1,
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "q_proj"
+  ],
+  "task_type": "CAUSAL_LM"
+}
checkpoint-2000/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fccc42f58dd6336fe96ad118dab74bd05fc22a9ec9e489b7c256c1d765119072
+size 67126104
checkpoint-2000/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e69ae342b4b17dffa0a39133642cb4bbf35a75e266c539638c16718cb4101dbc
+size 1384
checkpoint-2000/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3a52eec92fbc1f4ab56141731bd612dd26c92b5593b2bb1aa837627246de0cda
+size 14244
checkpoint-2000/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:78da2ad4b9ca31898b45b1311417fe63cc13ba94ce9477e40d7f195675677450
+size 1064
checkpoint-2000/trainer_state.json ADDED
@@ -0,0 +1,139 @@
+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 1.6,
+  "eval_steps": 500,
+  "global_step": 2000,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
[… entries for steps 100–1500, verbatim identical to checkpoint-1500/trainer_state.json above, elided …]
+    {
+      "epoch": 1.28,
+      "learning_rate": 0.0002,
+      "loss": 21.957,
+      "step": 1600
+    },
+    {
+      "epoch": 1.36,
+      "learning_rate": 0.0002,
+      "loss": 1247.1927,
+      "step": 1700
+    },
+    {
+      "epoch": 1.44,
+      "learning_rate": 0.0002,
+      "loss": 0.0146,
+      "step": 1800
+    },
+    {
+      "epoch": 1.52,
+      "learning_rate": 0.0002,
+      "loss": 0.6174,
+      "step": 1900
+    },
+    {
+      "epoch": 1.6,
+      "learning_rate": 0.0002,
+      "loss": 1.1367,
+      "step": 2000
+    }
+  ],
+  "logging_steps": 100,
+  "max_steps": 2500,
+  "num_train_epochs": 2,
+  "save_steps": 500,
+  "total_flos": 1.907746799616e+17,
+  "trial_name": null,
+  "trial_params": null
+}
checkpoint-2000/training_args.bin ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d1c3a137e083a7c8273d2d727c33c529173121393f53a8a5be425d8a9c723d90
+size 4536
checkpoint-2500/README.md ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ library_name: peft
3
+ base_model: jimboHsueh/save_hw3
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+
+
+ ## Training procedure
+
+
+ The following `bitsandbytes` quantization config was used during training:
+ - quant_method: bitsandbytes
+ - load_in_8bit: False
+ - load_in_4bit: True
+ - llm_int8_threshold: 6.0
+ - llm_int8_skip_modules: None
+ - llm_int8_enable_fp32_cpu_offload: False
+ - llm_int8_has_fp16_weight: False
+ - bnb_4bit_quant_type: nf4
+ - bnb_4bit_use_double_quant: True
+ - bnb_4bit_compute_dtype: bfloat16
+
+ ### Framework versions
+
+
+ - PEFT 0.6.2
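The quantization settings recorded above map one-to-one onto `transformers`' `BitsAndBytesConfig`. A minimal sketch of reproducing them, assuming `transformers`, `bitsandbytes`, and `torch` are installed; the base-model name is a placeholder, not taken from this commit:

```python
# Minimal sketch: the BitsAndBytesConfig equivalent of the quantization
# settings listed in the model card above.
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # load_in_4bit: True
    bnb_4bit_quant_type="nf4",              # bnb_4bit_quant_type: nf4
    bnb_4bit_use_double_quant=True,         # bnb_4bit_use_double_quant: True
    bnb_4bit_compute_dtype=torch.bfloat16,  # bnb_4bit_compute_dtype: bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    "some/base-model",  # placeholder name, not recorded in this diff
    quantization_config=bnb_config,
)
```

The remaining fields in the list (`llm_int8_threshold: 6.0`, etc.) are the library defaults, so they need not be passed explicitly.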
checkpoint-2500/adapter_config.json ADDED
@@ -0,0 +1,23 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "jimboHsueh/save_hw3",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "lora_alpha": 32,
+ "lora_dropout": 0.1,
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 32,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "q_proj"
+ ],
+ "task_type": "CAUSAL_LM"
+ }
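The adapter config above records a rank-32 LoRA (alpha 32, dropout 0.1) on the `q_proj`/`v_proj` attention projections of `jimboHsueh/save_hw3`. A minimal sketch of attaching this checkpoint's adapter at inference time, assuming the checkpoint directory has been pulled locally; the paths are illustrative:

```python
# Minimal sketch: load the base model, then wrap it with the LoRA adapter
# stored in this checkpoint directory. PEFT reads adapter_config.json and
# adapter_model.safetensors from the given path.
from peft import PeftModel
from transformers import AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("jimboHsueh/save_hw3")
model = PeftModel.from_pretrained(base_model, "checkpoint-2500")
model.eval()
```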
checkpoint-2500/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fccc42f58dd6336fe96ad118dab74bd05fc22a9ec9e489b7c256c1d765119072
+ size 67126104
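The entry above is a git-lfs pointer; once the repo is cloned with LFS it resolves to roughly 67 MB of LoRA weights. A small sketch of listing what the file contains, assuming the `safetensors` package is installed:

```python
# Sketch: enumerate the tensors stored in the adapter checkpoint
# (requires the actual file from a git-lfs clone, not the pointer).
from safetensors.torch import load_file

weights = load_file("checkpoint-2500/adapter_model.safetensors")
for name, tensor in weights.items():
    print(name, tuple(tensor.shape), tensor.dtype)
```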
checkpoint-2500/optimizer.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e69ae342b4b17dffa0a39133642cb4bbf35a75e266c539638c16718cb4101dbc
+ size 1384
checkpoint-2500/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:400541fa451c55ad6260c1903ddda485f18c5c55667971a55dedcd866ca06a1c
+ size 14244
checkpoint-2500/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:78da2ad4b9ca31898b45b1311417fe63cc13ba94ce9477e40d7f195675677450
+ size 1064
checkpoint-2500/trainer_state.json ADDED
@@ -0,0 +1,169 @@
+ {
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.0,
+ "eval_steps": 500,
+ "global_step": 2500,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.08,
+ "learning_rate": 0.0002,
+ "loss": 12.1569,
+ "step": 100
+ },
+ {
+ "epoch": 0.16,
+ "learning_rate": 0.0002,
+ "loss": 0.0485,
+ "step": 200
+ },
+ {
+ "epoch": 0.24,
+ "learning_rate": 0.0002,
+ "loss": 5.1593,
+ "step": 300
+ },
+ {
+ "epoch": 0.32,
+ "learning_rate": 0.0002,
+ "loss": 0.0,
+ "step": 400
+ },
+ {
+ "epoch": 0.4,
+ "learning_rate": 0.0002,
+ "loss": 0.116,
+ "step": 500
+ },
+ {
+ "epoch": 0.48,
+ "learning_rate": 0.0002,
+ "loss": 0.2684,
+ "step": 600
+ },
+ {
+ "epoch": 0.56,
+ "learning_rate": 0.0002,
+ "loss": 0.0943,
+ "step": 700
+ },
+ {
+ "epoch": 0.64,
+ "learning_rate": 0.0002,
+ "loss": 29.8504,
+ "step": 800
+ },
+ {
+ "epoch": 0.72,
+ "learning_rate": 0.0002,
+ "loss": 0.0173,
+ "step": 900
+ },
+ {
+ "epoch": 0.8,
+ "learning_rate": 0.0002,
+ "loss": 2.6532,
+ "step": 1000
+ },
+ {
+ "epoch": 0.88,
+ "learning_rate": 0.0002,
+ "loss": 0.6314,
+ "step": 1100
+ },
+ {
+ "epoch": 0.96,
+ "learning_rate": 0.0002,
+ "loss": 0.0475,
+ "step": 1200
+ },
+ {
+ "epoch": 1.04,
+ "learning_rate": 0.0002,
+ "loss": 0.0,
+ "step": 1300
+ },
+ {
+ "epoch": 1.12,
+ "learning_rate": 0.0002,
+ "loss": 0.9073,
+ "step": 1400
+ },
+ {
+ "epoch": 1.2,
+ "learning_rate": 0.0002,
+ "loss": 0.0704,
+ "step": 1500
+ },
+ {
+ "epoch": 1.28,
+ "learning_rate": 0.0002,
+ "loss": 21.957,
+ "step": 1600
+ },
+ {
+ "epoch": 1.36,
+ "learning_rate": 0.0002,
+ "loss": 1247.1927,
+ "step": 1700
+ },
+ {
+ "epoch": 1.44,
+ "learning_rate": 0.0002,
+ "loss": 0.0146,
+ "step": 1800
+ },
+ {
+ "epoch": 1.52,
+ "learning_rate": 0.0002,
+ "loss": 0.6174,
+ "step": 1900
+ },
+ {
+ "epoch": 1.6,
+ "learning_rate": 0.0002,
+ "loss": 1.1367,
+ "step": 2000
+ },
+ {
+ "epoch": 1.68,
+ "learning_rate": 0.0002,
+ "loss": 280.2191,
+ "step": 2100
+ },
+ {
+ "epoch": 1.76,
+ "learning_rate": 0.0002,
+ "loss": 3.5996,
+ "step": 2200
+ },
+ {
+ "epoch": 1.84,
+ "learning_rate": 0.0002,
+ "loss": 0.0,
+ "step": 2300
+ },
+ {
+ "epoch": 1.92,
+ "learning_rate": 0.0002,
+ "loss": 89.4102,
+ "step": 2400
+ },
+ {
+ "epoch": 2.0,
+ "learning_rate": 0.0002,
+ "loss": 3.0508,
+ "step": 2500
+ }
+ ],
+ "logging_steps": 100,
+ "max_steps": 2500,
+ "num_train_epochs": 2,
+ "save_steps": 500,
+ "total_flos": 2.38468349952e+17,
+ "trial_name": null,
+ "trial_params": null
+ }
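Since `trainer_state.json` is plain JSON, the logged curve above can be inspected directly; note the isolated spikes (e.g. loss 1247.1927 at step 1700 and 280.2191 at step 2100) against many near-zero entries. A minimal sketch, with an illustrative path:

```python
# Minimal sketch: print the logged training losses from this checkpoint's
# trainer_state.json.
import json

with open("checkpoint-2500/trainer_state.json") as f:
    state = json.load(f)

for entry in state["log_history"]:
    print(f"step {entry['step']:>5}  epoch {entry['epoch']:.2f}  loss {entry['loss']}")
```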
checkpoint-2500/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d1c3a137e083a7c8273d2d727c33c529173121393f53a8a5be425d8a9c723d90
+ size 4536
checkpoint-500/README.md CHANGED
@@ -236,4 +236,23 @@ The following `bitsandbytes` quantization config was used during training:
  ### Framework versions
 
 
+ - PEFT 0.6.2
+ ## Training procedure
+
+
+ The following `bitsandbytes` quantization config was used during training:
+ - quant_method: bitsandbytes
+ - load_in_8bit: False
+ - load_in_4bit: True
+ - llm_int8_threshold: 6.0
+ - llm_int8_skip_modules: None
+ - llm_int8_enable_fp32_cpu_offload: False
+ - llm_int8_has_fp16_weight: False
+ - bnb_4bit_quant_type: nf4
+ - bnb_4bit_use_double_quant: True
+ - bnb_4bit_compute_dtype: bfloat16
+
+ ### Framework versions
+
+
  - PEFT 0.6.2
checkpoint-500/adapter_config.json CHANGED
@@ -12,12 +12,12 @@
  "lora_dropout": 0.1,
  "modules_to_save": null,
  "peft_type": "LORA",
- "r": 16,
+ "r": 32,
  "rank_pattern": {},
  "revision": null,
  "target_modules": [
- "q_proj",
- "v_proj"
+ "v_proj",
+ "q_proj"
  ],
  "task_type": "CAUSAL_LM"
  }
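This change doubles the LoRA rank from 16 to 32, which doubles the adapter's trainable parameters and matches `adapter_model.safetensors` growing from 33,571,624 to 67,126,104 bytes in the next file. A back-of-the-envelope sketch with placeholder dimensions (the base model's hidden size and layer count are not recorded in this diff):

```python
# Sketch: LoRA adds r * (d_in + d_out) parameters per adapted weight matrix
# (an r x d_in A-factor plus a d_out x r B-factor), so doubling r doubles
# the adapter size, consistent with the checkpoint file roughly doubling.
def lora_params(d_in: int, d_out: int, r: int) -> int:
    return r * (d_in + d_out)

HIDDEN = 4096   # placeholder hidden size, not read from this repo
LAYERS = 32     # placeholder decoder layer count
TARGETS = 2     # q_proj and v_proj

for r in (16, 32):
    total = LAYERS * TARGETS * lora_params(HIDDEN, HIDDEN, r)
    print(f"r={r}: {total:,} trainable adapter parameters")
```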
checkpoint-500/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:90d6b42ca9aab52aa9058699448f12cbfcaf4d7c059b132441ea740b33a4e61d
- size 33571624
+ oid sha256:fccc42f58dd6336fe96ad118dab74bd05fc22a9ec9e489b7c256c1d765119072
+ size 67126104
checkpoint-500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:6b5914e8a93a3cec1246ecb07f6955a8013d19382d7c7b2b998021a762720631
+ oid sha256:e69ae342b4b17dffa0a39133642cb4bbf35a75e266c539638c16718cb4101dbc
  size 1384
checkpoint-500/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:72008a0b2f157aded0f8d0e98e505d96bab8c0021696c8bd0088ee66a3f83171
+ oid sha256:3c723286f7b841691f53f028eb9e6671f4066b403942269558ec2d70da292ba3
  size 14244
checkpoint-500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:219c62af0a99d0a3db4c57e4189ef3b142499e30571ad5cc800b8b67c9ce1583
+ oid sha256:78da2ad4b9ca31898b45b1311417fe63cc13ba94ce9477e40d7f195675677450
  size 1064
checkpoint-500/trainer_state.json CHANGED
@@ -8,612 +8,42 @@
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
- {
- "epoch": 0.0,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 5
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 10
- },
- {
- "epoch": 0.01,
- "learning_rate": 0.0,
- "loss": 16.3471,
- "step": 15
- },
- {
- "epoch": 0.02,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 20
- },
- {
- "epoch": 0.02,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 25
- },
- {
- "epoch": 0.02,
- "learning_rate": 0.0,
- "loss": 0.4615,
- "step": 30
- },
- {
- "epoch": 0.03,
- "learning_rate": 0.0,
- "loss": 0.194,
- "step": 35
- },
- {
- "epoch": 0.03,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 40
- },
- {
- "epoch": 0.04,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 45
- },
- {
- "epoch": 0.04,
- "learning_rate": 0.0,
- "loss": 0.9551,
- "step": 50
- },
- {
- "epoch": 0.04,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 55
- },
- {
- "epoch": 0.05,
- "learning_rate": 0.0,
- "loss": 0.2261,
- "step": 60
- },
- {
- "epoch": 0.05,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 65
- },
- {
- "epoch": 0.06,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 70
- },
- {
- "epoch": 0.06,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 75
- },
- {
- "epoch": 0.06,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 80
- },
- {
- "epoch": 0.07,
- "learning_rate": 0.0,
- "loss": 1.9869,
- "step": 85
- },
- {
- "epoch": 0.07,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 90
- },
  {
  "epoch": 0.08,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 95
- },
- {
- "epoch": 0.08,
- "learning_rate": 0.0,
- "loss": 1.6293,
+ "learning_rate": 0.0002,
+ "loss": 12.1569,
  "step": 100
  },
- {
- "epoch": 0.08,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 105
- },
- {
- "epoch": 0.09,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 110
- },
- {
- "epoch": 0.09,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 115
- },
- {
- "epoch": 0.1,
- "learning_rate": 0.0,
- "loss": 2.0245,
- "step": 120
- },
- {
- "epoch": 0.1,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 125
- },
- {
- "epoch": 0.1,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 130
- },
- {
- "epoch": 0.11,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 135
- },
- {
- "epoch": 0.11,
- "learning_rate": 0.0,
- "loss": 1.8265,
- "step": 140
- },
- {
- "epoch": 0.12,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 145
- },
- {
- "epoch": 0.12,
- "learning_rate": 0.0,
- "loss": 0.5536,
- "step": 150
- },
- {
- "epoch": 0.12,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 155
- },
- {
- "epoch": 0.13,
- "learning_rate": 0.0,
- "loss": 2.0011,
- "step": 160
- },
- {
- "epoch": 0.13,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 165
- },
- {
- "epoch": 0.14,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 170
- },
- {
- "epoch": 0.14,
- "learning_rate": 0.0,
- "loss": 0.1846,
- "step": 175
- },
- {
- "epoch": 0.14,
- "learning_rate": 0.0,
- "loss": 2.3248,
- "step": 180
- },
- {
- "epoch": 0.15,
- "learning_rate": 0.0,
- "loss": 3.966,
- "step": 185
- },
- {
- "epoch": 0.15,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 190
- },
- {
- "epoch": 0.16,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 195
- },
  {
  "epoch": 0.16,
- "learning_rate": 0.0,
- "loss": 49.8341,
+ "learning_rate": 0.0002,
+ "loss": 0.0485,
  "step": 200
  },
- {
- "epoch": 0.16,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 205
- },
- {
- "epoch": 0.17,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 210
- },
- {
- "epoch": 0.17,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 215
- },
- {
- "epoch": 0.18,
- "learning_rate": 0.0,
- "loss": 4.7387,
- "step": 220
- },
- {
- "epoch": 0.18,
- "learning_rate": 0.0,
- "loss": 27.6091,
- "step": 225
- },
- {
- "epoch": 0.18,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 230
- },
- {
- "epoch": 0.19,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 235
- },
- {
- "epoch": 0.19,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 240
- },
- {
- "epoch": 0.2,
- "learning_rate": 0.0,
- "loss": 0.7769,
- "step": 245
- },
- {
- "epoch": 0.2,
- "learning_rate": 0.0,
- "loss": 1.2832,
- "step": 250
- },
- {
- "epoch": 0.2,
- "learning_rate": 0.0,
- "loss": 28.6253,
- "step": 255
- },
- {
- "epoch": 0.21,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 260
- },
- {
- "epoch": 0.21,
- "learning_rate": 0.0,
- "loss": 1.311,
- "step": 265
- },
- {
- "epoch": 0.22,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 270
- },
- {
- "epoch": 0.22,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 275
- },
- {
- "epoch": 0.22,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 280
- },
- {
- "epoch": 0.23,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 285
- },
- {
- "epoch": 0.23,
- "learning_rate": 0.0,
- "loss": 1.0543,
- "step": 290
- },
- {
- "epoch": 0.24,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 295
- },
  {
  "epoch": 0.24,
- "learning_rate": 0.0,
- "loss": 0.0,
+ "learning_rate": 0.0002,
+ "loss": 5.1593,
  "step": 300
  },
- {
- "epoch": 0.24,
- "learning_rate": 0.0,
- "loss": 4.1379,
- "step": 305
- },
- {
- "epoch": 0.25,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 310
- },
- {
- "epoch": 0.25,
- "learning_rate": 0.0,
- "loss": 0.2084,
- "step": 315
- },
- {
- "epoch": 0.26,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 320
- },
- {
- "epoch": 0.26,
- "learning_rate": 0.0,
- "loss": 0.0999,
- "step": 325
- },
- {
- "epoch": 0.26,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 330
- },
- {
- "epoch": 0.27,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 335
- },
- {
- "epoch": 0.27,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 340
- },
- {
- "epoch": 0.28,
- "learning_rate": 0.0,
- "loss": 0.2186,
- "step": 345
- },
- {
- "epoch": 0.28,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 350
- },
- {
- "epoch": 0.28,
- "learning_rate": 0.0,
- "loss": 0.6567,
- "step": 355
- },
- {
- "epoch": 0.29,
- "learning_rate": 0.0,
- "loss": 0.1897,
- "step": 360
- },
- {
- "epoch": 0.29,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 365
- },
- {
- "epoch": 0.3,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 370
- },
- {
- "epoch": 0.3,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 375
- },
- {
- "epoch": 0.3,
- "learning_rate": 0.0,
- "loss": 1.1873,
- "step": 380
- },
- {
- "epoch": 0.31,
- "learning_rate": 0.0,
- "loss": 0.5393,
- "step": 385
- },
- {
- "epoch": 0.31,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 390
- },
  {
  "epoch": 0.32,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 395
- },
- {
- "epoch": 0.32,
- "learning_rate": 0.0,
+ "learning_rate": 0.0002,
  "loss": 0.0,
  "step": 400
  },
- {
- "epoch": 0.32,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 405
- },
- {
- "epoch": 0.33,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 410
- },
- {
- "epoch": 0.33,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 415
- },
- {
- "epoch": 0.34,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 420
- },
- {
- "epoch": 0.34,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 425
- },
- {
- "epoch": 0.34,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 430
- },
- {
- "epoch": 0.35,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 435
- },
- {
- "epoch": 0.35,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 440
- },
- {
- "epoch": 0.36,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 445
- },
- {
- "epoch": 0.36,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 450
- },
- {
- "epoch": 0.36,
- "learning_rate": 0.0,
- "loss": 0.6623,
- "step": 455
- },
- {
- "epoch": 0.37,
- "learning_rate": 0.0,
- "loss": 7.193,
- "step": 460
- },
- {
- "epoch": 0.37,
- "learning_rate": 0.0,
- "loss": 62.3001,
- "step": 465
- },
- {
- "epoch": 0.38,
- "learning_rate": 0.0,
- "loss": 1.6018,
- "step": 470
- },
- {
- "epoch": 0.38,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 475
- },
- {
- "epoch": 0.38,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 480
- },
- {
- "epoch": 0.39,
- "learning_rate": 0.0,
- "loss": 0.1079,
- "step": 485
- },
- {
- "epoch": 0.39,
- "learning_rate": 0.0,
- "loss": 0.0,
- "step": 490
- },
  {
  "epoch": 0.4,
- "learning_rate": 0.0,
- "loss": 1.2356,
- "step": 495
- },
- {
- "epoch": 0.4,
- "learning_rate": 0.0,
- "loss": 0.0,
+ "learning_rate": 0.0002,
+ "loss": 0.116,
  "step": 500
  }
  ],
- "logging_steps": 5,
- "max_steps": 1250,
- "num_train_epochs": 1,
+ "logging_steps": 100,
+ "max_steps": 2500,
+ "num_train_epochs": 2,
  "save_steps": 500,
- "total_flos": 4.0647058784256e+16,
+ "total_flos": 4.76936699904e+16,
  "trial_name": null,
  "trial_params": null
  }
checkpoint-500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:35524fda833c38ef50faf45f12fec306c6d60c0132ab7af4aa9b0d8e7122f576
+ oid sha256:d1c3a137e083a7c8273d2d727c33c529173121393f53a8a5be425d8a9c723d90
  size 4536
runs/Nov22_05-17-44_3ed78d352be3/events.out.tfevents.1700630265.3ed78d352be3.3950.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:b17e52eb91561bf87b37ab00d2a52848dcc60ef55e2f8726f7dd2ea1a4d609e2
+ size 8845