alexredna committed on
Commit
1224d95
1 Parent(s): 0d114dd

Model save

Browse files
README.md ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
4
+ tags:
5
+ - trl
6
+ - sft
7
+ - generated_from_trainer
8
+ datasets:
9
+ - generator
10
+ model-index:
11
+ - name: TinyLlama-1.1B-Chat-v1.0-reasoning-v2
12
+ results: []
13
+ ---
14
+
15
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
16
+ should probably proofread and complete it, then remove this comment. -->
17
+
18
+ # TinyLlama-1.1B-Chat-v1.0-reasoning-v2
19
+
20
+ This model is a fine-tuned version of [TinyLlama/TinyLlama-1.1B-Chat-v1.0](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0) on the dataset listed in this card's metadata as `generator`; the card does not otherwise describe the training data.
21
+ It achieves the following results on the evaluation set:
22
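+ - Loss: 1.0186 (final evaluation at step 1388, epoch 3.99; the lowest validation loss in the training results table below is 1.0176 at epoch 3.0)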
23
+
24
+ ## Model description
25
+
26
+ More information needed
27
+
28
+ ## Intended uses & limitations
29
+
30
+ More information needed
31
+
32
+ ## Training and evaluation data
33
+
34
+ More information needed
35
+
36
+ ## Training procedure
37
+
38
+ ### Training hyperparameters
39
+
40
+ The following hyperparameters were used during training:
41
+ - learning_rate: 4e-05
42
+ - train_batch_size: 10
43
+ - eval_batch_size: 4
44
+ - seed: 42
45
+ - distributed_type: multi-GPU
46
+ - gradient_accumulation_steps: 10
47
+ - total_train_batch_size: 100
48
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
49
+ - lr_scheduler_type: cosine
50
+ - num_epochs: 4
51
+
52
+ ### Training results
53
+
54
+ | Training Loss | Epoch | Step | Validation Loss |
55
+ |:-------------:|:-----:|:----:|:---------------:|
56
+ | 1.0456 | 1.0 | 347 | 1.0360 |
57
+ | 0.9714 | 2.0 | 695 | 1.0180 |
58
+ | 0.9335 | 3.0 | 1042 | 1.0176 |
59
+ | 0.9348 | 3.99 | 1388 | 1.0186 |
60
+
61
+
62
+ ### Framework versions
63
+
64
+ - Transformers 4.36.2
65
+ - Pytorch 2.1.0+cu118
66
+ - Datasets 2.14.6
67
+ - Tokenizers 0.15.0
all_results.json ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.99,
3
+ "eval_loss": 1.0186352729797363,
4
+ "eval_runtime": 19.987,
5
+ "eval_samples": 1771,
6
+ "eval_samples_per_second": 13.959,
7
+ "eval_steps_per_second": 3.502,
8
+ "train_loss": 0.9866050820185747,
9
+ "train_runtime": 37340.4665,
10
+ "train_samples": 195052,
11
+ "train_samples_per_second": 3.722,
12
+ "train_steps_per_second": 0.037
13
+ }
eval_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.99,
3
+ "eval_loss": 1.0186352729797363,
4
+ "eval_runtime": 19.987,
5
+ "eval_samples": 1771,
6
+ "eval_samples_per_second": 13.959,
7
+ "eval_steps_per_second": 3.502
8
+ }
generation_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 1,
3
+ "eos_token_id": 2,
4
+ "max_length": 2048,
5
+ "pad_token_id": 0,
6
+ "transformers_version": "4.36.2"
7
+ }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:59350a8a198760ba9e28e20967e244f25980ca6022129cf9f6ad62cf72523e7b
3
  size 2200119864
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:952296589bea8d6e696a52fccedaab119430294947b362b5b2fb9a4fc2842f08
3
  size 2200119864
runs/Jan05_21-55-57_12c0322bb846/events.out.tfevents.1704492732.12c0322bb846.2137.0 CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e2d4f87da1de3aeca5e9e5d7b5d2435d07867b9a99de250c25fcf0b0b63f4fa7
3
- size 16206
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5de6f9f6ecc4a43d0504931b3847421007f877901312069b22ac511fd9e8581c
3
+ size 16988
runs/Jan05_21-55-57_12c0322bb846/events.out.tfevents.1704530092.12c0322bb846.2137.1 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:342e167a1a34ad3f8f6bf2009bb137b686c9cec915b35284b5fa25946ed64100
3
+ size 359
train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 3.99,
3
+ "train_loss": 0.9866050820185747,
4
+ "train_runtime": 37340.4665,
5
+ "train_samples": 195052,
6
+ "train_samples_per_second": 3.722,
7
+ "train_steps_per_second": 0.037
8
+ }
trainer_state.json ADDED
@@ -0,0 +1,482 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 3.9942446043165467,
5
+ "eval_steps": 500,
6
+ "global_step": 1388,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.0,
13
+ "learning_rate": 3.999994877043978e-05,
14
+ "loss": 1.5749,
15
+ "step": 1
16
+ },
17
+ {
18
+ "epoch": 0.06,
19
+ "learning_rate": 3.997951166621575e-05,
20
+ "loss": 1.2447,
21
+ "step": 20
22
+ },
23
+ {
24
+ "epoch": 0.12,
25
+ "learning_rate": 3.9918088642045126e-05,
26
+ "loss": 1.1636,
27
+ "step": 40
28
+ },
29
+ {
30
+ "epoch": 0.17,
31
+ "learning_rate": 3.981585677303025e-05,
32
+ "loss": 1.1379,
33
+ "step": 60
34
+ },
35
+ {
36
+ "epoch": 0.23,
37
+ "learning_rate": 3.967302551523671e-05,
38
+ "loss": 1.1114,
39
+ "step": 80
40
+ },
41
+ {
42
+ "epoch": 0.29,
43
+ "learning_rate": 3.948988750611294e-05,
44
+ "loss": 1.1086,
45
+ "step": 100
46
+ },
47
+ {
48
+ "epoch": 0.35,
49
+ "learning_rate": 3.9266817964924905e-05,
50
+ "loss": 1.0915,
51
+ "step": 120
52
+ },
53
+ {
54
+ "epoch": 0.4,
55
+ "learning_rate": 3.900427392399429e-05,
56
+ "loss": 1.075,
57
+ "step": 140
58
+ },
59
+ {
60
+ "epoch": 0.46,
61
+ "learning_rate": 3.870279329231546e-05,
62
+ "loss": 1.0875,
63
+ "step": 160
64
+ },
65
+ {
66
+ "epoch": 0.52,
67
+ "learning_rate": 3.836299375346956e-05,
68
+ "loss": 1.0696,
69
+ "step": 180
70
+ },
71
+ {
72
+ "epoch": 0.58,
73
+ "learning_rate": 3.798557150009373e-05,
74
+ "loss": 1.0614,
75
+ "step": 200
76
+ },
77
+ {
78
+ "epoch": 0.63,
79
+ "learning_rate": 3.757129980749847e-05,
80
+ "loss": 1.0638,
81
+ "step": 220
82
+ },
83
+ {
84
+ "epoch": 0.69,
85
+ "learning_rate": 3.712102744935529e-05,
86
+ "loss": 1.0545,
87
+ "step": 240
88
+ },
89
+ {
90
+ "epoch": 0.75,
91
+ "learning_rate": 3.6635676958700946e-05,
92
+ "loss": 1.0508,
93
+ "step": 260
94
+ },
95
+ {
96
+ "epoch": 0.81,
97
+ "learning_rate": 3.611624273782092e-05,
98
+ "loss": 1.0566,
99
+ "step": 280
100
+ },
101
+ {
102
+ "epoch": 0.86,
103
+ "learning_rate": 3.556378902088484e-05,
104
+ "loss": 1.0577,
105
+ "step": 300
106
+ },
107
+ {
108
+ "epoch": 0.92,
109
+ "learning_rate": 3.4979447693508e-05,
110
+ "loss": 1.0428,
111
+ "step": 320
112
+ },
113
+ {
114
+ "epoch": 0.98,
115
+ "learning_rate": 3.436441597370635e-05,
116
+ "loss": 1.0456,
117
+ "step": 340
118
+ },
119
+ {
120
+ "epoch": 1.0,
121
+ "eval_loss": 1.035969614982605,
122
+ "eval_runtime": 20.059,
123
+ "eval_samples_per_second": 13.909,
124
+ "eval_steps_per_second": 3.49,
125
+ "step": 347
126
+ },
127
+ {
128
+ "epoch": 1.04,
129
+ "learning_rate": 3.371995395899618e-05,
130
+ "loss": 1.0082,
131
+ "step": 360
132
+ },
133
+ {
134
+ "epoch": 1.09,
135
+ "learning_rate": 3.304738204466437e-05,
136
+ "loss": 0.9889,
137
+ "step": 380
138
+ },
139
+ {
140
+ "epoch": 1.15,
141
+ "learning_rate": 3.234807821849838e-05,
142
+ "loss": 0.9786,
143
+ "step": 400
144
+ },
145
+ {
146
+ "epoch": 1.21,
147
+ "learning_rate": 3.162347523751894e-05,
148
+ "loss": 0.9881,
149
+ "step": 420
150
+ },
151
+ {
152
+ "epoch": 1.27,
153
+ "learning_rate": 3.0875057692499566e-05,
154
+ "loss": 0.9747,
155
+ "step": 440
156
+ },
157
+ {
158
+ "epoch": 1.32,
159
+ "learning_rate": 3.0104358966287503e-05,
160
+ "loss": 0.9842,
161
+ "step": 460
162
+ },
163
+ {
164
+ "epoch": 1.38,
165
+ "learning_rate": 2.9312958092157724e-05,
166
+ "loss": 0.9846,
167
+ "step": 480
168
+ },
169
+ {
170
+ "epoch": 1.44,
171
+ "learning_rate": 2.850247651863686e-05,
172
+ "loss": 0.9801,
173
+ "step": 500
174
+ },
175
+ {
176
+ "epoch": 1.5,
177
+ "learning_rate": 2.767457478742533e-05,
178
+ "loss": 0.9834,
179
+ "step": 520
180
+ },
181
+ {
182
+ "epoch": 1.55,
183
+ "learning_rate": 2.6830949131224118e-05,
184
+ "loss": 0.9831,
185
+ "step": 540
186
+ },
187
+ {
188
+ "epoch": 1.61,
189
+ "learning_rate": 2.5973327998436527e-05,
190
+ "loss": 0.9787,
191
+ "step": 560
192
+ },
193
+ {
194
+ "epoch": 1.67,
195
+ "learning_rate": 2.5103468511865456e-05,
196
+ "loss": 0.981,
197
+ "step": 580
198
+ },
199
+ {
200
+ "epoch": 1.73,
201
+ "learning_rate": 2.4223152868661535e-05,
202
+ "loss": 0.9845,
203
+ "step": 600
204
+ },
205
+ {
206
+ "epoch": 1.78,
207
+ "learning_rate": 2.3334184688898107e-05,
208
+ "loss": 0.9754,
209
+ "step": 620
210
+ },
211
+ {
212
+ "epoch": 1.84,
213
+ "learning_rate": 2.2438385320254234e-05,
214
+ "loss": 0.9779,
215
+ "step": 640
216
+ },
217
+ {
218
+ "epoch": 1.9,
219
+ "learning_rate": 2.1537590106376758e-05,
220
+ "loss": 0.9737,
221
+ "step": 660
222
+ },
223
+ {
224
+ "epoch": 1.96,
225
+ "learning_rate": 2.0633644626567007e-05,
226
+ "loss": 0.9714,
227
+ "step": 680
228
+ },
229
+ {
230
+ "epoch": 2.0,
231
+ "eval_loss": 1.0180176496505737,
232
+ "eval_runtime": 20.0699,
233
+ "eval_samples_per_second": 13.901,
234
+ "eval_steps_per_second": 3.488,
235
+ "step": 695
236
+ },
237
+ {
238
+ "epoch": 2.01,
239
+ "learning_rate": 1.9728400914496288e-05,
240
+ "loss": 0.9669,
241
+ "step": 700
242
+ },
243
+ {
244
+ "epoch": 2.07,
245
+ "learning_rate": 1.882371366369749e-05,
246
+ "loss": 0.9478,
247
+ "step": 720
248
+ },
249
+ {
250
+ "epoch": 2.13,
251
+ "learning_rate": 1.79214364276071e-05,
252
+ "loss": 0.9458,
253
+ "step": 740
254
+ },
255
+ {
256
+ "epoch": 2.19,
257
+ "learning_rate": 1.702341782194301e-05,
258
+ "loss": 0.9307,
259
+ "step": 760
260
+ },
261
+ {
262
+ "epoch": 2.24,
263
+ "learning_rate": 1.6131497737198942e-05,
264
+ "loss": 0.9435,
265
+ "step": 780
266
+ },
267
+ {
268
+ "epoch": 2.3,
269
+ "learning_rate": 1.5247503569015413e-05,
270
+ "loss": 0.945,
271
+ "step": 800
272
+ },
273
+ {
274
+ "epoch": 2.36,
275
+ "learning_rate": 1.437324647415053e-05,
276
+ "loss": 0.9416,
277
+ "step": 820
278
+ },
279
+ {
280
+ "epoch": 2.42,
281
+ "learning_rate": 1.3510517659721583e-05,
282
+ "loss": 0.9476,
283
+ "step": 840
284
+ },
285
+ {
286
+ "epoch": 2.47,
287
+ "learning_rate": 1.2661084713320093e-05,
288
+ "loss": 0.946,
289
+ "step": 860
290
+ },
291
+ {
292
+ "epoch": 2.53,
293
+ "learning_rate": 1.182668798151939e-05,
294
+ "loss": 0.9414,
295
+ "step": 880
296
+ },
297
+ {
298
+ "epoch": 2.59,
299
+ "learning_rate": 1.1009037004194424e-05,
300
+ "loss": 0.9439,
301
+ "step": 900
302
+ },
303
+ {
304
+ "epoch": 2.65,
305
+ "learning_rate": 1.020980701195946e-05,
306
+ "loss": 0.9486,
307
+ "step": 920
308
+ },
309
+ {
310
+ "epoch": 2.71,
311
+ "learning_rate": 9.430635493899609e-06,
312
+ "loss": 0.949,
313
+ "step": 940
314
+ },
315
+ {
316
+ "epoch": 2.76,
317
+ "learning_rate": 8.673118842628595e-06,
318
+ "loss": 0.9376,
319
+ "step": 960
320
+ },
321
+ {
322
+ "epoch": 2.82,
323
+ "learning_rate": 7.938809083546264e-06,
324
+ "loss": 0.9432,
325
+ "step": 980
326
+ },
327
+ {
328
+ "epoch": 2.88,
329
+ "learning_rate": 7.229210694997113e-06,
330
+ "loss": 0.9457,
331
+ "step": 1000
332
+ },
333
+ {
334
+ "epoch": 2.94,
335
+ "learning_rate": 6.545777525844883e-06,
336
+ "loss": 0.9357,
337
+ "step": 1020
338
+ },
339
+ {
340
+ "epoch": 2.99,
341
+ "learning_rate": 5.889909816778458e-06,
342
+ "loss": 0.9335,
343
+ "step": 1040
344
+ },
345
+ {
346
+ "epoch": 3.0,
347
+ "eval_loss": 1.0176299810409546,
348
+ "eval_runtime": 20.0069,
349
+ "eval_samples_per_second": 13.945,
350
+ "eval_steps_per_second": 3.499,
351
+ "step": 1042
352
+ },
353
+ {
354
+ "epoch": 3.05,
355
+ "learning_rate": 5.262951331452011e-06,
356
+ "loss": 0.937,
357
+ "step": 1060
358
+ },
359
+ {
360
+ "epoch": 3.11,
361
+ "learning_rate": 4.6661866033371506e-06,
362
+ "loss": 0.9351,
363
+ "step": 1080
364
+ },
365
+ {
366
+ "epoch": 3.17,
367
+ "learning_rate": 4.100838303927914e-06,
368
+ "loss": 0.9415,
369
+ "step": 1100
370
+ },
371
+ {
372
+ "epoch": 3.22,
373
+ "learning_rate": 3.5680647376905666e-06,
374
+ "loss": 0.9293,
375
+ "step": 1120
376
+ },
377
+ {
378
+ "epoch": 3.28,
379
+ "learning_rate": 3.0689574688907607e-06,
380
+ "loss": 0.9304,
381
+ "step": 1140
382
+ },
383
+ {
384
+ "epoch": 3.34,
385
+ "learning_rate": 2.604539085160218e-06,
386
+ "loss": 0.9254,
387
+ "step": 1160
388
+ },
389
+ {
390
+ "epoch": 3.4,
391
+ "learning_rate": 2.1757611023850876e-06,
392
+ "loss": 0.9293,
393
+ "step": 1180
394
+ },
395
+ {
396
+ "epoch": 3.45,
397
+ "learning_rate": 1.7835020152084116e-06,
398
+ "loss": 0.9391,
399
+ "step": 1200
400
+ },
401
+ {
402
+ "epoch": 3.51,
403
+ "learning_rate": 1.4285654971409902e-06,
404
+ "loss": 0.9363,
405
+ "step": 1220
406
+ },
407
+ {
408
+ "epoch": 3.57,
409
+ "learning_rate": 1.1116787539682571e-06,
410
+ "loss": 0.9506,
411
+ "step": 1240
412
+ },
413
+ {
414
+ "epoch": 3.63,
415
+ "learning_rate": 8.334910338268054e-07,
416
+ "loss": 0.9226,
417
+ "step": 1260
418
+ },
419
+ {
420
+ "epoch": 3.68,
421
+ "learning_rate": 5.945722970031332e-07,
422
+ "loss": 0.9305,
423
+ "step": 1280
424
+ },
425
+ {
426
+ "epoch": 3.74,
427
+ "learning_rate": 3.9541204817997283e-07,
428
+ "loss": 0.9306,
429
+ "step": 1300
430
+ },
431
+ {
432
+ "epoch": 3.8,
433
+ "learning_rate": 2.3641833352276768e-07,
434
+ "loss": 0.9344,
435
+ "step": 1320
436
+ },
437
+ {
438
+ "epoch": 3.86,
439
+ "learning_rate": 1.1791690466107286e-07,
440
+ "loss": 0.93,
441
+ "step": 1340
442
+ },
443
+ {
444
+ "epoch": 3.91,
445
+ "learning_rate": 4.0150551277724494e-08,
446
+ "loss": 0.9344,
447
+ "step": 1360
448
+ },
449
+ {
450
+ "epoch": 3.97,
451
+ "learning_rate": 3.2786036732557203e-09,
452
+ "loss": 0.9348,
453
+ "step": 1380
454
+ },
455
+ {
456
+ "epoch": 3.99,
457
+ "eval_loss": 1.0186352729797363,
458
+ "eval_runtime": 19.987,
459
+ "eval_samples_per_second": 13.959,
460
+ "eval_steps_per_second": 3.502,
461
+ "step": 1388
462
+ },
463
+ {
464
+ "epoch": 3.99,
465
+ "step": 1388,
466
+ "total_flos": 1.7642090681398723e+18,
467
+ "train_loss": 0.9866050820185747,
468
+ "train_runtime": 37340.4665,
469
+ "train_samples_per_second": 3.722,
470
+ "train_steps_per_second": 0.037
471
+ }
472
+ ],
473
+ "logging_steps": 20,
474
+ "max_steps": 1388,
475
+ "num_input_tokens_seen": 0,
476
+ "num_train_epochs": 4,
477
+ "save_steps": 80,
478
+ "total_flos": 1.7642090681398723e+18,
479
+ "train_batch_size": 10,
480
+ "trial_name": null,
481
+ "trial_params": null
482
+ }