Isaak Carter Augustus committed on
Commit 45fa446
1 Parent(s): 7efb3b0

Update README.md

Files changed (1):
  1. README.md +385 -0
README.md CHANGED
@@ -38,6 +38,391 @@ parameters:
38
  dtype: bfloat16
39
  ```
40
 
41
+ ## Evaluation
42
+
43
+ ```json
44
+ {
45
+ "all": {
46
+ "acc": 0.6403971587643947,
47
+ "acc_stderr": 0.03228725576276735,
48
+ "acc_norm": 0.6413927640714372,
49
+ "acc_norm_stderr": 0.03294011331780708,
50
+ "mc1": 0.39167686658506734,
51
+ "mc1_stderr": 0.017087795881769622,
52
+ "mc2": 0.5576866593959974,
53
+ "mc2_stderr": 0.01554622060467735
54
+ },
55
+ "harness|arc:challenge|25": {
56
+ "acc": 0.6186006825938567,
57
+ "acc_stderr": 0.014194389086685244,
58
+ "acc_norm": 0.6450511945392492,
59
+ "acc_norm_stderr": 0.013983036904094087
60
+ },
61
+ "harness|hellaswag|10": {
62
+ "acc": 0.6738697470623382,
63
+ "acc_stderr": 0.004678375103797962,
64
+ "acc_norm": 0.8499302927703645,
65
+ "acc_norm_stderr": 0.003564098420387764
66
+ },
67
+ "harness|hendrycksTest-abstract_algebra|5": {
68
+ "acc": 0.28,
69
+ "acc_stderr": 0.04512608598542128,
70
+ "acc_norm": 0.28,
71
+ "acc_norm_stderr": 0.04512608598542128
72
+ },
73
+ "harness|hendrycksTest-anatomy|5": {
74
+ "acc": 0.5777777777777777,
75
+ "acc_stderr": 0.04266763404099582,
76
+ "acc_norm": 0.5777777777777777,
77
+ "acc_norm_stderr": 0.04266763404099582
78
+ },
79
+ "harness|hendrycksTest-astronomy|5": {
80
+ "acc": 0.6907894736842105,
81
+ "acc_stderr": 0.037610708698674805,
82
+ "acc_norm": 0.6907894736842105,
83
+ "acc_norm_stderr": 0.037610708698674805
84
+ },
85
+ "harness|hendrycksTest-business_ethics|5": {
86
+ "acc": 0.63,
87
+ "acc_stderr": 0.04852365870939099,
88
+ "acc_norm": 0.63,
89
+ "acc_norm_stderr": 0.04852365870939099
90
+ },
91
+ "harness|hendrycksTest-clinical_knowledge|5": {
92
+ "acc": 0.6830188679245283,
93
+ "acc_stderr": 0.02863723563980089,
94
+ "acc_norm": 0.6830188679245283,
95
+ "acc_norm_stderr": 0.02863723563980089
96
+ },
97
+ "harness|hendrycksTest-college_biology|5": {
98
+ "acc": 0.7708333333333334,
99
+ "acc_stderr": 0.03514697467862388,
100
+ "acc_norm": 0.7708333333333334,
101
+ "acc_norm_stderr": 0.03514697467862388
102
+ },
103
+ "harness|hendrycksTest-college_chemistry|5": {
104
+ "acc": 0.48,
105
+ "acc_stderr": 0.050211673156867795,
106
+ "acc_norm": 0.48,
107
+ "acc_norm_stderr": 0.050211673156867795
108
+ },
109
+ "harness|hendrycksTest-college_computer_science|5": {
110
+ "acc": 0.51,
111
+ "acc_stderr": 0.05024183937956912,
112
+ "acc_norm": 0.51,
113
+ "acc_norm_stderr": 0.05024183937956912
114
+ },
115
+ "harness|hendrycksTest-college_mathematics|5": {
116
+ "acc": 0.38,
117
+ "acc_stderr": 0.04878317312145634,
118
+ "acc_norm": 0.38,
119
+ "acc_norm_stderr": 0.04878317312145634
120
+ },
121
+ "harness|hendrycksTest-college_medicine|5": {
122
+ "acc": 0.6358381502890174,
123
+ "acc_stderr": 0.03669072477416907,
124
+ "acc_norm": 0.6358381502890174,
125
+ "acc_norm_stderr": 0.03669072477416907
126
+ },
127
+ "harness|hendrycksTest-college_physics|5": {
128
+ "acc": 0.4019607843137255,
129
+ "acc_stderr": 0.048786087144669955,
130
+ "acc_norm": 0.4019607843137255,
131
+ "acc_norm_stderr": 0.048786087144669955
132
+ },
133
+ "harness|hendrycksTest-computer_security|5": {
134
+ "acc": 0.74,
135
+ "acc_stderr": 0.0440844002276808,
136
+ "acc_norm": 0.74,
137
+ "acc_norm_stderr": 0.0440844002276808
138
+ },
139
+ "harness|hendrycksTest-conceptual_physics|5": {
140
+ "acc": 0.548936170212766,
141
+ "acc_stderr": 0.032529096196131965,
142
+ "acc_norm": 0.548936170212766,
143
+ "acc_norm_stderr": 0.032529096196131965
144
+ },
145
+ "harness|hendrycksTest-econometrics|5": {
146
+ "acc": 0.4649122807017544,
147
+ "acc_stderr": 0.046920083813689104,
148
+ "acc_norm": 0.4649122807017544,
149
+ "acc_norm_stderr": 0.046920083813689104
150
+ },
151
+ "harness|hendrycksTest-electrical_engineering|5": {
152
+ "acc": 0.5172413793103449,
153
+ "acc_stderr": 0.04164188720169375,
154
+ "acc_norm": 0.5172413793103449,
155
+ "acc_norm_stderr": 0.04164188720169375
156
+ },
157
+ "harness|hendrycksTest-elementary_mathematics|5": {
158
+ "acc": 0.41534391534391535,
159
+ "acc_stderr": 0.025379524910778398,
160
+ "acc_norm": 0.41534391534391535,
161
+ "acc_norm_stderr": 0.025379524910778398
162
+ },
163
+ "harness|hendrycksTest-formal_logic|5": {
164
+ "acc": 0.4444444444444444,
165
+ "acc_stderr": 0.044444444444444495,
166
+ "acc_norm": 0.4444444444444444,
167
+ "acc_norm_stderr": 0.044444444444444495
168
+ },
169
+ "harness|hendrycksTest-global_facts|5": {
170
+ "acc": 0.33,
171
+ "acc_stderr": 0.047258156262526045,
172
+ "acc_norm": 0.33,
173
+ "acc_norm_stderr": 0.047258156262526045
174
+ },
175
+ "harness|hendrycksTest-high_school_biology|5": {
176
+ "acc": 0.7741935483870968,
177
+ "acc_stderr": 0.023785577884181012,
178
+ "acc_norm": 0.7741935483870968,
179
+ "acc_norm_stderr": 0.023785577884181012
180
+ },
181
+ "harness|hendrycksTest-high_school_chemistry|5": {
182
+ "acc": 0.4729064039408867,
183
+ "acc_stderr": 0.03512819077876106,
184
+ "acc_norm": 0.4729064039408867,
185
+ "acc_norm_stderr": 0.03512819077876106
186
+ },
187
+ "harness|hendrycksTest-high_school_computer_science|5": {
188
+ "acc": 0.68,
189
+ "acc_stderr": 0.04688261722621505,
190
+ "acc_norm": 0.68,
191
+ "acc_norm_stderr": 0.04688261722621505
192
+ },
193
+ "harness|hendrycksTest-high_school_european_history|5": {
194
+ "acc": 0.793939393939394,
195
+ "acc_stderr": 0.0315841532404771,
196
+ "acc_norm": 0.793939393939394,
197
+ "acc_norm_stderr": 0.0315841532404771
198
+ },
199
+ "harness|hendrycksTest-high_school_geography|5": {
200
+ "acc": 0.8181818181818182,
201
+ "acc_stderr": 0.027479603010538804,
202
+ "acc_norm": 0.8181818181818182,
203
+ "acc_norm_stderr": 0.027479603010538804
204
+ },
205
+ "harness|hendrycksTest-high_school_government_and_politics|5": {
206
+ "acc": 0.9067357512953368,
207
+ "acc_stderr": 0.02098685459328972,
208
+ "acc_norm": 0.9067357512953368,
209
+ "acc_norm_stderr": 0.02098685459328972
210
+ },
211
+ "harness|hendrycksTest-high_school_macroeconomics|5": {
212
+ "acc": 0.658974358974359,
213
+ "acc_stderr": 0.024035489676335082,
214
+ "acc_norm": 0.658974358974359,
215
+ "acc_norm_stderr": 0.024035489676335082
216
+ },
217
+ "harness|hendrycksTest-high_school_mathematics|5": {
218
+ "acc": 0.32592592592592595,
219
+ "acc_stderr": 0.028578348365473072,
220
+ "acc_norm": 0.32592592592592595,
221
+ "acc_norm_stderr": 0.028578348365473072
222
+ },
223
+ "harness|hendrycksTest-high_school_microeconomics|5": {
224
+ "acc": 0.680672268907563,
225
+ "acc_stderr": 0.030283995525884396,
226
+ "acc_norm": 0.680672268907563,
227
+ "acc_norm_stderr": 0.030283995525884396
228
+ },
229
+ "harness|hendrycksTest-high_school_physics|5": {
230
+ "acc": 0.3576158940397351,
231
+ "acc_stderr": 0.03913453431177258,
232
+ "acc_norm": 0.3576158940397351,
233
+ "acc_norm_stderr": 0.03913453431177258
234
+ },
235
+ "harness|hendrycksTest-high_school_psychology|5": {
236
+ "acc": 0.8348623853211009,
237
+ "acc_stderr": 0.015919557829976037,
238
+ "acc_norm": 0.8348623853211009,
239
+ "acc_norm_stderr": 0.015919557829976037
240
+ },
241
+ "harness|hendrycksTest-high_school_statistics|5": {
242
+ "acc": 0.5138888888888888,
243
+ "acc_stderr": 0.03408655867977749,
244
+ "acc_norm": 0.5138888888888888,
245
+ "acc_norm_stderr": 0.03408655867977749
246
+ },
247
+ "harness|hendrycksTest-high_school_us_history|5": {
248
+ "acc": 0.7892156862745098,
249
+ "acc_stderr": 0.028626547912437413,
250
+ "acc_norm": 0.7892156862745098,
251
+ "acc_norm_stderr": 0.028626547912437413
252
+ },
253
+ "harness|hendrycksTest-high_school_world_history|5": {
254
+ "acc": 0.8143459915611815,
255
+ "acc_stderr": 0.02531049537694486,
256
+ "acc_norm": 0.8143459915611815,
257
+ "acc_norm_stderr": 0.02531049537694486
258
+ },
259
+ "harness|hendrycksTest-human_aging|5": {
260
+ "acc": 0.6995515695067265,
261
+ "acc_stderr": 0.030769352008229146,
262
+ "acc_norm": 0.6995515695067265,
263
+ "acc_norm_stderr": 0.030769352008229146
264
+ },
265
+ "harness|hendrycksTest-human_sexuality|5": {
266
+ "acc": 0.7938931297709924,
267
+ "acc_stderr": 0.03547771004159465,
268
+ "acc_norm": 0.7938931297709924,
269
+ "acc_norm_stderr": 0.03547771004159465
270
+ },
271
+ "harness|hendrycksTest-international_law|5": {
272
+ "acc": 0.768595041322314,
273
+ "acc_stderr": 0.03849856098794088,
274
+ "acc_norm": 0.768595041322314,
275
+ "acc_norm_stderr": 0.03849856098794088
276
+ },
277
+ "harness|hendrycksTest-jurisprudence|5": {
278
+ "acc": 0.7870370370370371,
279
+ "acc_stderr": 0.0395783547198098,
280
+ "acc_norm": 0.7870370370370371,
281
+ "acc_norm_stderr": 0.0395783547198098
282
+ },
283
+ "harness|hendrycksTest-logical_fallacies|5": {
284
+ "acc": 0.7668711656441718,
285
+ "acc_stderr": 0.0332201579577674,
286
+ "acc_norm": 0.7668711656441718,
287
+ "acc_norm_stderr": 0.0332201579577674
288
+ },
289
+ "harness|hendrycksTest-machine_learning|5": {
290
+ "acc": 0.5178571428571429,
291
+ "acc_stderr": 0.047427623612430116,
292
+ "acc_norm": 0.5178571428571429,
293
+ "acc_norm_stderr": 0.047427623612430116
294
+ },
295
+ "harness|hendrycksTest-management|5": {
296
+ "acc": 0.7864077669902912,
297
+ "acc_stderr": 0.040580420156460344,
298
+ "acc_norm": 0.7864077669902912,
299
+ "acc_norm_stderr": 0.040580420156460344
300
+ },
301
+ "harness|hendrycksTest-marketing|5": {
302
+ "acc": 0.8675213675213675,
303
+ "acc_stderr": 0.022209309073165616,
304
+ "acc_norm": 0.8675213675213675,
305
+ "acc_norm_stderr": 0.022209309073165616
306
+ },
307
+ "harness|hendrycksTest-medical_genetics|5": {
308
+ "acc": 0.69,
309
+ "acc_stderr": 0.04648231987117316,
310
+ "acc_norm": 0.69,
311
+ "acc_norm_stderr": 0.04648231987117316
312
+ },
313
+ "harness|hendrycksTest-miscellaneous|5": {
314
+ "acc": 0.8007662835249042,
315
+ "acc_stderr": 0.014283378044296418,
316
+ "acc_norm": 0.8007662835249042,
317
+ "acc_norm_stderr": 0.014283378044296418
318
+ },
319
+ "harness|hendrycksTest-moral_disputes|5": {
320
+ "acc": 0.7225433526011561,
321
+ "acc_stderr": 0.024105712607754307,
322
+ "acc_norm": 0.7225433526011561,
323
+ "acc_norm_stderr": 0.024105712607754307
324
+ },
325
+ "harness|hendrycksTest-moral_scenarios|5": {
326
+ "acc": 0.30726256983240224,
327
+ "acc_stderr": 0.01543015884646961,
328
+ "acc_norm": 0.30726256983240224,
329
+ "acc_norm_stderr": 0.01543015884646961
330
+ },
331
+ "harness|hendrycksTest-nutrition|5": {
332
+ "acc": 0.7320261437908496,
333
+ "acc_stderr": 0.025360603796242557,
334
+ "acc_norm": 0.7320261437908496,
335
+ "acc_norm_stderr": 0.025360603796242557
336
+ },
337
+ "harness|hendrycksTest-philosophy|5": {
338
+ "acc": 0.7138263665594855,
339
+ "acc_stderr": 0.025670259242188933,
340
+ "acc_norm": 0.7138263665594855,
341
+ "acc_norm_stderr": 0.025670259242188933
342
+ },
343
+ "harness|hendrycksTest-prehistory|5": {
344
+ "acc": 0.7283950617283951,
345
+ "acc_stderr": 0.024748624490537368,
346
+ "acc_norm": 0.7283950617283951,
347
+ "acc_norm_stderr": 0.024748624490537368
348
+ },
349
+ "harness|hendrycksTest-professional_accounting|5": {
350
+ "acc": 0.48936170212765956,
351
+ "acc_stderr": 0.029820747191422473,
352
+ "acc_norm": 0.48936170212765956,
353
+ "acc_norm_stderr": 0.029820747191422473
354
+ },
355
+ "harness|hendrycksTest-professional_law|5": {
356
+ "acc": 0.4706649282920469,
357
+ "acc_stderr": 0.01274823839736555,
358
+ "acc_norm": 0.4706649282920469,
359
+ "acc_norm_stderr": 0.01274823839736555
360
+ },
361
+ "harness|hendrycksTest-professional_medicine|5": {
362
+ "acc": 0.6764705882352942,
363
+ "acc_stderr": 0.028418208619406762,
364
+ "acc_norm": 0.6764705882352942,
365
+ "acc_norm_stderr": 0.028418208619406762
366
+ },
367
+ "harness|hendrycksTest-professional_psychology|5": {
368
+ "acc": 0.6764705882352942,
369
+ "acc_stderr": 0.018926082916083376,
370
+ "acc_norm": 0.6764705882352942,
371
+ "acc_norm_stderr": 0.018926082916083376
372
+ },
373
+ "harness|hendrycksTest-public_relations|5": {
374
+ "acc": 0.6636363636363637,
375
+ "acc_stderr": 0.04525393596302506,
376
+ "acc_norm": 0.6636363636363637,
377
+ "acc_norm_stderr": 0.04525393596302506
378
+ },
379
+ "harness|hendrycksTest-security_studies|5": {
380
+ "acc": 0.7387755102040816,
381
+ "acc_stderr": 0.02812342933514278,
382
+ "acc_norm": 0.7387755102040816,
383
+ "acc_norm_stderr": 0.02812342933514278
384
+ },
385
+ "harness|hendrycksTest-sociology|5": {
386
+ "acc": 0.835820895522388,
387
+ "acc_stderr": 0.02619392354445412,
388
+ "acc_norm": 0.835820895522388,
389
+ "acc_norm_stderr": 0.02619392354445412
390
+ },
391
+ "harness|hendrycksTest-us_foreign_policy|5": {
392
+ "acc": 0.83,
393
+ "acc_stderr": 0.0377525168068637,
394
+ "acc_norm": 0.83,
395
+ "acc_norm_stderr": 0.0377525168068637
396
+ },
397
+ "harness|hendrycksTest-virology|5": {
398
+ "acc": 0.5421686746987951,
399
+ "acc_stderr": 0.0387862677100236,
400
+ "acc_norm": 0.5421686746987951,
401
+ "acc_norm_stderr": 0.0387862677100236
402
+ },
403
+ "harness|hendrycksTest-world_religions|5": {
404
+ "acc": 0.8187134502923976,
405
+ "acc_stderr": 0.029547741687640038,
406
+ "acc_norm": 0.8187134502923976,
407
+ "acc_norm_stderr": 0.029547741687640038
408
+ },
409
+ "harness|truthfulqa:mc|0": {
410
+ "mc1": 0.39167686658506734,
411
+ "mc1_stderr": 0.017087795881769622,
412
+ "mc2": 0.5576866593959974,
413
+ "mc2_stderr": 0.01554622060467735
414
+ },
415
+ "harness|winogrande|5": {
416
+ "acc": 0.7884767166535123,
417
+ "acc_stderr": 0.011477747684223188
418
+ },
419
+ "harness|gsm8k|5": {
420
+ "acc": 0.6360879454131918,
421
+ "acc_stderr": 0.013252539227966195
422
+ }
423
+ }
424
+ ```
425
+
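+ As a quick reference for how these per-task numbers roll up, below is a minimal sketch that averages the normalized accuracies of the MMLU (hendrycksTest) subtasks; it assumes the JSON block above has been saved locally under a hypothetical name such as `results.json`.
+
+ ```python
+ import json
+
+ # Load a local copy of the harness results shown above (hypothetical file name).
+ with open("results.json") as f:
+     results = json.load(f)
+
+ # Average acc_norm across all MMLU (hendrycksTest) subtasks.
+ mmlu = [
+     task["acc_norm"]
+     for name, task in results.items()
+     if name.startswith("harness|hendrycksTest-")
+ ]
+ print(f"MMLU acc_norm over {len(mmlu)} subtasks: {sum(mmlu) / len(mmlu):.4f}")
+ ```
+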
426
  ## 💻 Usage
427
 
428
  ```python