fblgit leaderboard-pr-bot committed on
Commit
b8ac85b
1 Parent(s): f322e92

Adding Evaluation Results (#2)

Browse files

- Adding Evaluation Results (dbc1107d9723204cad879c1ee2a766c718057e25)


Co-authored-by: Open LLM Leaderboard PR Bot <[email protected]>

Files changed (1) hide show
  1. README.md +141 -35
README.md CHANGED
@@ -15,93 +15,185 @@ model-index:
15
  type: text-generation
16
  name: TruthfulQA (MC2)
17
  dataset:
18
- type: text-generation
19
  name: truthful_qa
 
20
  config: multiple_choice
21
  split: validation
22
  metrics:
23
- - type: accuracy
24
- value: 65.13
25
- verified: true
26
  - task:
27
  type: text-generation
28
  name: ARC-Challenge
29
  dataset:
30
- type: text-generation
31
  name: ai2_arc
 
32
  config: ARC-Challenge
33
  split: test
34
  metrics:
35
- - type: accuracy
36
- value: 68.17
37
- verified: true
38
  - task:
39
  type: text-generation
40
  name: HellaSwag
41
  dataset:
42
- type: text-generation
43
  name: Rowan/hellaswag
 
44
  split: test
45
  metrics:
46
- - type: accuracy
47
- value: 85.34
48
- verified: true
 
 
49
  - task:
50
  type: text-generation
51
  name: Winogrande
52
  dataset:
53
- type: text-generation
54
  name: winogrande
 
55
  config: winogrande_debiased
56
  split: test
57
  metrics:
58
- - type: accuracy
59
- value: 78.85
60
- verified: true
61
  - task:
62
  type: text-generation
63
  name: MMLU
64
  dataset:
65
- type: text-generation
66
  name: cais/mmlu
 
67
  config: all
68
  split: test
69
  metrics:
70
- - type: accuracy
71
- value: 62.47
72
- verified: true
73
  - task:
74
  type: text-generation
75
- name: PiQA
76
  dataset:
 
77
  type: text-generation
78
- name: piqa
79
- split: test
80
  metrics:
81
- - type: accuracy
82
- value: 83.57
 
83
  - task:
84
  type: text-generation
85
- name: DROP
86
  dataset:
 
87
  type: text-generation
88
- name: drop
89
  split: validation
90
  metrics:
91
- - type: accuracy
92
- value: 38.74
93
- verified: true
94
  - task:
95
  type: text-generation
96
- name: PubMedQA
97
  dataset:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  type: text-generation
99
- name: bigbio/pubmed_qa
100
- config: pubmed_qa_artificial_bigbio_qa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  split: validation
 
 
102
  metrics:
103
- - type: accuracy
104
- value: 76.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  ---
106
 
107
  # juanako-7b-UNA (Uniform Neural Alignment)
@@ -369,3 +461,17 @@ Thanks to all the brilliant humans behind the creation of AI, here some of the o
369
  archivePrefix={arXiv},
370
  }
371
  ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  type: text-generation
16
  name: TruthfulQA (MC2)
17
  dataset:
 
18
  name: truthful_qa
19
+ type: text-generation
20
  config: multiple_choice
21
  split: validation
22
  metrics:
23
+ - type: accuracy
24
+ value: 65.13
25
+ verified: true
26
  - task:
27
  type: text-generation
28
  name: ARC-Challenge
29
  dataset:
 
30
  name: ai2_arc
31
+ type: text-generation
32
  config: ARC-Challenge
33
  split: test
34
  metrics:
35
+ - type: accuracy
36
+ value: 68.17
37
+ verified: true
38
  - task:
39
  type: text-generation
40
  name: HellaSwag
41
  dataset:
 
42
  name: Rowan/hellaswag
43
+ type: text-generation
44
  split: test
45
  metrics:
46
+ - type: accuracy
47
+ value: 85.34
48
+ verified: true
49
+ - type: accuracy
50
+ value: 83.57
51
  - task:
52
  type: text-generation
53
  name: Winogrande
54
  dataset:
 
55
  name: winogrande
56
+ type: text-generation
57
  config: winogrande_debiased
58
  split: test
59
  metrics:
60
+ - type: accuracy
61
+ value: 78.85
62
+ verified: true
63
  - task:
64
  type: text-generation
65
  name: MMLU
66
  dataset:
 
67
  name: cais/mmlu
68
+ type: text-generation
69
  config: all
70
  split: test
71
  metrics:
72
+ - type: accuracy
73
+ value: 62.47
74
+ verified: true
75
  - task:
76
  type: text-generation
77
+ name: DROP
78
  dataset:
79
+ name: drop
80
  type: text-generation
81
+ split: validation
 
82
  metrics:
83
+ - type: accuracy
84
+ value: 38.74
85
+ verified: true
86
  - task:
87
  type: text-generation
88
+ name: PubMedQA
89
  dataset:
90
+ name: bigbio/pubmed_qa
91
  type: text-generation
92
+ config: pubmed_qa_artificial_bigbio_qa
93
  split: validation
94
  metrics:
95
+ - type: accuracy
96
+ value: 76.0
 
97
  - task:
98
  type: text-generation
99
+ name: Text Generation
100
  dataset:
101
+ name: AI2 Reasoning Challenge (25-Shot)
102
+ type: ai2_arc
103
+ config: ARC-Challenge
104
+ split: test
105
+ args:
106
+ num_few_shot: 25
107
+ metrics:
108
+ - type: acc_norm
109
+ value: 68.17
110
+ name: normalized accuracy
111
+ source:
112
+ url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=fblgit/juanako-7b-UNA
113
+ name: Open LLM Leaderboard
114
+ - task:
115
  type: text-generation
116
+ name: Text Generation
117
+ dataset:
118
+ name: HellaSwag (10-Shot)
119
+ type: hellaswag
120
+ split: validation
121
+ args:
122
+ num_few_shot: 10
123
+ metrics:
124
+ - type: acc_norm
125
+ value: 85.34
126
+ name: normalized accuracy
127
+ source:
128
+ url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=fblgit/juanako-7b-UNA
129
+ name: Open LLM Leaderboard
130
+ - task:
131
+ type: text-generation
132
+ name: Text Generation
133
+ dataset:
134
+ name: MMLU (5-Shot)
135
+ type: cais/mmlu
136
+ config: all
137
+ split: test
138
+ args:
139
+ num_few_shot: 5
140
+ metrics:
141
+ - type: acc
142
+ value: 62.47
143
+ name: accuracy
144
+ source:
145
+ url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=fblgit/juanako-7b-UNA
146
+ name: Open LLM Leaderboard
147
+ - task:
148
+ type: text-generation
149
+ name: Text Generation
150
+ dataset:
151
+ name: TruthfulQA (0-shot)
152
+ type: truthful_qa
153
+ config: multiple_choice
154
  split: validation
155
+ args:
156
+ num_few_shot: 0
157
  metrics:
158
+ - type: mc2
159
+ value: 65.13
160
+ source:
161
+ url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=fblgit/juanako-7b-UNA
162
+ name: Open LLM Leaderboard
163
+ - task:
164
+ type: text-generation
165
+ name: Text Generation
166
+ dataset:
167
+ name: Winogrande (5-shot)
168
+ type: winogrande
169
+ config: winogrande_xl
170
+ split: validation
171
+ args:
172
+ num_few_shot: 5
173
+ metrics:
174
+ - type: acc
175
+ value: 78.85
176
+ name: accuracy
177
+ source:
178
+ url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=fblgit/juanako-7b-UNA
179
+ name: Open LLM Leaderboard
180
+ - task:
181
+ type: text-generation
182
+ name: Text Generation
183
+ dataset:
184
+ name: GSM8k (5-shot)
185
+ type: gsm8k
186
+ config: main
187
+ split: test
188
+ args:
189
+ num_few_shot: 5
190
+ metrics:
191
+ - type: acc
192
+ value: 44.81
193
+ name: accuracy
194
+ source:
195
+ url: https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard?query=fblgit/juanako-7b-UNA
196
+ name: Open LLM Leaderboard
197
  ---
198
 
199
  # juanako-7b-UNA (Uniform Neural Alignment)
 
461
  archivePrefix={arXiv},
462
  }
463
  ```
464
+
465
+ # [Open LLM Leaderboard Evaluation Results](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)
466
+ Detailed results can be found [here](https://huggingface.co/datasets/open-llm-leaderboard/details_fblgit__juanako-7b-UNA)
467
+
468
+ | Metric |Value|
469
+ |---------------------------------|----:|
470
+ |Avg. |67.46|
471
+ |AI2 Reasoning Challenge (25-Shot)|68.17|
472
+ |HellaSwag (10-Shot) |85.34|
473
+ |MMLU (5-Shot) |62.47|
474
+ |TruthfulQA (0-shot) |65.13|
475
+ |Winogrande (5-shot) |78.85|
476
+ |GSM8k (5-shot) |44.81|
477
+