yuchenlin committed on
Commit
983bc41
•
1 Parent(s): 3c29637

update leaderboards

Browse files
ZeroEval-main/result_dirs/zebra-grid.summary.json CHANGED
@@ -54,6 +54,17 @@
54
  "Total Puzzles": 1000,
55
  "Reason Lens": "1549.74"
56
  },
 
 
 
 
 
 
 
 
 
 
 
57
  {
58
  "Model": "gpt-4-turbo-2024-04-09",
59
  "Mode": "greedy",
@@ -109,6 +120,17 @@
109
  "Total Puzzles": 1000,
110
  "Reason Lens": "1165.90"
111
  },
 
 
 
 
 
 
 
 
 
 
 
112
  {
113
  "Model": "deepseek-chat",
114
  "Mode": "greedy",
@@ -142,6 +164,17 @@
142
  "Total Puzzles": 1000,
143
  "Reason Lens": "1324.55"
144
  },
 
 
 
 
 
 
 
 
 
 
 
145
  {
146
  "Model": "gpt-4o-mini-2024-07-18",
147
  "Mode": "greedy",
@@ -307,6 +340,17 @@
307
  "Total Puzzles": 1000,
308
  "Reason Lens": "1078.29"
309
  },
 
 
 
 
 
 
 
 
 
 
 
310
  {
311
  "Model": "gemma-2-9b-it@nvidia",
312
  "Mode": "greedy",
@@ -439,6 +483,17 @@
439
  "Total Puzzles": 1000,
440
  "Reason Lens": "1473.23"
441
  },
 
 
 
 
 
 
 
 
 
 
 
442
  {
443
  "Model": "Yi-1.5-9B-Chat",
444
  "Mode": "greedy",
 
54
  "Total Puzzles": 1000,
55
  "Reason Lens": "1549.74"
56
  },
57
+ {
58
+ "Model": "Mistral-Large-2",
59
+ "Mode": "greedy",
60
+ "Puzzle Acc": "29.00",
61
+ "Cell Acc": "47.64",
62
+ "No answer": "1.70",
63
+ "Easy Puzzle Acc": "80.36",
64
+ "Hard Puzzle Acc": "9.03",
65
+ "Total Puzzles": 1000,
66
+ "Reason Lens": "1592.39"
67
+ },
68
  {
69
  "Model": "gpt-4-turbo-2024-04-09",
70
  "Mode": "greedy",
 
120
  "Total Puzzles": 1000,
121
  "Reason Lens": "1165.90"
122
  },
123
+ {
124
+ "Model": "Meta-Llama-3.1-70B-Instruct",
125
+ "Mode": "greedy",
126
+ "Puzzle Acc": "24.90",
127
+ "Cell Acc": "27.98",
128
+ "No answer": "43.00",
129
+ "Easy Puzzle Acc": "73.57",
130
+ "Hard Puzzle Acc": "5.97",
131
+ "Total Puzzles": 1000,
132
+ "Reason Lens": "1483.68"
133
+ },
134
  {
135
  "Model": "deepseek-chat",
136
  "Mode": "greedy",
 
164
  "Total Puzzles": 1000,
165
  "Reason Lens": "1324.55"
166
  },
167
+ {
168
+ "Model": "DeepSeek-Coder-V2-0724",
169
+ "Mode": "greedy",
170
+ "Puzzle Acc": "20.50",
171
+ "Cell Acc": "42.35",
172
+ "No answer": "3.40",
173
+ "Easy Puzzle Acc": "61.79",
174
+ "Hard Puzzle Acc": "4.44",
175
+ "Total Puzzles": 1000,
176
+ "Reason Lens": "1230.63"
177
+ },
178
  {
179
  "Model": "gpt-4o-mini-2024-07-18",
180
  "Mode": "greedy",
 
340
  "Total Puzzles": 1000,
341
  "Reason Lens": "1078.29"
342
  },
343
+ {
344
+ "Model": "Meta-Llama-3.1-8B-Instruct",
345
+ "Mode": "greedy",
346
+ "Puzzle Acc": "12.80",
347
+ "Cell Acc": "13.68",
348
+ "No answer": "61.50",
349
+ "Easy Puzzle Acc": "43.57",
350
+ "Hard Puzzle Acc": "0.83",
351
+ "Total Puzzles": 1000,
352
+ "Reason Lens": "1043.90"
353
+ },
354
  {
355
  "Model": "gemma-2-9b-it@nvidia",
356
  "Mode": "greedy",
 
483
  "Total Puzzles": 1000,
484
  "Reason Lens": "1473.23"
485
  },
486
+ {
487
+ "Model": "gemma-2-2b-it",
488
+ "Mode": "greedy",
489
+ "Puzzle Acc": "4.20",
490
+ "Cell Acc": "9.97",
491
+ "No answer": "57.20",
492
+ "Easy Puzzle Acc": "14.29",
493
+ "Hard Puzzle Acc": "0.28",
494
+ "Total Puzzles": 1000,
495
+ "Reason Lens": "1032.89"
496
+ },
497
  {
498
  "Model": "Yi-1.5-9B-Chat",
499
  "Mode": "greedy",
model_info.json CHANGED
@@ -68,5 +68,10 @@
68
  "Llama-3.1-405B-Instruct-Turbo": {"pretty_name": "Llama-3.1-405B-Instruct-Turbo 🚨", "hf_model_id": "meta-llama/Meta-Llama-3.1-405B-Instruct"},
69
  "Mistral-Nemo-Instruct-2407": {"pretty_name": "Mistral-Nemo-Inst (12B) 🚨", "hf_model_id": "Mistral-Nemo-Instruct-2407"},
70
  "Phi-3-mini-4k-instruct": {"pretty_name": "Phi-3-mini-4k-instruct 🚨", "hf_model_id": "microsoft/Phi-3-mini-4k-instruct"},
71
- "Athene-70B": {"pretty_name": "Athene-70B 🚨", "hf_model_id": "Nexusflow/Athene-70B"}
 
 
 
 
 
72
  }
 
68
  "Llama-3.1-405B-Instruct-Turbo": {"pretty_name": "Llama-3.1-405B-Instruct-Turbo 🚨", "hf_model_id": "meta-llama/Meta-Llama-3.1-405B-Instruct"},
69
  "Mistral-Nemo-Instruct-2407": {"pretty_name": "Mistral-Nemo-Inst (12B) 🚨", "hf_model_id": "Mistral-Nemo-Instruct-2407"},
70
  "Phi-3-mini-4k-instruct": {"pretty_name": "Phi-3-mini-4k-instruct 🚨", "hf_model_id": "microsoft/Phi-3-mini-4k-instruct"},
71
+ "Athene-70B": {"pretty_name": "Athene-70B 🚨", "hf_model_id": "Nexusflow/Athene-70B"},
72
+ "Mistral-Large-2": {"pretty_name": "Mistral-Large 2", "hf_model_id": "mistralai/Mistral-Large-Instruct-2407"},
73
+ "Meta-Llama-3.1-8B-Instruct": {"pretty_name": "Llama-3.1-8B-Instruct", "hf_model_id": "meta-llama/Meta-Llama-3.1-8B-Instruct"},
74
+ "Meta-Llama-3.1-70B-Instruct": {"pretty_name": "Llama-3.1-70B-Instruct", "hf_model_id": "meta-llama/Meta-Llama-3.1-70B-Instruct"},
75
+ "gemma-2-2b-it" : {"pretty_name": "Gemma-2-2B-it", "hf_model_id": "google/gemma-2-2b-it"},
76
+ "DeepSeek-Coder-V2-0724": {"pretty_name": "DeepSeek-Coder-V2-0724", "hf_model_id": "deepseek-ai/DeepSeek-Coder-V2-Instruct"}
77
  }