Update README.md
Browse files
README.md
CHANGED
@@ -35,7 +35,123 @@ parameters:
|
|
35 |
value: [1, 0.5, 0.7, 0.3, 0]
|
36 |
- value: 0.5
|
37 |
```
|
|
|
|
|
|
|
38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
## 💻 Usage
|
40 |
|
41 |
```python
|
|
|
35 |
value: [1, 0.5, 0.7, 0.3, 0]
|
36 |
- value: 0.5
|
37 |
```
|
38 |
+
| Model | ARC |HellaSwag| MMLU |TruthfulQA|Winogrande|GSM8K|
|
39 |
+
|---------------------------------------------------------------|----:|--------:|--------------------------|---------:|---------:|----:|
|
40 |
+
|[Sappho_V0.0.4](https://huggingface.co/Jakolo121/Sappho_V0.0.4)|63.65| 84.1|N/A (result file missing)| 52.99| 77.66|55.27|
|
41 |
|
42 |
+
### ARC
|
43 |
+
| Task |Version| Metric | Value | |Stderr|
|
44 |
+
|-------------|------:|--------------------|-------------|---|------|
|
45 |
+
|arc_challenge| 1|acc,none | 0.61| | |
|
46 |
+
| | |acc_stderr,none | 0.01| | |
|
47 |
+
| | |acc_norm,none | 0.64| | |
|
48 |
+
| | |acc_norm_stderr,none| 0.01| | |
|
49 |
+
| | |alias |arc_challenge| | |
|
50 |
+
|
51 |
+
Average: 63.65%
|
52 |
+
|
53 |
+
### HellaSwag
|
54 |
+
| Task |Version| Metric | Value | |Stderr|
|
55 |
+
|---------|------:|--------------------|---------|---|------|
|
56 |
+
|hellaswag| 1|acc,none | 0.66| | |
|
57 |
+
| | |acc_stderr,none | 0| | |
|
58 |
+
| | |acc_norm,none | 0.84| | |
|
59 |
+
| | |acc_norm_stderr,none| 0| | |
|
60 |
+
| | |alias |hellaswag| | |
|
61 |
+
|
62 |
+
Average: 84.1%
|
63 |
+
|
64 |
+
### MMLU
|
65 |
+
|
66 |
+
Average: Not available — the MMLU result file was missing when this report was generated.
|
67 |
+
|
68 |
+
### TruthfulQA
|
69 |
+
| Task |Version| Metric | Value | |Stderr|
|
70 |
+
|--------------|-------|-----------------------|-----------------|---|------|
|
71 |
+
|truthfulqa |N/A |rouge2_max,none | 36.50| | |
|
72 |
+
| | |rouge2_max_stderr,none | 1.02| | |
|
73 |
+
| | |rouge1_max,none | 50.18| | |
|
74 |
+
| | |rouge1_max_stderr,none | 0.88| | |
|
75 |
+
| | |rouge1_acc,none | 0.52| | |
|
76 |
+
| | |rouge1_acc_stderr,none | 0.02| | |
|
77 |
+
| | |bleu_max,none | 25.40| | |
|
78 |
+
| | |bleu_max_stderr,none | 0.81| | |
|
79 |
+
| | |rouge2_acc,none | 0.45| | |
|
80 |
+
| | |rouge2_acc_stderr,none | 0.02| | |
|
81 |
+
| | |rouge2_diff,none | 5.12| | |
|
82 |
+
| | |rouge2_diff_stderr,none| 1.14| | |
|
83 |
+
| | |acc,none | 0.45| | |
|
84 |
+
| | |acc_stderr,none | 0.01| | |
|
85 |
+
| | |bleu_acc,none | 0.52| | |
|
86 |
+
| | |bleu_acc_stderr,none | 0.02| | |
|
87 |
+
| | |rouge1_diff,none | 4.67| | |
|
88 |
+
| | |rouge1_diff_stderr,none| 1.08| | |
|
89 |
+
| | |rougeL_diff,none | 3.92| | |
|
90 |
+
| | |rougeL_diff_stderr,none| 1.09| | |
|
91 |
+
| | |bleu_diff,none | 4| | |
|
92 |
+
| | |bleu_diff_stderr,none | 0.79| | |
|
93 |
+
| | |rougeL_acc,none | 0.50| | |
|
94 |
+
| | |rougeL_acc_stderr,none | 0.02| | |
|
95 |
+
| | |rougeL_max,none | 46.87| | |
|
96 |
+
| | |rougeL_max_stderr,none | 0.91| | |
|
97 |
+
| | |alias |truthfulqa | | |
|
98 |
+
|truthfulqa_gen| 3|bleu_max,none | 25.40| | |
|
99 |
+
| | |bleu_max_stderr,none | 0.81| | |
|
100 |
+
| | |bleu_acc,none | 0.52| | |
|
101 |
+
| | |bleu_acc_stderr,none | 0.02| | |
|
102 |
+
| | |bleu_diff,none | 4| | |
|
103 |
+
| | |bleu_diff_stderr,none | 0.79| | |
|
104 |
+
| | |rouge1_max,none | 50.18| | |
|
105 |
+
| | |rouge1_max_stderr,none | 0.88| | |
|
106 |
+
| | |rouge1_acc,none | 0.52| | |
|
107 |
+
| | |rouge1_acc_stderr,none | 0.02| | |
|
108 |
+
| | |rouge1_diff,none | 4.67| | |
|
109 |
+
| | |rouge1_diff_stderr,none| 1.08| | |
|
110 |
+
| | |rouge2_max,none | 36.50| | |
|
111 |
+
| | |rouge2_max_stderr,none | 1.02| | |
|
112 |
+
| | |rouge2_acc,none | 0.45| | |
|
113 |
+
| | |rouge2_acc_stderr,none | 0.02| | |
|
114 |
+
| | |rouge2_diff,none | 5.12| | |
|
115 |
+
| | |rouge2_diff_stderr,none| 1.14| | |
|
116 |
+
| | |rougeL_max,none | 46.87| | |
|
117 |
+
| | |rougeL_max_stderr,none | 0.91| | |
|
118 |
+
| | |rougeL_acc,none | 0.50| | |
|
119 |
+
| | |rougeL_acc_stderr,none | 0.02| | |
|
120 |
+
| | |rougeL_diff,none | 3.92| | |
|
121 |
+
| | |rougeL_diff_stderr,none| 1.09| | |
|
122 |
+
| | |alias | - truthfulqa_gen| | |
|
123 |
+
|truthfulqa_mc1| 2|acc,none | 0.37| | |
|
124 |
+
| | |acc_stderr,none | 0.02| | |
|
125 |
+
| | |alias | - truthfulqa_mc1| | |
|
126 |
+
|truthfulqa_mc2| 2|acc,none | 0.53| | |
|
127 |
+
| | |acc_stderr,none | 0.02| | |
|
128 |
+
| | |alias | - truthfulqa_mc2| | |
|
129 |
+
|
130 |
+
Average: 52.99%
|
131 |
+
|
132 |
+
### Winogrande
|
133 |
+
| Task |Version| Metric | Value | |Stderr|
|
134 |
+
|----------|------:|---------------|----------|---|------|
|
135 |
+
|winogrande| 1|acc,none | 0.78| | |
|
136 |
+
| | |acc_stderr,none| 0.01| | |
|
137 |
+
| | |alias |winogrande| | |
|
138 |
+
|
139 |
+
Average: 77.66%
|
140 |
+
|
141 |
+
### GSM8K
|
142 |
+
|Task |Version| Metric |Value| |Stderr|
|
143 |
+
|-----|------:|-----------------------------------|-----|---|------|
|
144 |
+
|gsm8k| 3|exact_match,strict-match | 0.55| | |
|
145 |
+
| | |exact_match_stderr,strict-match | 0.01| | |
|
146 |
+
| | |exact_match,flexible-extract | 0.56| | |
|
147 |
+
| | |exact_match_stderr,flexible-extract| 0.01| | |
|
148 |
+
| | |alias |gsm8k| | |
|
149 |
+
|
150 |
+
Average: 55.27%
|
151 |
+
|
152 |
+
Average score: Not available — the MMLU evaluation did not produce a result, so no overall average could be computed.
|
153 |
+
|
154 |
+
Elapsed time: 06:08:53
|
155 |
## 💻 Usage
|
156 |
|
157 |
```python
|