HF_name,subset,task_by_convention,format,comment,seed_paper,september_check,do_train,do_eval,train_size,adjusted_train_size,D3_do_train,D3_do_eval,D3_adjusted_train_size,metric,multiple correct answer,Paper link,non_linguistic_knowledge,skip,Imported Task Name,imported category,input_length,_human_skill,Domain,Reference
crows_pairs,,bias_and_fairness,,test set only; authors themselves acknowledge some problems,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
jigsaw_toxicity_pred,,bias_and_fairness,,current https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data ; want https://www.kaggle.com/c/jigsaw-unintended-bias-in-toxicity-classification,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
super_glue,axg,bias_and_fairness,cls,test set only,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
winogender,,bias_and_fairness,cls,also as axg in super_glue,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
wino_bias,type1_anti,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
wino_bias,type2_anti,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
wino_bias,type1_pro,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
wino_bias,type2_pro,bias_and_fairness,cls,,Eval WG,,,TRUE,,,,,,,,,,,,,,,,
super_glue,wsc.fixed,coreference,cls,,,,,TRUE,554,0,TRUE,TRUE,554,accuracy,,https://arxiv.org/pdf/1905.00537.pdf,,,superglue-wsc,cls/other,single sentence,knowledge-? reading comprehension,,Levesque et al. 2012
winograd_wsc,wsc273,coreference,ext,,GPT,,,TRUE,0,0,,,0,accuracy,,https://www.aaai.org/ocs/index.php/KR/KR12/paper/download/4492/4924,,,,,,,,Levesque et al. 2012
winogrande,winogrande_xl,coreference,ext,,GPT,TRUE,,TRUE,40398,0,,,0,accuracy,,https://arxiv.org/pdf/1907.10641.pdf,,,WinoGrande,qa/multiple-choice qa,,knowledge-? reading comprehension,,Sakaguchi et al. 2020
winogrande,winogrande_debiased,coreference,ext,"""debiased"" = adversarially filtered",GPT,TRUE,,TRUE,9248,0,,,0,accuracy,,https://arxiv.org/pdf/1907.10641.pdf,,,WinoGrande,qa/multiple-choice qa,,knowledge-? reading comprehension,,Sakaguchi et al. 2020
glue,cola,grammatical_acceptability,cls,includes semantic acceptability too; to be replaced by blimp,,,,TRUE,8551,0,,TRUE,0,accuracy;matthews_corrcoef,,https://arxiv.org/pdf/1805.12471.pdf,,,glue-cola,cls/other,single sentence,,,Warstadt et al. 2019
super_glue,cb,NLI,cls,"""for multi-class F1 we compute the unweighted average of the F1 per class.""",,TRUE,,TRUE,250,0,,TRUE,0,mean_multiclass_f1;accuracy,,https://semanticsarchive.net/Archive/Tg3ZGI2M/Marneffe.pdf,,,superglue-cb,cls/nli,sentence pair,knowledge-neutral inference,,de Marneffe et al. 2019
super_glue,rte,NLI,cls,,,TRUE,,TRUE,2490,0,,TRUE,0,accuracy,,https://arxiv.org/pdf/1905.00537.pdf,,,superglue-rte,cls/nli,sentence pair,knowledge modest inference,,Dagan et al. 2005; Bar-Haim et al. 2006 Giampiccolo et al. 2007; Bentivogli et al. 2009
anli,,NLI,cls,"In addition to accuracy, paper also evaluates on range of relaxed/strict and matched/unmatched settings and reports F scores for different answers",,,,TRUE,162865,0,,TRUE,0,accuracy,,https://arxiv.org/abs/1910.14599,,,anli,cls/nli,sentence pair,knowledge modest inference,,Nie et al. 2020
hans,,NLI,cls,,,TRUE,,TRUE,0,0,,TRUE,0,accuracy,,https://arxiv.org/pdf/1902.01007.pdf,,,,,sentence pair,syntax?,,McCoy et al. 2019
super_glue,axb,NLI,cls,test set only,,TRUE,,TRUE,0,0,,,,,,,,,,,,,,
glue,mrpc,paraphrase,cls,,,,TRUE,TRUE,3668,3668,TRUE,TRUE,3668,accuracy;f1_score,,https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/I05-50025B15D.pdf,,,glue-mrpc,cls/paraphrase,,paraphrase,,Dolan and Brockett 2005
glue,qqp,paraphrase,cls,,,,TRUE,TRUE,363846,363846,TRUE,,363846,accuracy;f1_score,,https://aclanthology.org/I05-5002.pdf,,,glue-qqp,cls/paraphrase,,,,(link)
paws,labeled_final,paraphrase,cls,,,,TRUE,,49401,49401,TRUE,,49401,,,,,,paws,cls/paraphrase,,,,Zhang et al. 2019
ai2_arc,ARC-Challenge,QA_closed_book,cls,,GPT,,,TRUE,1119,0,TRUE,,1119,"accuracy_with_tie : For each question, a system receives 1 point if it
chooses the correct answer and 1/k if it reports a k-way tie
(i.e., chooses multiple answers) that includes the correct answer.",,https://arxiv.org/pdf/1803.05457.pdf,mid-intensive,,ARC (chal.),qa/multiple-choice qa,,nontrivial_comprehension,,Clark et al. 2018
ai2_arc,ARC-Easy,QA_closed_book,cls,,GPT,,,TRUE,2251,0,TRUE,,2251,"accuracy_with_tie: For each question, a system receives 1 point if it
chooses the correct answer and 1/k if it reports a k-way tie
(i.e., chooses multiple answers) that includes the correct answer.",,https://arxiv.org/pdf/1803.05457.pdf,mid-intensive,,ARC (easy),Multiple choice,,,,
nq_open,,QA_closed_book,gen,,GPT,TRUE,,TRUE,87925,0,,TRUE,0,kilt-exact_match;average_accuracy_accross_answers,TRUE,https://direct.mit.edu/tacl/article/doi/10.1162/tacl_a_00276/43518/Natural-Questions-A-Benchmark-for-Question,intensive,,Natural Questions (open domain),,,trivia,,
kilt_tasks,hotpotqa,QA_closed_book,gen,recast as closed-book due to input length,self,,TRUE,,88869,88869,,,,,,,,,kilt hotpotqa,qa/closed-book qa,,encyclopedia; multi-hop QA,,Yang et al. 2018
trivia_qa,unfiltered,QA_closed_book,gen,,GPT,TRUE,,TRUE,87622,0,TRUE,,87622,exact_match;f1_over_words => wikipedia aliases are considered valid answers,TRUE,https://arxiv.org/pdf/1705.03551.pdf,intensive,,Trivia QA,,,,,
web_questions,,QA_closed_book,gen,"""supposed to be answerable by Freebase"" Check corpora deduplication with freebaseqa.",GPT,,,TRUE,3778,0,TRUE,,3778,accuracy : they don't mention how they normalize across multiple correct answers,TRUE,https://aclanthology.org/D13-1160.pdf,intensive,,web questions,qa/closed-book qa,,,,Berant et al. 2013
wiki_qa,,QA_closed_book,cls,,CrossFit,,TRUE,,20360,20360,,,,,,https://aclanthology.org/D15-1237.pdf,,,wiki qa,cls/other,,,,Yang et al. 2015
adversarial_qa,dbidaf,QA_extractive,ext,,,TRUE,TRUE,,10000,10000,TRUE,,10000,,,https://aclanthology.org/2020.tacl-1.43/,,,adversarialqa,qa/machine reading comprehension,,,,Bartolo et al. 2020
adversarial_qa,dbert,QA_extractive,ext,,,TRUE,TRUE,,10000,10000,TRUE,,10000,,,,,,,,,,,
adversarial_qa,droberta,QA_extractive,ext,,,TRUE,TRUE,,10000,10000,TRUE,,10000,,,,,,,,,,,
coqa,,QA_extractive,ext,GPT-easy,GPT,,,TRUE,7199,,,,,"macro_average_f1: for computing a model’s performance, each individual prediction is compared
against n human answers resulting in n F1 scores,
the maximum of which is chosen as the prediction’s
F1.For each question, we average out F1 across
these n sets, both for humans and models. In our
final evaluation, we use n = 4 human answers for
every question (the original answer and 3 additionally collected answers). The articles a, an and the
and punctuations are excluded in evaluation.",from the paper it seems it could contain multiple answers but the datasets has only one answer per question,https://arxiv.org/pdf/1808.07042.pdf,,,,,,,,
duorc,SelfRC,QA_extractive,ext,,TaskEmbed;CrossFit,,TRUE,,60721,60721,,,,,,https://duorc.github.io/,,,DuoRC,qa/machine reading comprehension,,,Wikipedia/IMDB crowd,Saha et al. 2018
duorc,ParaphraseRC,QA_extractive,ext,,TaskEmbed;CrossFit,,TRUE,,69524,69524,,,,,,https://arxiv.org/pdf/1804.07927.pdf,,,DuoRC,paraphrased QA,,,,Saha et al. 2018
ropes,,QA_extractive,ext,,,TRUE,TRUE,,10924,10924,TRUE,,10924,,,,modest,,ropes,Extractive QA,,cause_and_effect;nontrivial_comprehension,,Lin et al. 2019
squad_v2,,QA_extractive,ext,,GPT,,,TRUE,130319,0,TRUE,,130319,exact_match;f1_score,TRUE,https://arxiv.org/pdf/1806.03822.pdf,,,SQuAD 2.0,Extractive QA,,,,Rajpurkar et al. 2018
super_glue,record,QA_extractive,ext,,,TRUE,,TRUE,100730,0,TRUE,TRUE,100730,max_token_level_f1;exact_match,TRUE,https://arxiv.org/pdf/1810.12885.pdf,,,superglue-record,qa/machine reading comprehension,,knowledge-? reading comprehension,,Zhang et al. 2018
qa_srl,,QA_extractive,ext,"need non-naive metric (""If the predicted word is contained inside the annotated answer span it is considered a correct prediction.""); v2 not in HF https://aclanthology.org/P18-1191.pdf",Eval WG,,,TRUE,6414,0,TRUE,TRUE,6414,accuracy,TRUE,https://dada.cs.washington.edu/qasrl/#page-top,neutral,,qa srl,other,,semantic role,,He et al. 2015
quac,,QA_extractive,ext,,GPT,,,TRUE,11567,,,,,"average_maximum_f1;HEQ-Q;HEQ-D:  To make oracle human and system performance comparable,
given n references, we report the average of the
maximum F1 computed from each n − 1 subset
with respect to the heldout reference.",TRUE,https://arxiv.org/pdf/1808.07036.pdf,,,,,,dialogue,,
quoref,,QA_extractive,ext,,,TRUE,TRUE,,19399,19399,TRUE,,19399,,,https://aclanthology.org/D19-1606.pdf,,,Quoref,Extractive QA,,,,Dasigi et al. 2019
tydiqa,,QA_extractive,ext,,Eval WG,,TRUE,,9211,9211,,,,,,,,,,,,,,
drop,,QA_generative,gen,"nontrivial math; try history_690, it's pretty hard even when I have domain knowledge",GPT,TRUE,,TRUE,,,,,,exact_match; macro_average_f1,TRUE,https://aclanthology.org/N19-1246.pdf,,,DROP ,multi-hop quantitative reasoning; Abstractive QA,,numerical,Wikipedia crowd,Dua et al. 2019
cos_e,v1.11,QA_multiple_choice,cls,"same as commonsense_qa but with (poorly sourced) human explanations; questionable ""commonsense"" lots of world knowledge",Vania,TRUE,TRUE,,9741,9741,TRUE,,9741,,,,,,cos e,other/generate explanation,,,,Rajani et al. 2019
cosmos_qa,,QA_multiple_choice,cls,,,TRUE,TRUE,,25262,25262,TRUE,,25262,,,,,,cosmos qa,qa/multiple-choice qa,,,,Huang et al. 2019
dream,,QA_multiple_choice,cls,,,TRUE,TRUE,,6116,6116,TRUE,,6116,,,,,,dream,qa/multiple-choice qa,,,,Sun et al. 2019
openbookqa,main,QA_multiple_choice,cls,interesting combo of pragmatics + scientific reasoning,GPT,,,TRUE,4957,0,TRUE,TRUE,4957,"accuracy_with_tie : For each question, a system receives 1 point if it
chooses the correct answer and 1/k if it reports a k-way tie
(i.e., chooses multiple answers) that includes the correct answer.",,https://aclanthology.org/D18-1260.pdf,modest,,openbookqa,qa/multiple-choice qa,,pragmatics,,Mihaylov et al. 2018
qasc,,QA_multiple_choice,cls,,,TRUE,TRUE,,8134,8134,TRUE,,8134,,,,given?,,qasc,qa/multiple-choice qa,,,,Khot et al. 2020
quail,,QA_multiple_choice,cls,,,TRUE,TRUE,,10246,10246,TRUE,,10246,,,,,,quail,qa/multiple-choice qa,,,,Rogers et al. 2020
quarel,,QA_multiple_choice,cls,,CrossFit,,TRUE,,1941,1941,,,,,,,,,quarel,qa/multiple-choice qa,,logical form,,Tafjord et al. 2019a
quartz,,QA_multiple_choice,cls,,,TRUE,TRUE,,2696,2696,TRUE,,2696,,,https://aclanthology.org/D19-1608.pdf,given?,,quartz-with knowledge,qa/multiple-choice qa,,,,Tafjord et al. 2019b
race,high,QA_multiple_choice,cls,GPT-hard,GPT,,,TRUE,62445,0,TRUE,TRUE,62445,accuracy,,https://arxiv.org/pdf/1704.04683.pdff,neutral,,race-high,qa/multiple-choice qa,,knowledge-neutral reading comprehension,,Lai et al. 2017
race,middle,QA_multiple_choice,cls,"revisit: define as comprehension, paragraph level?",GPT,,,TRUE,25421,0,TRUE,TRUE,25421,accuracy,,https://arxiv.org/pdf/1704.04683.pdf,neutral,,race-middle,qa/multiple-choice qa,,knowledge-neutral reading comprehension,,Lai et al. 2017
sciq,,QA_multiple_choice,cls,,,TRUE,TRUE,,11679,11679,TRUE,,11679,,,,,,sciq,qa/multiple-choice qa,,,,Welbl et al. 2017
social_i_qa,,QA_multiple_choice,cls,metric differ by prompt: 4-way classification cast as binary ,,TRUE,TRUE,TRUE,33410,33410,TRUE,TRUE,33410,accuracy,,https://arxiv.org/pdf/1904.09728.pdf,,,SIQA,qa/multiple-choice qa,,cultural knowledge,,Sap et al. 2019
super_glue,boolq,QA_multiple_choice,cls,,,TRUE,,TRUE,9427,0,TRUE,TRUE,9427,accuracy,,https://arxiv.org/pdf/1905.10044.pdf,neutral?,,superglue-boolq,,,knowledge-? reading comprehension,,
super_glue,copa,QA_multiple_choice,cls,,,TRUE,,TRUE,400,0,TRUE,TRUE,400,accuracy,,http://commonsensereasoning.org/2011/papers/Roemmele.pdf,modest,,superglue-copa,qa/multiple-choice qa,,causal cognition,,Gordon et al. 2012
super_glue,multirc,QA_multiple_choice,cls,F1 over all answer options. See paper p. 259 for defintion,,TRUE,,TRUE,27243,0,TRUE,TRUE,27243,f1_over_all_options;exact_match,,https://aclanthology.org/N18-1023.pdf,neutral?,,superglue-multirc,qa/multiple-choice qa,,knowledge-? reading comprehension,,Khashabi et al. 2018
wiki_hop,original,QA_multiple_choice,cls,,,TRUE,TRUE,,43738,43738,TRUE,,43738,,,https://transacl.org/ojs/index.php/tacl/article/viewFile/1325/299,,,WikiHop (Welbl et al. 2018),multi-hop QA,,,Wikipedia KB,
wiqa,,QA_multiple_choice,cls,,,TRUE,TRUE,,29808,29808,TRUE,,29808,,,,,,wiqa,qa/multiple-choice qa,,cause_and_effect,,Tandon et al. 2019
circa,,QA_multiple_choice,cls,revisit: problematic prompts,,,,TRUE,34268,0,,TRUE,0,mean_multiclass_f1;accuracy,,https://arxiv.org/pdf/2010.03450.pdf,,,circa,cls/other,,pragmatics,,Louis et al. 2020
mc_taco,,QA_multiple_choice,cls,no train set; variable number of answer_chocies; eval in paper is over set of possible candidates;,,,,TRUE,0,0,,TRUE,0,exact_match; f1_score,,https://arxiv.org/pdf/1909.03065.pdf,,,mc taco,qa/binary,,temporal cognition,,Zhou et al. 2019
piqa,,QA_multiple_choice,cls,revisit: not just other,GPT,,,TRUE,16113,0,TRUE,,16113,accuracy,,https://arxiv.org/pdf/1911.11641.pdf,,,PIQA,Multiple choice,,physical_cognition,,Bisk et al. 2020
amazon_polarity,,sentiment,cls,,,TRUE,TRUE,,3600000,500000,TRUE,,500000,,,https://cs.stanford.edu/people/jure/pubs/reviews-recsys13.pdf,,,amazon polarity,cls/sentiment analysis,,,,McAuley and Leskovec 2013
app_reviews,,sentiment,cls,,,TRUE,TRUE,,288065,288065,TRUE,,288065,,,,,,app reviews,other/regression,,,,Missing
imdb,,sentiment,cls,,,TRUE,TRUE,,25000,25000,TRUE,,25000,,,,,,imdb,cls/sentiment analysis,,no dev set,,Maas et al. 2011
rotten_tomatoes,,sentiment,cls,,,TRUE,TRUE,,8530,8530,TRUE,,8530,,,,,,rotten tomatoes,cls/sentiment analysis,,,,Pang and Lee 2005
yelp_review_full,,sentiment,cls,no dev set,,TRUE,TRUE,,650000,500000,TRUE,,500000,,,,,,yelp review full,other/regression,,,,Zhang et al. 2015; (link)
lambada,,story_completion,gen,revisit: story or cloze or coref? trivial cloze prompt; training set is just unlabeled corpora; GPT task,GPT,,,TRUE,0,0,,TRUE,0,accuracy;perplexity;median_rank,,https://arxiv.org/pdf/1606.06031.pdf,,,,,,,,
craffel/openai_lambada,,story_completion,gen,revisit: story or cloze or coref? trivial cloze prompt; training set is just unlabeled corpora; GPT task,GPT,,,TRUE,0,0,,TRUE,0,accuracy;perplexity;median_rank,,https://arxiv.org/pdf/1606.06031.pdf,,,,,,,,
story_cloze,2016,story_completion,cls,todo: custom loading; swag like?,GPT,,,TRUE,,0,,TRUE,0,accuracy,,https://arxiv.org/pdf/1604.01696.pdf,,,,,,,,
hellaswag,,story_completion,cls,,GPT,,,TRUE,39905,0,TRUE,,39905,accuracy,,https://arxiv.org/pdf/1905.07830.pdf,,,hellaswag,qa/multiple-choice qa,,,,Zellers et al. 2019
common_gen,,structure_to_text,gen,,,TRUE,TRUE,,67389,67389,TRUE,,67389,,,,,,common gen,other,,,,Lin et al. 2020b
wiki_bio,,structure_to_text,gen,,,TRUE,TRUE,,582659,500000,TRUE,,500000,,,,,,wiki bio,cg/other,,,,Lebret et al. 2016
cnn_dailymail,3.0.0,summarization,gen,,,TRUE,TRUE,,287113,287113,TRUE,,287113,,,,,,,,,,,
gigaword,,summarization,gen,,,TRUE,TRUE,,3803957,500000,TRUE,,500000,,,,,,gigaword,cg/summarization,,,,Napoles et al. 2012
multi_news,,summarization,gen,,CrossFit,,TRUE,,44972,44972,,,,,,,,,multi news,cg/summarization,,,,Fabbri et al. 2019
samsum,,summarization,gen,,CrossFit,,TRUE,,14732,14732,,,,,,,,,samsum,cg/summarization,,,,Gliwa et al. 2019
xsum,,summarization,gen,,,TRUE,TRUE,TRUE,204045,204045,TRUE,TRUE,204045,rouge,,https://arxiv.org/pdf/1808.08745.pdf,,,xsum,cg/summarization,,,,Narayan et al. 2018
ag_news,,topic_classification,cls,,,TRUE,TRUE,,120000,120000,TRUE,,120000,,,http://groups.di.unipi.it/~gulli/AG_corpus_of_news_articles.html,,,ag news,cls/topic,,,,Gulli (link)
dbpedia_14,,topic_classification,cls,,,TRUE,TRUE,,560000,500000,TRUE,,500000,,,https://svn.aksw.org/papers/2013/SWJ_DBpedia/public.pdf,,,dbpedia 14,cls/topic,,,,Lehmann et al. 2015
trec,,topic_classification,cls,,,TRUE,TRUE,,5452,5452,TRUE,,5452,,,https://trec.nist.gov/data/qa.html,,,trec,cls/other,,,,Li and Roth 2002; Hovy et al. 2001
super_glue,wic,word_sense_disambiguation,cls,,,TRUE,,TRUE,5428,0,TRUE,TRUE,5428,accuracy,,https://arxiv.org/pdf/1808.09121.pdf,,,superglue-wic,cls/other,,lexical_knowledge,,Pilehvar and Camacho-Collados 2019
Staging Area,,,,,,,,,,,,,,,,,,,,,,,,
Would Include but not in HF or some other practical limitations,,,,,,,,,,,,,,,,,,,,,,,,
definite_pronoun_resolution,,coreference,,todo: download error,,,,,,,,,,,,,,,deﬁnite pronoun resolution,other,,,,Rahman and Ng 2012
jeopardy,,closed-book qa,gen,sporadic download error,CrossFit,,,,,,,,,,,,,promptsource download error,jeopardy,qa/closed-book qa,,,,(link)
blimp,,,cls,no prompts yet; collapse subsets,,,,,,0,,,0,,,,,,,,,,,
Hendrycks et al. 2021,,,,https://arxiv.org/abs/2009.03300v3,,,,,,,,,,,,,,,,,,,,
Multi-Turn Dialogue Reasoning,,,,https://aclanthology.org/2020.acl-main.130.pdf,Vania,,,,7088,,,,,,,,,,,,,,,
Argument Reasoning Comprehension Task,,,,https://aclanthology.org/N18-1175.pdf,Vania,,,,1211,,,,,,,,,,,,,,,
MCScript,,,,https://aclanthology.org/L18-1564.pdf,Vania,,,,14191,,,,,,,,,,,,,,,
narrativeqa,,,,very long input sequence,,,,,,,,,,,,,,skip for experiment D3: very long input sequence,NarQA,Abstractive QA,,,,
newsqa,,,,download error,TaskEmbed,,,,,,,,,,,,,promptsource download error,NewsQA,Extractive QA,,,,Trischler et al. 2017
eli5,,,,dataset split error,CrossFit,,,,,,,,,,,https://facebookresearch.github.io/ELI5/explore.html,,skip: HF datasets error the split field is used for subsets,eli5-askh,qa/long-form qa,,possibly knowledge-neutral,,Fan et al. 2019
Maybe Reconsider,,,,,,,,,,,,,,,,,,,,,,,,
zest,,,,its original task is quite complex (need to provide a decision function); should be held-out eval only,self,,,,,,,,,,,,,,,,,,,
swag,,story_completion,cls,revisit whether this should be considered as a variant of NLI,,,,,73546,0,TRUE,,73546,,,,,,swag,qa/multiple-choice qa,,,,Zellers et al. 2018
codah,codah,story_completion,cls,a variant of swag revisit whether this should be considered as a variant of NLI,,,,,2776,0,TRUE,,2776,,,,,,codah,qa/multiple-choice qa,,,,Chen et al. 2019
wiki_auto,,,,revisit: lots of duplicate simplified text; novel generative task could be very challenging,CrossFit,,,,,,,,,,,,,no prompt yet,wiki auto,cls/other,,text simplification,,Jiang et al. 2020
proto_qa,,,gen,"generate prototypical concepts, kinda niche format with multiple correct answers",CrossFit,,,,,,,,,,,,,no prompt yet,proto qa,other,,,,Boratko et al. 2020
empathetic_dialogues,,,,generation? classification?,CrossFit,,,,,,,,,,,https://arxiv.org/pdf/1811.00207.pdf,,no prompt yet,empathetic dialogues,cg/dialogue,,,,Rashkin et al. 2019
qed,,,,uses held-out Natural Questions,,,,,,,,,,,,,,,,,,,,
kilt_tasks,aidayago2,,,,,,,,,,,,,,,,,no prompt yet,kilt ay2,other/entity linking,,encyclopedia,,Hoffart et al. 2011
kilt_tasks,wow,,,,,,,,,,,,,,,,,no prompt yet,kilt wow,cg/dialogue,,encyclopedia,,Dinan et al. 2019
lama,conceptnet,,,,,,,,,,,,,,,,,no prompt yet,lama-conceptnet,qa/closed-book qa,,encyclopedia,,Petroni et al. 2019 2020
lama,google_re,,,,,,,,,,,,,,,,,no prompt yet,lama-google re,qa/closed-book qa,,encyclopedia,,Petroni et al. 2019 2020
lama,squad,,,,,,,,,,,,,,,,,no prompt yet,lama-squad,qa/closed-book qa,,encyclopedia,,Petroni et al. 2019 2020
lama,trex,,,,,,,,,,,,,,,,,no prompt yet,lama-trex,qa/closed-book qa,,encyclopedia,,Petroni et al. 2019 2020
limit,,physical cognition,,,,,,,,,,,,,,https://aclanthology.org/2020.findings-emnlp.88.pdf,,label errors in dataset itself? also no validation set otherwise well motivated by semantic theories,limit,other,,physical semantic repr.,,Manotas et al. 2020
kilt_tasks,fever,,,revisit whether this should be considered as a variant of NLI,,,,,,,,,,,,,,temporary skip: prompts available in non-benchmark standalone dataset,kilt fever,cls/fact checking,,encyclopedia,,Thorne et al. 2018
Skipped,,,,,,,,,,,,,,,,,,,,,,,,
fever,v2.0,closed-book qa/fact checking,,also in KILT,,,,,,,,,,,,,,skip: awkward prompts as closed-book qa,FEVER,,,,,
hotpot_qa,distractor,,,also in KILT,,,,,,,,,,,,,,skip for experiment D3: very long input sequence,Hotpot QA,,,,,
hotpot_qa,fullwiki,,,also in KILT,,,,,,,,,,,,,,skip for experiment D3: very long input sequence,Hotpot QA,,,,,
emo,,sentiment,cls,skip: offensive and ungrammatical text,,merged,,,30160,0,TRUE,TRUE,30160,precision;recall;F1,,https://aclanthology.org/S19-2005.pdf,,skip: offensive and ungrammatical text,emo,cls/emotion,,,,Chatterjee et al. 2019
freebase_qa,,QA_closed_book,gen,"need to be held out because web_questions is ""supposed to be answerable by Freebase""",,,,,20358,0,TRUE,,20358,,,,intensive,,freebase qa,qa/closed-book qa,,,,Jiang et al. 2019
aqua_rat,,,,,,,,,,,,,,,,https://arxiv.org/abs/1705.04146,,skip: nontrivial math,aqua rat,qa/multiple-choice qa,,nontrivial math,,Ling et al. 2017
math_qa,,,,,,,,,,,,,,,,,,skip: nontrivial math,math qa,qa/multiple-choice qa,,nontrivial math,,Amini et al. 2019
numer_sense,,,,,,,,,,,,,,,,,,skip: closed-book trivia ,numer sense,qa/closed-book qa,,numerical knowledge,,Lin et al. 2020a
squad_adversarial,,,,,,,,,,,,,,,,,,validation set only,,,,,,
squadshifts,,,,,,,,,,,,,,,,,,test set only,,,,,,
sms_spam,,,,,,,,,,,,,,,,,,skip: unclean corpus and likely harmful content,sms spam,cls/other,,,,Almeida et al. 2011
search_qa,,,,,,,,,,,,,,,,,,skip: seems like a very unclean corpus,search qa,qa/closed-book qa,,,,Dunn et al. 2017
kilt_tasks,trex,,,,,,,,,,,,,,,,,skip: non-natural language,kilt trex,qa/closed-book qa,,encyclopedia,,Elsahar et al. 2018
kilt_tasks,structured_zeroshot,,,,,,,,,,,,,,,,,skip: non-natural language,kilt zsre,qa/closed-book qa,,encyclopedia,,Levy et al. 2017
spider,,,,,,,,,,,,,,,,,,skip: non-natural language,spider,cg/other,,,,Yu et al. 2018
wikisql,,,,,,,,,,,,,,,,,,skip: non-natural language,wikisql,cg/other,,,,Zhong et al. 2017
com_qa,,,,,CrossFit,,,,,,,,,,,https://arxiv.org/pdf/1809.09528.pdf,,skip: non-human language: URL,ComQA (Abujabal et al. 2019),factoid QA w/ paraphrases,,,snippets WikiAnswers,
climate_fever,,,,revisit whether this should be considered as a variant of NLI,,,,,,,,,,,,,,skip: no train set,climate fever,cls/fact checking,,,,Diggelmann et al. 2020
art,,,,,,,,,,,,,,,,https://arxiv.org/pdf/1908.05739.pdf,,skip: NLI reserved for generalization studies (although this one is not a traditionally defined NLI),art (abductive nli),other,,,,Bhagavatula et al. 2020
glue,mnli,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,glue-mnli,cls/nli,,,,Williams et al. 2018
glue,qnli,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,glue-qnli,cls/nli,,,,Rajpurkar et al. 2016
glue,rte,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,glue-rte,cls/nli,,,,Dagan et al. 2005; Bar-Haim et al. 2006 Giampiccolo et al. 2007; Bentivogli et al. 2009
glue,wnli,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,glue-wnli,cls/nli,,,,Levesque et al. 2012
,,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,scitail,cls/nli,,,,Khot et al. 2018
,,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,sick,cls/nli,,,,Marelli et al. 2014
,,classification_NLI,,,,,,,,,,,,,,,,skip: NLI reserved for generalization studies,SNLI (Bowman et al. 2015),NLI,,,misc.,
aeslc,,,,summarization by email subject line,,,,,,,,,,,,https://arxiv.org/abs/1906.03497,,skip: niche task,aeslc,cg/summarization,,generation,,Zhang and Tetreault 2019
onestop_english,,,,,,,,,,,,,,,,https://aclanthology.org/W18-0535.pdf,,skip: niche task: classify curriculum diffculty,onestop english,cls/other,,,,Vajjala and Luˇci´c 2018
mocha,,,,,,,,,,,,,,,,,,skip: model generated text,mocha,other/regression,,,,Chen et al. 2020a
commonsense_qa,,,,duplicate with cos_e,Vania,,,,9741,,,,,,,https://arxiv.org/pdf/1811.00937.pdf,,,Commonsense QA,qa/multiple-choice qa,,,,Talmor et al. 2019
,,,,,,,,,,,,,,,,,,skip: maybe harmful content from Twitter,emotion,cls/emotion,,,,Saravia et al. 2018
,,,,the authors themselves seem to have renounced their own work,,,,,,,,,,,,https://github.com/nyu-mll/crows-pairs,,skip: harmful content,crows pairs,other,,,,Nangia et al. 2020
,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-directed vs generalized,cls/hate speech detection,,,,Mollas et al. 2020
,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-disability,cls/hate speech detection,,,,Mollas et al. 2020
,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-gender,cls/hate speech detection,,,,Mollas et al. 2020
,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-national origin,cls/hate speech detection,,,,Mollas et al. 2020
,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-race,cls/hate speech detection,,,,Mollas et al. 2020
,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-religion,cls/hate speech detection,,,,Mollas et al. 2020
,,,,,,,,,,,,,,,,,,skip: harmful content,ethos-sexual orientation,cls/hate speech detection,,,,Mollas et al. 2020
,,,,,,,,,,,,,,,,,,skip: harmful content,hate speech offensive,cls/hate speech detection,,,,Davidson et al. 2017
,,,,,,,,,,,,,,,,,,skip: harmful content,hate speech18,cls/hate speech detection,,,,de Gibert et al. 2018
,,,,,,,,,,,,,,,,,,skip: harmful content,hatexplain,cls/hate speech detection,,,,Mathew et al. 2020
,,,,,,,,,,,,,,,,,,skip: harmful content,reddit tifu-title,cg/summarization,,,,Kim et al. 2019
,,,,,,,,,,,,,,,,,,skip: harmful content,reddit tifu-tldr,cg/summarization,,,,Kim et al. 2019
,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-emoji,cls/emotion,,,,Barbieri et al. 2020
,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-emotion,cls/emotion,,,,Barbieri et al. 2020
,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-hate,cls/emotion,,,,Barbieri et al. 2020
,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-irony,cls/emotion,,,,Barbieri et al. 2020
,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-offensive,cls/emotion,,,,Barbieri et al. 2020
,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-sentiment,cls/emotion,,,,Barbieri et al. 2020
,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-stance abortion,cls/emotion,,,,Barbieri et al. 2020
,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-stance atheism,cls/emotion,,,,Barbieri et al. 2020
,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-stance climate,cls/emotion,,,,Barbieri et al. 2020
,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-stance feminist,cls/emotion,,,,Barbieri et al. 2020
,,,,,,,,,,,,,,,,,,skip: harmful content,tweet eval-stance hillary,cls/emotion,,,,Barbieri et al. 2020
,,,,,,,,,,,,,,,,,,skip: harmful content,tweet qa,qa/machine reading comprehension,,,,Xiong et al. 2019
yelp_polarity,,,,,,,,,,,,,,,,,,skip: duplicate with yelp_review_full,yelp polarity,cls/sentiment analysis,,,,Zhang et al. 2015; (link)
quora,,,,,,,,,,,,,,,,https://quoradata.quora.com/First-Quora-Dataset-Release-Question-Pairs,,skip: duplicate under GLUE,QQP,paraphrase identiﬁcation,,,social QA,Iyer et al. 2017
squad,,,,,,,,,,,,,,,,,,skip: duplicate under Squad 2.0,SQuAD 1.1,Extractive QA,,,,
yahoo_answers_topics,,,,,,,,,,,,,,,,,,skip for early experiments: unclean corpus,yahoo answers topics,cls/topic,,,,(link)
tab_fact,,,,,,,,,,,,,,,,,,skip for early experiments: tabular data,tab fact,cls/fact checking,,,,Chen et al. 2020b
,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-anaphor gender agreement,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020
,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-anaphor number agreement,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020
,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-determiner noun agreement with adj irregular 1,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020
,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-ellipsis n bar 1,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020
,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-ellipsis n bar 2,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020
,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-existential there quantiﬁers 1,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020
,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-irregular past participle adjectives,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020
,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-sentential negation npi licensor present,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020
,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-sentential negation npi scope,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020
,,,,,,,,,,,,,,,,,,skip for early experiments: revisit if we want to include a large number of ungrammatical sentences in our training data,blimp-wh questions object gap,other/linguistic phenomenon,,syntax,,Warstadt et al. 2020
poem_sentiment,,,,,,,,,,,,,,,,,,skip for early experiments: poetry domain,poem sentiment,cls/sentiment analysis,,creativity,,Sheng and Uthus 2020
acronym_identification,,,,,,,,,,,,,,,,https://arxiv.org/pdf/2010.14678.pdf,,skip for early experiments: niche/hard task,acronym identiﬁcation,other,,,,Pouran Ben Veyseh et al. 2020
google_wellformed_query,,,,revisit whether to exclude fine-grain regression tasks,,,,,,,,,,,,,,skip for early experiments: niche/hard task,google wellformed query,cls/other,,,,Faruqui and Das 2018
liar,,,,revisit whether to exclude fine-grain regression tasks,,,,,,,,,,,,,,skip for early experiments: niche/hard task,liar,cls/fact checking,,,,Wang 2017
,,,,,,,,,,,,,,,,,,skip for early experiments: niche/hard task,break-QDMR-high-level,other,,semantic representation,,Wolfson et al. 2020
,,,,,,,,,,,,,,,,,,skip for early experiments: niche/hard task,crawl domain,other,,,,Zhang et al. 2020
discovery,discovery,,,,,,,,,,,,,,,,,skip for early experiments: niche task no cannonical answer,discovery,cls/other,,generative-ish,,Sileo et al. 2019
wiki_split,,,,,,,,,,,,,,,,,,skip for early experiments: niche task,wiki split,cg/other,,,,Botha et al. 2018
,,,,,,,,,,,,,,,,,,skip for early experiments: multilingual,aslg pc12,other,,,,Othman and Jemni 2012
,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,CCG (Hockenmaier and Steedman 2007),CCG supertagging,,syntax,Penn Treebank,
,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,Chunk (Tjong Kim Sang and Buchholz 2000),syntactic chunking,,syntax,Penn Treebank,
,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,Conj (Ficler and Goldberg 2016),conjunct identiﬁcation,,syntax,Penn Treebank,
,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,GED (Yannakoudakis et al. 2011),grammatical error detection,,syntax,misc.,
,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,GGParent (Liu et al. 2019a),syntactic tagging,,syntax,Penn Treebank,
,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,GParent (Liu et al. 2019a),syntactic tagging,,syntax,Penn Treebank,
,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,NER (Tjong Kim Sang and De Meulder 2003),named entity recognition,,,news,
,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,Parent (Liu et al. 2019a),syntactic tagging,,syntax; constituency,Penn Treebank,
,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,POS-EWT (Silveira et al. 2014),part-of-speech tagging,,syntax,Web Treebank,
,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,POS-PTB (Marcus et al. 1993),part-of-speech tagging,,syntax,Penn Treebank,
,,,,,,,,,,,,,,,,,,skip for early experiments: input token/span classification less straightforward for a generative LM,ST (Bjerva et al. 2016),semantic tagging,,,Groningen Meaning Bank,
financial_phrasebank,,,,,,,,,,,,,,,,,,skip for early experiments: financial domain,ﬁnancial phrasebank,cls/sentiment analysis,,,,Malo et al. 2014
health_fact,,,,,,,,,,,,,,,,,,skip for early experiments: biomedical domain,health fact,cls/fact checking,,,,Kotonya and Toni 2020
,,,,,,,,,,,,,,,,http://www.sciencedirect.com/science/article/pii/S1532046412000615,,skip for early experiments: biomedical domain,ade corpus v2-classiﬁcation,cls/other,,,,Gurulingappa et al. 2012
,,,,,,,,,,,,,,,,,,skip for early experiments: biomedical domain,ade corpus v2-dosage,other/slot ﬁlling,,,,Gurulingappa et al. 2012
,,,,,,,,,,,,,,,,,,skip for early experiments: biomedical domain,ade corpus v2-effect,other/slot ﬁlling,,,,Gurulingappa et al. 2012
,,,,,,,,,,,,,,,,,,skip for early experiments: biomedical domain,biomrc,qa/machine reading comprehension,,,,Pappas et al. 2020
,,,,,,,,,,,,,,,,,,skip for early experiments: biomedical domain,medical questions pairs,cls/paraphrase,,,,McCreery et al. 2020
scicite,,,,,,,,,,,,,,,,,,skip for early experiments: academic domain + niche/hard task,scicite,cls/other,,,,Cohan et al. 2019
,,,,,,,,,,,,,,,,,,skip for early experiments: abstract semantic representations,break-QDMR,other,,logical form,,Wolfson et al. 2020
,,,,,,,,,,,,,,,,,,skip for early experiments: abstract semantic representations,e2e nlg cleaned,other,,,,Duˇsek et al. 2020 2019
glue,sst2,,,,,,,,,,,,,,,,,revisit: very short and often ill-formed movie reviews,glue-sst2,cls/sentiment analysis,,,,Socher et al. 2013
glue,stsb,fine-grain regression,,,,,,,,,,,,,,,,revisit whether to exclude fine-grain regression tasks,glue-stsb,semantic similarity,,,misc.,
,,,,,,,,,,,,,,,,,,double check: subset missing from HF datasets,squad-no context,qa/closed-book qa,,,,Rajpurkar et al. 2016
,,,,,,,,,,,,,,,,,,double check: subset missing from HF datasets,squad-with context,qa/machine reading comprehension,,,,Rajpurkar et al. 2016
,,,,contrast sets,,,,,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,double check: missing from HF datasets,BoolQ-CS,Binary yes/no,,,,
,,,,,,,,,,,,,,,,https://aclanthology.org/C16-1236.pdf,,double check: missing from HF datasets,CQ (Bao et al. 2016),knowledge-based QA,,,snippets web queries/KB,
,,,,contrast sets,,,,,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,double check: missing from HF datasets,DROP-CS,Abstractive QA,,,,
,,,,,,,,,,,,,,,,https://aclanthology.org/D13-1020.pdf,,double check: missing from HF datasets,MCTest,Multiple choice,,,,
,,,,,,,,,,,,,,,,,,double check: missing from HF datasets,MRPC (Dolan and Brockett 2005),paraphrase identiﬁcation,,,news,
,,,,"""naturally perturbed"" version of BoolQ",,,,,,,,,,,,https://arxiv.org/pdf/2004.04849.pdf,,double check: missing from HF datasets,NP-BoolQ,Binary yes/no,,,,
,,,,,,,,,,,,,,,,https://aclanthology.org/D19-1608.pdf,,double check: missing from HF datasets,quartz-no knowledge,qa/multiple-choice qa,,,,Tafjord et al. 2019b
,,,,contrast sets,,,,,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,double check: missing from HF datasets,Quoref-CS,Extractive QA,,,,
,,,,contrast sets,,,,,,,,,,,,https://arxiv.org/pdf/2004.02709.pdf,,double check: missing from HF datasets,ROPES-CS,Extractive QA,,,,