We propose a generalization of leaderboards, bidimensional leaderboards (Billboards), that simultaneously drives progress in language generation tasks and their evaluation. We accept two types of submissions:
- Generator developers submit output text. A Billboard computes all metric scores.
- Metric developers submit an executable program. A Billboard computes its correlations with the human judgments, updates the ensemble metric, and measures how much the submitted metric overrates machine generations relative to human ones.
Anonymous submissions are allowed.
Submission guides and examples are available here.
Scoring results for all past public submissions are available here.
We provide one generator-name||metric-name.csv file for every pair in the Cartesian product of generators and metrics; each file contains instance-level scores.
% Bidimensional leaderboards (Billboards) paper.
% Names are written in "Last, First" form so that the compound surname
% "Le Bras" is kept intact; in "First Last" form BibTeX would parse it as
% first = "Ronan Le", last = "Bras".
@inproceedings{kasai2022billboard,
  author    = {Kasai, Jungo and
               Sakaguchi, Keisuke and
               Le Bras, Ronan and
               Dunagan, Lavinia and
               Morrison, Jacob and
               Fabbri, Alexander R. and
               Choi, Yejin and
               Smith, Noah A.},
  title     = {Bidimensional Leaderboards: Generate and Evaluate Language Hand in Hand},
  booktitle = {Proc.\ of {NAACL}},
  year      = {2022},
  url       = {https://arxiv.org/abs/2112.04139},
}
% Human-evaluation paper for image captioning.
% Names are written in "Last, First" form so that the compound surname
% "Le Bras" is kept intact; in "First Last" form BibTeX would parse it as
% first = "Ronan Le", last = "Bras".
% The citation key is kept as-is (it says 2021 while the year field is 2022)
% so that existing \cite commands keep working.
@inproceedings{kasai2021thumb,
  author    = {Kasai, Jungo and
               Sakaguchi, Keisuke and
               Dunagan, Lavinia and
               Morrison, Jacob and
               Le Bras, Ronan and
               Choi, Yejin and
               Smith, Noah A.},
  title     = {Transparent Human Evaluation for Image Captioning},
  booktitle = {Proc.\ of {NAACL}},
  year      = {2022},
  url       = {https://arxiv.org/abs/2111.08940},
}
% SummEval: re-evaluation of summarization evaluation.
% The first author's middle initial carries its period ("Alexander R.") so it
% renders correctly and matches the spelling used for the same author in the
% kasai2022billboard entry.
% NOTE(review): volume and pages are not given here -- TODO add them from the
% published journal version.
@article{fabbri2021summeval,
  author  = {Fabbri, Alexander R. and
             Kry{\'s}ci{\'n}ski, Wojciech and
             McCann, Bryan and
             Xiong, Caiming and
             Socher, Richard and
             Radev, Dragomir},
  title   = {{SummEval}: Re-evaluating Summarization Evaluation},
  journal = {{TACL}},
  year    = {2021},
  url     = {https://arxiv.org/abs/2007.12626},
}
% Large-scale study of human evaluation for machine translation,
% referenced through its arXiv URL.
@misc{freitag2021experts,
  author = {Freitag, Markus and
            Foster, George and
            Grangier, David and
            Ratnakar, Viresh and
            Tan, Qijun and
            Macherey, Wolfgang},
  title  = {Experts, Errors, and Context: A Large-Scale Study of Human Evaluation for Machine Translation},
  year   = {2021},
  url    = {https://arxiv.org/abs/2104.14478},
}