IFT6010 project

project proposal

file:./tex/projectproposal.tex
file:./tex/projectbibliography.bib

explain transformer

explain BERT

BERT Neural Network - EXPLAINED! - YouTube

explain PLMs

explain KD

explain tinybert

group KD, pruning and quantization under methods

remove transformer section and pretrained language models as comments

explain metrics

transformers

Neural machine translation by jointly learning to align and translate

ref

arxiv Neural Machine Translation by Jointly Learning to Align and Translate
file:./papers/jointlyLearningToAlignAndTranslate.pdf

% arXiv preprint 1409.0473 (primary class cs.CL).
@misc{bahdanau2016neural,
  author        = {Dzmitry Bahdanau and Kyunghyun Cho and Yoshua Bengio},
  title         = {Neural Machine Translation by Jointly Learning to Align and Translate},
  year          = {2016},
  eprint        = {1409.0473},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL}
}

The illustrated transformer

ref

The Illustrated Transformer – Jay Alammar – Visualizing machine learning one concept at a time.

Attention is all you need

ref

arxiv attention is all you need
file:./papers/attentionIsAllYouNeed.pdf

% dblp CoRR record for arXiv preprint 1706.03762.
@article{allyouneed,
  author        = {Ashish Vaswani and Noam Shazeer and Niki Parmar and Jakob Uszkoreit and
                   Llion Jones and Aidan N. Gomez and Lukasz Kaiser and Illia Polosukhin},
  title         = {Attention Is All You Need},
  year          = {2017},
  journal       = {CoRR},
  volume        = {abs/1706.03762},
  eprint        = {1706.03762},
  archivePrefix = {arXiv},
  url           = {http://arxiv.org/abs/1706.03762},
  timestamp     = {Sat, 23 Jan 2021 01:20:40 +0100},
  biburl        = {https://dblp.org/rec/journals/corr/VaswaniSPUJGKP17.bib},
  bibsource     = {dblp computer science bibliography, https://dblp.org}
}

BERT

ref

{1810.04805} BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding
file:./papers/1810.04805.pdf

% dblp CoRR record for arXiv preprint 1810.04805.
@article{bert,
  author        = {Jacob Devlin and Ming{-}Wei Chang and Kenton Lee and Kristina Toutanova},
  title         = {{BERT:} Pre-training of Deep Bidirectional Transformers for Language Understanding},
  year          = {2018},
  journal       = {CoRR},
  volume        = {abs/1810.04805},
  eprint        = {1810.04805},
  archivePrefix = {arXiv},
  url           = {http://arxiv.org/abs/1810.04805},
  timestamp     = {Tue, 30 Oct 2018 20:39:56 +0100},
  biburl        = {https://dblp.org/rec/journals/corr/abs-1810-04805.bib},
  bibsource     = {dblp computer science bibliography, https://dblp.org}
}

PTMs a survey

ref

{2003.08271} Pre-trained Models for Natural Language Processing: A Survey file:./papers/2003.08271.pdf

% arXiv preprint 2003.08271 (primary class cs.CL).
@misc{qiu2020pretrained,
  author        = {Xipeng Qiu and Tianxiang Sun and Yige Xu and
                   Yunfan Shao and Ning Dai and Xuanjing Huang},
  title         = {Pre-trained Models for Natural Language Processing: A Survey},
  year          = {2020},
  eprint        = {2003.08271},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL}
}

model compression

Knowledge Distillation

ref

file:./papers/1503.02531.pdf {1503.02531} Distilling the Knowledge in a Neural Network

% arXiv preprint 1503.02531 (primary class stat.ML).
@misc{hinton2015distilling,
  author        = {Geoffrey Hinton and Oriol Vinyals and Jeff Dean},
  title         = {Distilling the Knowledge in a Neural Network},
  year          = {2015},
  eprint        = {1503.02531},
  archivePrefix = {arXiv},
  primaryClass  = {stat.ML}
}

TinyBert

ref

{1909.10351} TinyBERT: Distilling BERT for Natural Language Understanding file:./papers/1909.10351.pdf

% dblp CoRR record for arXiv preprint 1909.10351.
% Both model names in the title are brace-protected so that styles which
% sentence-case titles do not render them as "Tinybert" / "bert".
@article{tinybert,
  author    = {Xiaoqi Jiao and
               Yichun Yin and
               Lifeng Shang and
               Xin Jiang and
               Xiao Chen and
               Linlin Li and
               Fang Wang and
               Qun Liu},
  title     = {{TinyBERT}: Distilling {BERT} for Natural Language Understanding},
  journal   = {CoRR},
  volume    = {abs/1909.10351},
  year      = {2019},
  url       = {http://arxiv.org/abs/1909.10351},
  archivePrefix = {arXiv},
  eprint    = {1909.10351},
  timestamp = {Fri, 27 Sep 2019 13:04:21 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/abs-1909-10351.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

Compression of Deep Learning Models For Text: A Survey

$./tex/illustrations/modelcompressiontaxonomy.png$

ref

file:./papers/2008.05221.pdf {2008.05221} Compression of Deep Learning Models for Text: A Survey

% arXiv preprint 2008.05221 (primary class cs.CL).
@misc{gupta2020compression,
  author        = {Manish Gupta and Puneet Agrawal},
  title         = {Compression of Deep Learning Models for Text: A Survey},
  year          = {2020},
  eprint        = {2008.05221},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL}
}

Block-Sparse Recurrent Neural Networks

{1711.02782} Block-Sparse Recurrent Neural Networks

% dblp CoRR record for arXiv preprint 1711.02782.
% The citation key must be followed by a comma; without it the entry does not parse.
@article{blocksparse,
  author    = {Sharan Narang and
               Eric Undersander and
               Gregory F. Diamos},
  title     = {Block-Sparse Recurrent Neural Networks},
  journal   = {CoRR},
  volume    = {abs/1711.02782},
  year      = {2017},
  url       = {http://arxiv.org/abs/1711.02782},
  archivePrefix = {arXiv},
  eprint    = {1711.02782},
  timestamp = {Mon, 13 Aug 2018 16:48:36 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/abs-1711-02782.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

what does bert look at

ref

{1906.04341} What Does BERT Look At? An Analysis of BERT’s Attention

% dblp CoRR record for arXiv preprint 1906.04341.
% Every occurrence of the acronym is brace-protected, as is the capital that
% opens the subtitle after the question mark, so sentence-casing styles keep them.
@article{whatdoesbertlookat,
  author    = {Kevin Clark and
               Urvashi Khandelwal and
               Omer Levy and
               Christopher D. Manning},
  title     = {What Does {BERT} Look At? {An} Analysis of {BERT}'s Attention},
  journal   = {CoRR},
  volume    = {abs/1906.04341},
  year      = {2019},
  url       = {http://arxiv.org/abs/1906.04341},
  archivePrefix = {arXiv},
  eprint    = {1906.04341},
  timestamp = {Fri, 14 Jun 2019 09:38:24 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/abs-1906-04341.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

tasks for benchmarking

GLUE:

GLUE Benchmark file:./papers/glue.pdf

% GLUE benchmark paper, arXiv preprint 1804.07461.
% This key must describe GLUE itself; the SWAG dataset paper has its own
% record under the key "swag" and must not be duplicated here.
% NOTE(review): metadata entered by hand, not exported from dblp, so no
% timestamp/biburl/bibsource fields are given - verify against the arXiv page.
@article{glue,
  author        = {Alex Wang and
                   Amanpreet Singh and
                   Julian Michael and
                   Felix Hill and
                   Omer Levy and
                   Samuel R. Bowman},
  title         = {{GLUE:} {A} Multi-Task Benchmark and Analysis Platform for Natural
                   Language Understanding},
  journal       = {CoRR},
  volume        = {abs/1804.07461},
  year          = {2018},
  url           = {http://arxiv.org/abs/1804.07461},
  archivePrefix = {arXiv},
  eprint        = {1804.07461}
}

Squad

The Stanford Question Answering Dataset {1806.03822} Know What You Don’t Know: Unanswerable Questions for SQuAD file:./papers/1806.03822.pdf

% dblp CoRR record for arXiv preprint 1806.03822.
% The dataset name is brace-protected so sentence-casing styles do not
% render it as "squad".
@article{squad,
  author    = {Pranav Rajpurkar and
               Robin Jia and
               Percy Liang},
  title     = {Know What You Don't Know: Unanswerable Questions for {SQuAD}},
  journal   = {CoRR},
  volume    = {abs/1806.03822},
  year      = {2018},
  url       = {http://arxiv.org/abs/1806.03822},
  archivePrefix = {arXiv},
  eprint    = {1806.03822},
  timestamp = {Mon, 13 Aug 2018 16:48:21 +0200},
  biburl    = {https://dblp.org/rec/journals/corr/abs-1806-03822.bib},
  bibsource = {dblp computer science bibliography, https://dblp.org}
}

SWAG

{1808.05326} SWAG: A Large-Scale Adversarial Dataset for Grounded Commonsense Inference file:./papers/1808.05326.pdf

% dblp CoRR record for arXiv preprint 1808.05326.
@article{swag,
  author        = {Rowan Zellers and Yonatan Bisk and Roy Schwartz and Yejin Choi},
  title         = {{SWAG:} {A} Large-Scale Adversarial Dataset for Grounded Commonsense Inference},
  year          = {2018},
  journal       = {CoRR},
  volume        = {abs/1808.05326},
  eprint        = {1808.05326},
  archivePrefix = {arXiv},
  url           = {http://arxiv.org/abs/1808.05326},
  timestamp     = {Wed, 23 Dec 2020 10:37:10 +0100},
  biburl        = {https://dblp.org/rec/journals/corr/abs-1808-05326.bib},
  bibsource     = {dblp computer science bibliography, https://dblp.org}
}

multi hop reasoning

{1905.05460} Cognitive Graph for Multi-Hop Reading Comprehension at Scale file:./papers/1905.05460.pdf

% dblp CoRR record for arXiv preprint 1905.05460.
@article{multihop,
  author        = {Ming Ding and Chang Zhou and Qibin Chen and Hongxia Yang and Jie Tang},
  title         = {Cognitive Graph for Multi-Hop Reading Comprehension at Scale},
  year          = {2019},
  journal       = {CoRR},
  volume        = {abs/1905.05460},
  eprint        = {1905.05460},
  archivePrefix = {arXiv},
  url           = {http://arxiv.org/abs/1905.05460},
  timestamp     = {Tue, 28 May 2019 12:48:08 +0200},
  biburl        = {https://dblp.org/rec/journals/corr/abs-1905-05460.bib},
  bibsource     = {dblp computer science bibliography, https://dblp.org}
}

millenialSpirou/ift6010

IFT6010 project

project proposal

explain transformer

explain BERT

explain PLMs

explain KD

explain tinybert

group KD, pruning and quantization under methods

remove transformer section and pretrained language models as comments

explain metrics

transformers

Neural machine translation by jointly learning to align and translate

ref

The illustrated transformer

ref

Attention is all you need

ref

BERT

ref

PTMs a survey

ref

model compression

Knowledge Distillation

ref

TinyBert

ref

Compression of Deep Learning Models For Text: A Survey

ref

Block-Sparse Recurrent Neural Networks

what does bert look at

ref

tasks for benchmarking

GLUE:

Squad

SWAG

multi hop reasoning