synthetic_attention

PyTorch implementation of a synthetic attention layer, following Synthesizer: Rethinking Self-Attention in Transformer Models (Tay et al.).

Usage

import torch

from synthetic_attention import SyntheticAttention  # import path assumed from the repository name

batch_size, max_length, d_model = 10, 100, 128

# Build the layer with the model dimension and the maximum sequence length it supports.
layer = SyntheticAttention(d_model, max_length)

# Input sequences may be shorter than max_length.
x = torch.randn(batch_size, max_length - 10, d_model)

output = layer(x)  # shape: (batch_size, max_length - 10, d_model)
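
For context, here is a minimal sketch of the dense Synthesizer variant described in the referenced paper: attention weights are synthesized from each token by a small feed-forward network instead of being computed from query-key dot products. The class name, constructor signature, and layer sizes below are illustrative assumptions and may differ from this repository's SyntheticAttention implementation.

import torch
import torch.nn as nn
import torch.nn.functional as F

class DenseSynthesizerAttention(nn.Module):
    """Sketch of dense Synthesizer attention (illustrative, not this repo's exact class).

    Each token's representation is mapped to a row of attention logits over
    all positions, so no query-key interaction is needed."""

    def __init__(self, d_model, max_length):
        super().__init__()
        # Two-layer MLP that maps a token vector to logits over max_length positions.
        self.synthesize = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(),
            nn.Linear(d_model, max_length),
        )
        self.value = nn.Linear(d_model, d_model)
        self.out = nn.Linear(d_model, d_model)

    def forward(self, x):
        # x: (batch, seq_len, d_model) with seq_len <= max_length
        seq_len = x.size(1)
        # Synthesized logits: (batch, seq_len, max_length), truncated to the actual length.
        logits = self.synthesize(x)[:, :, :seq_len]
        weights = F.softmax(logits, dim=-1)
        # Weighted sum over value vectors, followed by an output projection.
        return self.out(torch.matmul(weights, self.value(x)))

With d_model = 128 and max_length = 100, this sketch produces an output of the same shape as the usage example above.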

Reference

@misc{tay2021synthesizer,
      title={Synthesizer: Rethinking Self-Attention in Transformer Models},
      author={Yi Tay and Dara Bahri and Donald Metzler and Da-Cheng Juan and Zhe Zhao and Che Zheng},
      year={2021},
      eprint={2005.00743},
      archivePrefix={arXiv},
      primaryClass={cs.CL}
}