Tensor mask shape may differ from tensor matmul_qk shape
ethanchiu7 opened this issue · 0 comments
I'm reading this code and wondering about the following.
The mask tensor shape may be [batch, 1, seq_len, seq_len], based on this code:
```python
def get_padding_mask(seq):
    with tf.name_scope("Padding_Mask"):
        seq = tf.cast(tf.math.equal(seq, 0), tf.float32)
        # add extra dimensions to add the padding
        # to the attention logits.
        # TODO by Ethan 2020-09-11, Friday, 23:01: should this be `return seq[:, tf.newaxis, :]` instead?
        return seq[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)


def attention_mask(size):
    """
    If size is 4, it returns the matrix below:
    [[0., 1., 1., 1.],
     [0., 0., 1., 1.],
     [0., 0., 0., 1.],
     [0., 0., 0., 0.]]
    """
    with tf.name_scope("attention_mask"):
        mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
        return mask  # (seq_len, seq_len)


def create_masks(inp):
    with tf.name_scope("att_masking"):
        att_mask = attention_mask(tf.shape(inp)[1])
        padding_mask = get_padding_mask(inp)
        # TODO by Ethan 2020-09-11, Friday, 23:13: this shape will be [batch, 1, seq_len, seq_len]
        mask = tf.maximum(padding_mask, att_mask)
        return mask
```
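For reference, here is a minimal sketch I used to check the mask shape. The example input is my own assumption for shape-checking only: a plain [batch, seq_len] tensor of token ids with 0 as the padding id.

```python
import tensorflow as tf

# Hypothetical input: batch of 2 sequences, length 5, with 0 used as the padding id.
inp = tf.constant([[7, 3, 9, 0, 0],
                   [4, 8, 2, 6, 1]])

mask = create_masks(inp)  # uses the functions quoted above
print(mask.shape)         # (2, 1, 5, 5) -> [batch, 1, seq_len, seq_len]
```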
However, the matmul_qk tensor shape may be [batch, seq_len_q, seq_len_k], based on this code:
```python
def multihead_attention(self, q, k, v, training, mask=None):
    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)
    if self.scale:
        dk = tf.cast(tf.shape(k)[-1], tf.float32)
        matmul_qk = matmul_qk / tf.math.sqrt(dk)
    if mask is not None:
        matmul_qk += (mask * -1e9)
```
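And here is a minimal sketch I used to see how the 4-D mask broadcasts against matmul_qk. The two shapes below are my assumptions: a 3-D matmul_qk of [batch, seq_len_q, seq_len_k] as I read it above, versus a 4-D one of [batch, num_heads, seq_len_q, seq_len_k] in case q and k already carry a per-head dimension inside multihead_attention.

```python
import tensorflow as tf

batch, num_heads, seq_len = 2, 4, 5
mask = tf.zeros((batch, 1, seq_len, seq_len))      # [batch, 1, seq_len, seq_len]

# Case 1 (my reading above): matmul_qk is 3-D, [batch, seq_len_q, seq_len_k]
matmul_qk_3d = tf.zeros((batch, seq_len, seq_len))
print((matmul_qk_3d + mask * -1e9).shape)          # (2, 2, 5, 5) -- broadcasts to an unexpected shape

# Case 2 (assumption): q/k are split per head, so matmul_qk is 4-D,
# [batch, num_heads, seq_len_q, seq_len_k]
matmul_qk_4d = tf.zeros((batch, num_heads, seq_len, seq_len))
print((matmul_qk_4d + mask * -1e9).shape)          # (2, 4, 5, 5) -- broadcasts over the heads axis
```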
I'm not sure. Could somebody help me?