Typo in chapter 15 multi_head_attention function
Opened this issue · 0 comments
tangbiao commented
Shouldn't the calculation use the projected vectors, and weight the values (not the keys) by the attention scores? For example:
def multi_head_attention(query, key, value):
    head_outputs = []
    for i in range(num_heads):
        # Project the inputs under new names so that later heads
        # still see the original query/key/value, not tensors that
        # were already projected by an earlier head.
        proj_query = query_dense[i](query)
        proj_key = key_dense[i](key)
        proj_value = value_dense[i](value)
        scores = np.einsum("btd,bsd->bts", proj_query, proj_key)
        scores = softmax(scores / math.sqrt(head_dim), axis=-1)
        # The scores weight the projected values, not the keys.
        head_output = np.einsum("bts,bsd->btd", scores, proj_value)
        head_outputs.append(head_output)
    outputs = ops.concatenate(head_outputs, axis=-1)
    return output_dense(outputs)
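For completeness, here is a minimal self-contained sketch of the corrected function that runs on plain NumPy. The dimensions, the make_dense stand-in for the chapter's Dense layers, the softmax helper, and the use of np.concatenate in place of ops.concatenate are illustrative assumptions on my part, not code from the book:

import math
import numpy as np

num_heads = 2
embed_dim = 8
head_dim = embed_dim // num_heads  # so num_heads * head_dim == embed_dim

rng = np.random.default_rng(0)

def make_dense(in_dim, out_dim):
    # Hypothetical stand-in for a bias-free Dense layer:
    # a fixed random projection matrix.
    w = rng.normal(size=(in_dim, out_dim))
    return lambda x: x @ w

query_dense = [make_dense(embed_dim, head_dim) for _ in range(num_heads)]
key_dense = [make_dense(embed_dim, head_dim) for _ in range(num_heads)]
value_dense = [make_dense(embed_dim, head_dim) for _ in range(num_heads)]
output_dense = make_dense(num_heads * head_dim, embed_dim)

def softmax(x, axis=-1):
    # Numerically stable softmax along the given axis.
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

def multi_head_attention(query, key, value):
    head_outputs = []
    for i in range(num_heads):
        # Each head projects the same original inputs.
        proj_query = query_dense[i](query)
        proj_key = key_dense[i](key)
        proj_value = value_dense[i](value)
        scores = np.einsum("btd,bsd->bts", proj_query, proj_key)
        scores = softmax(scores / math.sqrt(head_dim), axis=-1)
        # Apply the attention scores to the projected values.
        head_output = np.einsum("bts,bsd->btd", scores, proj_value)
        head_outputs.append(head_output)
    outputs = np.concatenate(head_outputs, axis=-1)
    return output_dense(outputs)

x = rng.normal(size=(1, 4, embed_dim))  # (batch, time, embed_dim)
print(multi_head_attention(x, x, x).shape)  # self-attention -> (1, 4, 8)

With the projections kept under separate names, every head projects the same inputs, and the scores weight the values, matching standard scaled dot-product attention.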