attentive attention of hierarchical attention network
charlesfufu opened this issue · 6 comments
It seems that the way you implement the attention mechanism is different from the original paper. Can you give more details?
Sorry, after reading your HAN_model.py code it feels incomplete: textRNN.accuracy, textRNN.predictions, and textRNN.W_projection are missing, and textRNN.input_y is never defined. Also, the way the attention weights are computed seems different from the original paper, which appears to apply a softmax before multiplying with the hidden states and summing.
Could you briefly explain the idea behind your write-up? I am a bit lost. At the word level, why is the input arranged as the first sentence of every document, then the second sentence of every document, looping like that? And what does the final loss mean?
Hi. han_model is a replica of the AI_LAW project, which I used to predict crimes, relevant laws, and the term of imprisonment (how long the defendant will stay in prison) from the facts of a case.
Suppose you have a document that contains multiple sentences (e.g. 10 sentences). For each sentence, we get a representation (a vector) using a bi-LSTM. (low level)
After that, we have a new sequence: the ten sentence representations.
For this new sequence of length 10, we use another bi-LSTM to encode it. (high level)
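For intuition, here is a minimal sketch of that two-level encoding, not the exact code in HAN_model.py: the shapes are made up, and mean pooling stands in for the attention pooling that the real model uses at both levels.

# Minimal sketch of the hierarchical encoder (hypothetical shapes; mean pooling
# stands in for the attention pooling used in the real model).
import tensorflow as tf

batch_size, num_sentences, sentence_length, embed_size, hidden_size = 8, 10, 30, 100, 100
word_embedded = tf.placeholder(tf.float32, [batch_size, num_sentences, sentence_length, embed_size])

def bi_lstm(inputs, hidden_size, scope):
    with tf.variable_scope(scope):
        cell_fw = tf.nn.rnn_cell.LSTMCell(hidden_size)
        cell_bw = tf.nn.rnn_cell.LSTMCell(hidden_size)
        outputs, _ = tf.nn.bidirectional_dynamic_rnn(cell_fw, cell_bw, inputs, dtype=tf.float32)
    return tf.concat(outputs, axis=2)  # [batch, time, hidden_size*2]

# low level: encode each sentence word by word (sentences are folded into the batch dimension)
words = tf.reshape(word_embedded, [batch_size * num_sentences, sentence_length, embed_size])
word_encoded = bi_lstm(words, hidden_size, "word_encoder")               # [batch*10, sentence_length, hidden*2]
sentence_vectors = tf.reduce_mean(word_encoded, axis=1)                  # one vector per sentence
sentence_vectors = tf.reshape(sentence_vectors, [batch_size, num_sentences, hidden_size * 2])

# high level: encode the sequence of 10 sentence vectors with another bi-lstm
doc_encoded = bi_lstm(sentence_vectors, hidden_size, "sentence_encoder")  # [batch, 10, hidden*2]
doc_vector = tf.reduce_mean(doc_encoded, axis=1)                          # document representation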
So AI_law is a joint model: one input, several outputs, and each output is associated with its own loss.
For more details, check: https://github.com/brightmart/ai_law
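As a rough sketch of the joint part (the head names, class counts, and loss choices below are illustrative assumptions, not the actual ai_law variables): one shared document vector feeds several output heads, and the final loss is the sum of the per-task losses.

# Hypothetical sketch of the joint loss; names and sizes are illustrative.
import tensorflow as tf

num_crimes, num_laws, hidden = 200, 183, 200
doc_vector = tf.placeholder(tf.float32, [None, hidden])     # document representation from the encoder
y_crime = tf.placeholder(tf.float32, [None, num_crimes])    # one-hot crime label
y_law = tf.placeholder(tf.float32, [None, num_laws])        # multi-hot relevant laws
y_months = tf.placeholder(tf.float32, [None, 1])            # length of imprisonment (regression target)

logits_crime = tf.layers.dense(doc_vector, num_crimes)
logits_law = tf.layers.dense(doc_vector, num_laws)
pred_months = tf.layers.dense(doc_vector, 1)

loss_crime = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(labels=y_crime, logits=logits_crime))
loss_law = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=y_law, logits=logits_law))
loss_months = tf.reduce_mean(tf.square(pred_months - y_months))
loss = loss_crime + loss_law + loss_months                   # a single optimizer minimizes the joint loss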
You can use the following for attentive attention:
- Vanilla version of the score function:
def attention_additive(input_sequence, attention_level, reuse_flag=False):
    """Additive score function; check the paper 'Neural Machine Translation by Jointly Learning to Align and Translate'.
    :param input_sequence: [num_units,1]
    :param attention_level: word or sentence level
    :return: scalar attention score
    """
    num_units = 100  # or input_sequence.get_shape().as_list()[0] to read it from the input
    with tf.variable_scope("attention_" + str(attention_level), reuse=reuse_flag):
        attention_vector = tf.get_variable("attention_vector_" + attention_level, shape=[num_units, 1])  # ,initializer=self.initializer
        W = tf.get_variable("W" + attention_level, shape=[num_units, num_units])  # ,initializer=self.initializer
        U = tf.get_variable("U" + attention_level, shape=[num_units, num_units])  # ,initializer=self.initializer
        v = tf.get_variable("v" + attention_level, shape=[1, num_units])          # ,initializer=self.initializer
        part1 = tf.matmul(W, input_sequence)    # [num_units,1] <---- ([num_units,num_units],[num_units,1])
        part2 = tf.matmul(U, attention_vector)  # [num_units,1] <---- ([num_units,num_units],[num_units,1])
        activation = tf.nn.tanh(part1 + part2)  # [num_units,1]
        result = tf.matmul(v, activation)       # [1,1] <---- ([1,num_units],[num_units,1])
        result = tf.reshape(result, ())         # scalar
    return result
- Version two of the score function, when the input is a sequence:
def attention_additive_sequences(input_sequence, attention_level, reuse_flag=False):
    """Additive score function over a sequence; check the paper 'Neural Machine Translation by Jointly Learning to Align and Translate'.
    :param input_sequence: [num_units,sequence_length]
    :param attention_level: word or sentence level
    :return: [sequence_length] attention scores
    """
    sequence_length = input_sequence.get_shape().as_list()[-1]
    print("sequence_length:", sequence_length)
    num_units = 100  # or input_sequence.get_shape().as_list()[0] to read it from the input
    with tf.variable_scope("attention_" + str(attention_level), reuse=reuse_flag):
        attention_vector = tf.get_variable("attention_vector_" + attention_level, shape=[num_units, 1])  # ,initializer=self.initializer
        W = tf.get_variable("W" + attention_level, shape=[num_units, num_units])  # ,initializer=self.initializer
        U = tf.get_variable("U" + attention_level, shape=[num_units, num_units])  # ,initializer=self.initializer
        v = tf.get_variable("v" + attention_level, shape=[1, num_units])          # ,initializer=self.initializer
        part1 = tf.matmul(W, input_sequence)    # [num_units,sequence_length] <---- ([num_units,num_units],[num_units,sequence_length])
        part2 = tf.matmul(U, attention_vector)  # [num_units,1] <---- ([num_units,num_units],[num_units,1]); broadcast over sequence_length
        activation = tf.nn.tanh(part1 + part2)  # [num_units,sequence_length]
        result = tf.matmul(v, activation)       # [1,sequence_length] <---- ([1,num_units],[num_units,sequence_length])
        result = tf.squeeze(result)             # [sequence_length]
    return result
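Both score functions above only return raw scores. As the question points out, the paper then applies a softmax and takes a weighted sum of the hidden states; here is a minimal sketch of that final step, assuming the same [num_units,sequence_length] layout as version two:

# input_sequence: [num_units,sequence_length], e.g. the bi-lstm hidden states of one sentence
scores = attention_additive_sequences(input_sequence, "word")              # [sequence_length] raw scores
weights = tf.nn.softmax(scores)                                            # [sequence_length], sums to 1
context = tf.matmul(input_sequence, tf.expand_dims(weights, -1))           # [num_units,1] weighted sum of hidden states
context = tf.squeeze(context, axis=-1)                                     # [num_units] context vector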
- Version three of additive attention (score function --> context vector): GPU version with batch input:
def attention_additive_batch(self, input_sequences_original, attention_level, reuse_flag=False):
    """Additive attention (supports a batch of input sequences); check the paper 'Neural Machine Translation by Jointly Learning to Align and Translate'.
    :param input_sequences_original: [batch_size,sequence_length,num_units], typically [batch_size,sequence_length,num_units*2] from a bi-lstm
    :param attention_level: word or sentence level
    :return: [batch_size,num_units] context vectors
    """
    input_sequences = tf.transpose(input_sequences_original, perm=[0, 2, 1])  # [batch_size,num_units,sequence_length] <--- [batch_size,sequence_length,num_units]
    _, num_units, sequence_length = input_sequences.get_shape().as_list()
    print("###attention_additive_batch.input_sequences:", input_sequences, ";attention_level:", attention_level,
          ";num_units:", num_units, ";sequence_length:", sequence_length)
    with tf.variable_scope("attention_" + str(attention_level), reuse=reuse_flag):
        # 1. create or get learnable variables
        attention_vector = tf.get_variable("attention_vector_" + attention_level, shape=[num_units, 1], initializer=self.initializer)
        W = tf.get_variable("W" + attention_level, shape=[1, num_units, num_units], initializer=self.initializer)
        U = tf.get_variable("U" + attention_level, shape=[num_units, num_units], initializer=self.initializer)
        v = tf.get_variable("v" + attention_level, shape=[1, 1, num_units], initializer=self.initializer)
        # 2. get part1 and part2 of additive attention
        W = tf.tile(W, (self.batch_size, 1, 1))                         # [batch_size,num_units,num_units]
        part1 = tf.matmul(W, input_sequences)                           # [batch_size,num_units,sequence_length] <---- ([batch_size,num_units,num_units],[batch_size,num_units,sequence_length])
        part2 = tf.expand_dims(tf.matmul(U, attention_vector), axis=0)  # [1,num_units,1] <--- [num_units,1] <---- ([num_units,num_units],[num_units,1])
        # 3. activation
        activation = tf.nn.tanh(part1 + part2)                          # [batch_size,num_units,sequence_length]
        # 4. get attention scores by using matmul
        v = tf.tile(v, (self.batch_size, 1, 1))                         # [batch_size,1,num_units]
        score = tf.matmul(v, activation)                                # [batch_size,1,sequence_length] <---- ([batch_size,1,num_units],[batch_size,num_units,sequence_length])
        score = tf.squeeze(score, axis=1)                               # [batch_size,sequence_length]
        # 5. normalize using softmax
        weights = tf.nn.softmax(score, axis=1)                          # [batch_size,sequence_length]
        # 6. weighted sum
        weights = tf.expand_dims(weights, axis=-1)                      # [batch_size,sequence_length,1]
        result = tf.multiply(input_sequences_original, weights)         # [batch_size,sequence_length,num_units]
        result = tf.reduce_sum(result, axis=1)                          # [batch_size,num_units]
    return result  # [batch_size,num_units]
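Note that this batch version already includes the softmax (step 5) and the weighted sum over the hidden states (step 6), so it returns the context vector directly. A hypothetical call inside the model class (hidden_states and sentence_states are illustrative names, not the actual HAN_model.py variables):

# hidden_states: [batch_size, sentence_length, num_units] output of the word-level bi-lstm
sentence_vector = self.attention_additive_batch(hidden_states, "word")            # [batch_size, num_units]
# stack the sentence vectors, run the sentence-level bi-lstm, then pool again:
document_vector = self.attention_additive_batch(sentence_states, "sentence")      # [batch_size, num_units]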
For multiplicative attention:
def attention_multiply(self, input_sequences, attention_level, reuse_flag=False):
    """Multiplicative attention over a batch of sequences.
    :param input_sequences: [batch_size,sequence_length,num_units]
    :param attention_level: word or sentence level
    :return: [batch_size,num_units]
    """
    num_units = input_sequences.get_shape().as_list()[-1]  # get last dimension
    with tf.variable_scope("attention_" + str(attention_level), reuse=reuse_flag):
        v_attention = tf.get_variable("u_attention" + attention_level, shape=[num_units], initializer=self.initializer)
        # 1. one-layer MLP (non-linear projection of the hidden states)
        u = tf.layers.dense(input_sequences, num_units, activation=tf.nn.tanh, use_bias=True)  # [batch_size,sequence_length,num_units]
        # 2. compute weights from the similarity of u and the attention vector v_attention
        score = tf.multiply(u, v_attention)  # [batch_size,sequence_length,num_units]
        score = tf.reduce_sum(score, axis=2, keepdims=True) / tf.sqrt(tf.cast(num_units, tf.float32))  # [batch_size,sequence_length,1], scaled
        weight = tf.nn.softmax(score, axis=1)  # [batch_size,sequence_length,1]
        # 3. weighted sum of the hidden states
        attention_representation = tf.reduce_sum(tf.multiply(input_sequences, weight), axis=1)  # [batch_size,num_units]
    return attention_representation
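For what it's worth, this multiplicative version is the one closest to the formulation in the HAN paper: a one-layer MLP with tanh, a dot product with a learned context vector, a softmax over positions, and a weighted sum of the hidden states. The division by sqrt(num_units) is an extra scaling in the style of scaled dot-product attention, not something from the HAN paper. A hypothetical call at both levels (variable names are illustrative):

# word_hidden_states: [batch*num_sentences, sentence_length, num_units] from the word-level bi-lstm
sentence_vectors = self.attention_multiply(word_hidden_states, "word")            # [batch*num_sentences, num_units]
# sentence_hidden_states: [batch_size, num_sentences, num_units] from the sentence-level bi-lstm
document_vector = self.attention_multiply(sentence_hidden_states, "sentence")     # [batch_size, num_units]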