Also, I'm reading this again and isn't there supposed to be a `tf.nn.compute_average_loss` call before computing and applying the gradients?
```python
with tf.GradientTape() as tape:
    actions, log_pis = self._policy.actions_and_log_probs(observations)

    Qs_log_targets = tuple(
        Q.values(observations, actions) for Q in self._Qs)
    Q_log_targets = tf.reduce_min(Qs_log_targets, axis=0)

    policy_losses = self._alpha * log_pis - Q_log_targets

    tf.debugging.assert_shapes((
        (actions, ('B', 'nA')),
        (log_pis, ('B', 1)),
        (policy_losses, ('B', 1)),
    ))

policy_gradients = tape.gradient(
    policy_losses, self._policy.trainable_variables)

self._policy_optimizer.apply_gradients(zip(
    policy_gradients, self._policy.trainable_variables))
```
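For reference, here's a minimal standalone sketch of what I mean (toy model, optimizer, and observations, not the actual policy update above): the per-sample losses are reduced to a scalar with `tf.nn.compute_average_loss` inside the tape, and the gradient is then taken of that scalar.

```python
import tensorflow as tf

# Toy stand-ins for the policy, optimizer, and batch (names are just for illustration).
model = tf.keras.Sequential([tf.keras.layers.Dense(1)])
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-4)
observations = tf.random.normal((256, 4))  # (B, obs_dim)

with tf.GradientTape() as tape:
    per_sample_losses = tf.square(model(observations))  # shape (B, 1)
    # Reduce to a scalar before differentiating, so the gradient scale
    # does not implicitly depend on the batch size.
    loss = tf.nn.compute_average_loss(per_sample_losses)

gradients = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(gradients, model.trainable_variables))
```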
Originally posted by @externalhardrive in #136 (comment)