rail-berkeley/softlearning

SAC gradients weighted inconsistently

hartikainen opened this issue

Also, I'm reading this again, and isn't there supposed to be a tf.nn.compute_average_loss call before the gradients are calculated and applied?

with tf.GradientTape() as tape:
    actions, log_pis = self._policy.actions_and_log_probs(observations)
    Qs_log_targets = tuple(
        Q.values(observations, actions) for Q in self._Qs)
    Q_log_targets = tf.reduce_min(Qs_log_targets, axis=0)
    policy_losses = self._alpha * log_pis - Q_log_targets

tf.debugging.assert_shapes((
    (actions, ('B', 'nA')),
    (log_pis, ('B', 1)),
    (policy_losses, ('B', 1)),
))

policy_gradients = tape.gradient(
    policy_losses, self._policy.trainable_variables)
self._policy_optimizer.apply_gradients(zip(
    policy_gradients, self._policy.trainable_variables))

Originally posted by @externalhardrive in #136 (comment)