CyberZHG/keras-radam

Very slow implementation

MaximusMutschler opened this issue · 1 comment

Describe the Bug
After doing some tests with your TensorFlow implementation in training.py, I noticed that your code is unexpectedly slow. I tested RAdam on a ResNet32 trained on CIFAR-10 with the following configuration: Nvidia RTX 2080 Ti, TensorFlow 1.15, CUDA 10.0, cuDNN 7.6.4.
Your RAdam implementation achieved 2510 train steps per minute.
The TensorFlow Adam implementation achieved 3088 train steps per minute.
I then reimplemented RAdam in a very basic way and achieved 2840 train steps per minute.
Unfortunately, I have no idea why your implementation is that slow.
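
Steps per minute can be measured with simple wall-clock timing of the training loop, roughly like in the sketch below (the tiny quadratic graph and tf.train.AdamOptimizer are only stand-ins for illustration, not the ResNet32/CIFAR-10 setup behind the numbers above):

import time
import tensorflow as tf

# stand-in graph; in the real benchmark this would be ResNet32 trained on CIFAR-10
w = tf.Variable(tf.zeros([1000, 1000]))
loss = tf.reduce_sum(tf.square(w - 1.0))
train_op = tf.train.AdamOptimizer(0.001).minimize(loss)

num_steps = 1000
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    start = time.time()
    for _ in range(num_steps):
        sess.run(train_op)
    print("train steps per minute:", num_steps * 60.0 / (time.time() - start))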

My basic RADAM implementation for reference:

import tensorflow as tf


class RAdamOptimizer(tf.train.Optimizer):

    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, use_locking=True):
        super().__init__(use_locking, "RAdam")
        self.learning_rate = learning_rate
        self.beta_1 = float(beta1)
        self.beta_2 = float(beta2)
        self.epsilon = float(epsilon)
        # maximum length of the approximated SMA: roh_inf = 2 / (1 - beta2) - 1
        self.roh_inf = 2.0 / (1.0 - beta2) - 1.0
        print("roh_inf=", self.roh_inf)

    def apply_gradients(self, grad_var_tuples, global_step, name="train_optimizer"):
        self._train_vars = [x[1] for x in grad_var_tuples]
        self._grads = [x[0] for x in grad_var_tuples]
        return self._initialize_train_ops(global_step)

    def _initialize_train_ops(self, global_step, name="train_optimizer"):
        if global_step is None:
            self._global_step = tf.Variable(1.0, trainable=False, name="global_step", dtype=tf.float32)
        else:
            self._global_step = global_step
        self._increase_global_step_op = tf.assign(self._global_step, self._global_step + 1)
        time_step = tf.cast(self._global_step, dtype=tf.float32)

        with tf.variable_scope("RAdam_Variables"):
            self._m_hat_ops = []
            self._v_vars_ops = []
            for grad, var in zip(self._grads, self._train_vars):
                # one pair of first/second moment accumulators per trainable variable
                # (grad.name[0:-2] strips the trailing ":0" from the gradient tensor name)
                new_var_1 = tf.Variable(tf.zeros(var.shape), trainable=False, name=grad.name[0:-2] + "_m")
                new_var_2 = tf.Variable(tf.zeros(var.shape), trainable=False, name=grad.name[0:-2] + "_v")

                m_op = new_var_1.assign(self.beta_1 * new_var_1 + (1.0 - self.beta_1) * grad,
                                        use_locking=self._use_locking)
                v_op = new_var_2.assign(self.beta_2 * new_var_2 + (1.0 - self.beta_2) * tf.multiply(grad, grad),
                                        use_locking=self._use_locking)
                # bias-corrected first moment
                m_hat_op = m_op / (1.0 - tf.pow(self.beta_1, time_step))
                self._m_hat_ops.append(m_hat_op)
                self._v_vars_ops.append(v_op)

        with tf.name_scope("Weight_Update_Operators"):
            self.weight_vars_assign_ops = []
            # length of the approximated SMA at the current step
            roh_t = self.roh_inf - 2.0 * time_step * tf.pow(self.beta_2, time_step) / (
                        1.0 - tf.pow(self.beta_2, time_step))
            # variance rectification term; r_t is NaN if roh_t <= 4 -> wanted behavior,
            # since the tf.cond below never uses it in that case
            r_t = tf.sqrt(((roh_t - 4.0) * (roh_t - 2.0) * self.roh_inf) / ((self.roh_inf - 4.0) * (
                        self.roh_inf - 2.0) * roh_t))

            def roh_t_greater_4(m_hat_op, v_op):
                # variance is tractable: rectified adaptive update
                v_hat_op = tf.sqrt(v_op / (1.0 - tf.pow(self.beta_2, time_step)))
                update = -self.learning_rate * r_t * m_hat_op / (v_hat_op + self.epsilon)
                return update

            def roh_t_se_4(m_hat_op):
                # variance not yet tractable: plain momentum (SGD-like) update
                update = -self.learning_rate * m_hat_op
                return update

            for weight_matrix, m_hat_op, v_op in zip(self._train_vars, self._m_hat_ops, self._v_vars_ops):
                update = tf.cond(tf.greater(roh_t, 4.0), lambda: roh_t_greater_4(m_hat_op, v_op),
                                 lambda: roh_t_se_4(m_hat_op))
                ass_op = tf.assign_add(weight_matrix, update, use_locking=self._use_locking)
                self.weight_vars_assign_ops.append(ass_op)
            with tf.control_dependencies([self._increase_global_step_op]):
                a = tf.group(self.weight_vars_assign_ops)
            return a

    def minimize(self, loss_tensor, global_step=None):
        self._train_vars = tf.trainable_variables()
        self._grads = tf.gradients(loss_tensor, self._train_vars, colocate_gradients_with_ops=True)
        return self._initialize_train_ops(global_step)
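
For completeness, a minimal usage sketch of the class above (the tiny quadratic loss is just a placeholder for a real model graph such as the ResNet32 used in the benchmark):

import tensorflow as tf

# placeholder model: a single trainable variable with a quadratic loss
w = tf.Variable([1.0, 2.0, 3.0], name="w")
loss = tf.reduce_sum(tf.square(w))

optimizer = RAdamOptimizer(learning_rate=0.1)
train_op = optimizer.minimize(loss)  # creates its own global step since none is passed

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for step in range(10):
        _, loss_val = sess.run([train_op, loss])
        print("step", step, "loss", loss_val)
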
stale commented

Is this still relevant? If so, what is blocking it? Is there anything you can do to help move it forward?

This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs.