CyberZHG/keras-radam

Numerical instability with higher learning rates & larger networks

ecarlson156 opened this issue · 3 comments

Describe the Bug
Numerical instability with learning rate 0.1 on CIFAR-10 with a simple VGG7 architecture, using TF 2.0 Keras.
The problem does not occur with learning rate 0.001.

The problem also does not occur when using a simple dummy architecture with one layer.

Training Output
Using RAdam on VGG7:
"Epoch 1, Loss: nan, Accuracy: 10.010000228881836, Test Loss: nan, Test Accuracy: 10.0"

Using RAdam on the 1-layer dummy:
"Epoch 1, Loss: 13058.6318359375, Accuracy: 10.107999801635742, Test Loss: 2.3037562370300293, Test Accuracy: 10.0"

When using Adam:
"Epoch 1, Loss: 14431383552.0, Accuracy: 9.960000038146973, Test Loss: 2.3032336235046387, Test Accuracy: 10.0"

Version Info
tensorflow-gpu 2.0.0-beta1

Minimal Codes To Reproduce

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.datasets import cifar10
from keras_radam.optimizer_v2 import RAdam
from tensorflow.keras import Model
import numpy as np
from tensorflow.keras.layers import Dense, Flatten, Conv2D, BatchNormalization, MaxPooling2D, AvgPool2D, ReLU

import argparse
import sys

# parse CLI
parser = argparse.ArgumentParser()
parser.add_argument("--batch_size", type=int, default=128)
parser.add_argument("--epochs", type=int, default=150)
parser.add_argument("--learning_rate", type=float, default=0.1)
parser.add_argument("--opt", type=str, default='radam')
parser.add_argument("--cfg", type=int, default=7)
args = parser.parse_args(sys.argv[1:])

cfgs = {
    7: [128, 128, 'M', 256, 256, 'M', 512, 512, 'M'],
    1: [128, 'M'],
}


class VGG(Model):
    def __init__(self, cfg=7, batch_norm=True):
        print(cfg)
        super(VGG, self).__init__()
        self.mylayers = []
        for c in cfgs[cfg]:
            if c == 'M':
                self.mylayers.append(MaxPooling2D(strides=(2,2)))
            else:
                self.mylayers.append(Conv2D(c, (3,3), padding='same'))
                if batch_norm and len(self.mylayers) > 1:
                    self.mylayers.append(BatchNormalization())
                self.mylayers.append(ReLU())
        self.mylayers.append(Flatten())
        self.mylayers.append(Dense(10))
        self.mylayers.append(tf.keras.layers.Softmax())

    def _get_layers(self):
        weights=[]
        for layer in self.layers:
            weights.append(layer.get_weights())
        return weights


    def call(self, x):
        for layer in self.mylayers:
            x = layer(x)
        return x

def make_model(**kwargs):
    a = tf.keras.layers.Input(shape=(32, 32, 3))
    model = VGG(**kwargs)
    b = model(a)
    return tf.keras.models.Model(inputs=a, outputs=b)


# create the model
model = make_model(cfg=args.cfg)

# The data, split between train and test sets:
(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# Convert class vectors to binary class matrices.
num_classes = 10
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)
# cast to float32 (note: pixel values are left in the 0-255 range, not rescaled)
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')

# create dataset objects
train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(10000).batch(args.batch_size)
test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).shuffle(1000).batch(args.batch_size)

# declare loss functions
loss_object = tf.keras.losses.CategoricalCrossentropy()

if args.opt == 'adam':
    optimizer = tf.keras.optimizers.Adam(learning_rate=args.learning_rate)
elif args.opt == 'radam':
    optimizer = RAdam(learning_rate=args.learning_rate)

train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.CategoricalAccuracy(name='train_accuracy')
test_loss = tf.keras.metrics.Mean(name='test_loss')
test_accuracy = tf.keras.metrics.CategoricalAccuracy(name='test_accuracy')

# declare training step
@tf.function
def train_step(images, labels):
    with tf.GradientTape() as tape:
        predictions = model(images)
        loss = loss_object(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))

    train_loss(loss)
    train_accuracy(labels, predictions)

# declare test step
@tf.function
def test_step(images, labels):
    predictions = model(images)
    t_loss = loss_object(labels, predictions)

    test_loss(t_loss)
    test_accuracy(labels, predictions)

for epoch in range(args.epochs):
    for images, labels in train_ds:
        train_step(images, labels)

    for test_images, test_labels in test_ds:
        test_step(test_images, test_labels)

    template = 'Epoch {}, Loss: {}, Accuracy: {}, Test Loss: {}, Test Accuracy: {}'
    print(template.format(epoch+1,
                        train_loss.result(),
                        train_accuracy.result()*100,
                        test_loss.result(),
                        test_accuracy.result()*100))

    # Reset the metrics for the next epoch
    train_loss.reset_states()
    train_accuracy.reset_states()
    test_loss.reset_states()
    test_accuracy.reset_states()
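
To help localize where the NaNs first appear, one option (a sketch only, reusing the model, loss_object, optimizer, and metrics defined above) is to wrap each gradient in tf.debugging.check_numerics, so the step fails fast and names the offending variable instead of silently writing NaN into the weights:

# Sketch of a checked training step; identical to train_step above except for the
# check_numerics pass over the gradients.
@tf.function
def train_step_checked(images, labels):
    with tf.GradientTape() as tape:
        predictions = model(images)
        loss = loss_object(labels, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    # Raises InvalidArgumentError with the variable name as soon as a gradient
    # contains NaN or Inf.
    gradients = [
        tf.debugging.check_numerics(g, message=v.name)
        for g, v in zip(gradients, model.trainable_variables)
    ]
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss(loss)
    train_accuracy(labels, predictions)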

I'm having the same experience. I'm using learning rate 0.001 and I'm getting infinity in summary histograms.
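
For reference, a minimal sketch of the kind of summary logging I mean (the log directory and helper name are made up), called once per epoch to watch the weights blow up in TensorBoard:

# Write one histogram per trainable variable so the epoch where values become
# non-finite is visible in TensorBoard.
writer = tf.summary.create_file_writer('logs/radam_debug')

def log_weight_histograms(step):
    with writer.as_default():
        for var in model.trainable_variables:
            tf.summary.histogram(var.name, var, step=step)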

I also have this same issue: training only works with small learning rates (0.001).
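
In case it's useful, two things that may be worth trying (a sketch only, not verified against this repro): the keras-radam README documents warmup arguments on RAdam, and tf.keras optimizers accept clipnorm for gradient clipping; both tend to tame early blow-ups at large learning rates.

# Warmup settings follow the keras-radam README; the step count below assumes
# CIFAR-10's 50000 training images and is only illustrative.
optimizer = RAdam(
    learning_rate=0.1,
    total_steps=args.epochs * (50000 // args.batch_size),
    warmup_proportion=0.1,   # linear warmup over the first 10% of steps
    min_lr=1e-5,
)

# Alternatively, keep the optimizer as-is but clip gradients by global norm:
# optimizer = tf.keras.optimizers.Adam(learning_rate=0.1, clipnorm=1.0)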

stale commented

This issue has been automatically marked as stale because it has not had recent activity. It will be closed if no further activity occurs. Thank you for your contributions.