markovmodel/deeptime

vamp._loss_VAMP_sym error

euhruska opened this issue · 4 comments

With vamp._loss_VAMP_sym, in the following line:

hist = model.fit_generator(generator = vamp_data_loader.build_generator_on_source(train_data_source,
                                                      batch_size,
                                                      tau,
                                                      output_size),
                           steps_per_epoch = steps_per_train_epoch,
                           epochs = nb_epoch,
                           verbose = 0,
                           validation_data = vamp_data_loader.build_generator_on_source(valid_data_source,
                                                            batch_size,
                                                            tau,
                                                            output_size),
                           validation_steps = steps_per_valid_epoch,
                           shuffle = True
                          )

I get a fatal error:

---------------------------------------------------------------------------
InvalidArgumentError                      Traceback (most recent call last)
<ipython-input-54-480bb4a81ec5> in <module>()
     28                                                                     output_size),
     29                                    validation_steps = steps_per_valid_epoch,
---> 30                                    shuffle = True
     31                                   )
     32 

/scratch1/eh22/conda/envs/extasy13/lib/python3.6/site-packages/keras/legacy/interfaces.py in wrapper(*args, **kwargs)
     89                 warnings.warn('Update your `' + object_name +
     90                               '` call to the Keras 2 API: ' + signature, stacklevel=2)
---> 91             return func(*args, **kwargs)
     92         wrapper._original_function = func
     93         return wrapper

/scratch1/eh22/conda/envs/extasy13/lib/python3.6/site-packages/keras/engine/training.py in fit_generator(self, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
   1413             use_multiprocessing=use_multiprocessing,
   1414             shuffle=shuffle,
-> 1415             initial_epoch=initial_epoch)
   1416 
   1417     @interfaces.legacy_generator_methods_support

/scratch1/eh22/conda/envs/extasy13/lib/python3.6/site-packages/keras/engine/training_generator.py in fit_generator(model, generator, steps_per_epoch, epochs, verbose, callbacks, validation_data, validation_steps, class_weight, max_queue_size, workers, use_multiprocessing, shuffle, initial_epoch)
    228                             val_enqueuer_gen,
    229                             validation_steps,
--> 230                             workers=0)
    231                     else:
    232                         # No need for try/except because

/scratch1/eh22/conda/envs/extasy13/lib/python3.6/site-packages/keras/legacy/interfaces.py in wrapper(*args, **kwargs)
     89                 warnings.warn('Update your `' + object_name +
     90                               '` call to the Keras 2 API: ' + signature, stacklevel=2)
---> 91             return func(*args, **kwargs)
     92         wrapper._original_function = func
     93         return wrapper

/scratch1/eh22/conda/envs/extasy13/lib/python3.6/site-packages/keras/engine/training.py in evaluate_generator(self, generator, steps, max_queue_size, workers, use_multiprocessing, verbose)
   1467             workers=workers,
   1468             use_multiprocessing=use_multiprocessing,
-> 1469             verbose=verbose)
   1470 
   1471     @interfaces.legacy_generator_methods_support

/scratch1/eh22/conda/envs/extasy13/lib/python3.6/site-packages/keras/engine/training_generator.py in evaluate_generator(model, generator, steps, max_queue_size, workers, use_multiprocessing, verbose)
    341                                  'or (x, y). Found: ' +
    342                                  str(generator_output))
--> 343             outs = model.test_on_batch(x, y, sample_weight=sample_weight)
    344             outs = to_list(outs)
    345             outs_per_batch.append(outs)

/scratch1/eh22/conda/envs/extasy13/lib/python3.6/site-packages/keras/engine/training.py in test_on_batch(self, x, y, sample_weight)
   1252             ins = x + y + sample_weights
   1253         self._make_test_function()
-> 1254         outputs = self.test_function(ins)
   1255         return unpack_singleton(outputs)
   1256 

/scratch1/eh22/conda/envs/extasy13/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py in __call__(self, inputs)
   2664                 return self._legacy_call(inputs)
   2665 
-> 2666             return self._call(inputs)
   2667         else:
   2668             if py_any(is_tensor(x) for x in inputs):

/scratch1/eh22/conda/envs/extasy13/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py in _call(self, inputs)
   2634                                 symbol_vals,
   2635                                 session)
-> 2636         fetched = self._callable_fn(*array_vals)
   2637         return fetched[:len(self.outputs)]
   2638 

/scratch1/eh22/conda/envs/extasy13/lib/python3.6/site-packages/tensorflow/python/client/session.py in __call__(self, *args, **kwargs)
   1380           ret = tf_session.TF_SessionRunCallable(
   1381               self._session._session, self._handle, args, status,
-> 1382               run_metadata_ptr)
   1383         if run_metadata:
   1384           proto_data = tf_session.TF_GetBuffer(run_metadata_ptr)

/scratch1/eh22/conda/envs/extasy13/lib/python3.6/site-packages/tensorflow/python/framework/errors_impl.py in __exit__(self, type_arg, value_arg, traceback_arg)
    517             None, None,
    518             compat.as_text(c_api.TF_Message(self.status.status)),
--> 519             c_api.TF_GetCode(self.status.status))
    520     # Delete the underlying status object from memory otherwise it stays alive
    521     # as there is a reference to status from this from the traceback due to

InvalidArgumentError: Got info = 2 for batch index 0, expected info = 0. Debug_info = heevd
   [[Node: metrics_4/metric_VAMP/SelfAdjointEigV2 = SelfAdjointEigV2[T=DT_FLOAT, compute_v=true, _device="/job:localhost/replica:0/task:0/device:GPU:0"](loss_4/concatenate_1_loss/mul_3)]]
   [[Node: loss_4/mul/_603 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_450_loss_4/mul", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]

I don't get this error. Which TensorFlow version are you using? This issue sometimes appeared on TF 1.7-1.8.

tf version 1.10.0

For us this works with that version of TensorFlow, and we are unable to reproduce the error. Are you working on a GPU? (Perhaps the CUDA version also plays a role...)
Could you tell us your CUDA version and, if applicable, which cuDNN version you are using?
Thx
Andreas

It might also be that it failed to compute the SVD for the first batch, so there could be a problem with your data. The documentation of heevd (which is an eigenvalue/eigenvector solver) says:

if INFO = i and JOBZ = 'N', then the algorithm failed
                to converge; i off-diagonal elements of an intermediate
                tridiagonal form did not converge to zero;
if INFO = i and JOBZ = 'V', then the algorithm failed
                to compute an eigenvalue while working on the submatrix
                lying in rows and columns INFO/(N+1) through
                mod(INFO,N+1).