ValueError: Input contains NaN, infinity or a value too large for dtype('float32') when training on the 'burgers_delta' dataset
Closed this issue · 2 comments
psaegert commented
Hi,
I am trying to reproduce your results using the 'burgers_delta' dataset. However, I often get the following error when the training begins:
Using device: cuda
Dataset is using device: cuda
2021-08-17 13:22:26.965021: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
0 MSE: nan Reg: 1.52e-05 L1: 1.83e+01
Traceback (most recent call last):
File "replicate_burgers_troubleshoot.py", line 96, in <module>
train(
File "/home/paulsaegert/.local/lib/python3.8/site-packages/deepymod/training/training.py", line 100, in train
_ = model.sparse_estimator(
File "/home/paulsaegert/.local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/home/paulsaegert/.local/lib/python3.8/site-packages/deepymod/model/deepmod.py", line 141, in forward
self.coeff_vectors = [
File "/home/paulsaegert/.local/lib/python3.8/site-packages/deepymod/model/deepmod.py", line 142, in <listcomp>
self.fit(theta, time_deriv.squeeze())[:, None]
File "/home/paulsaegert/.local/lib/python3.8/site-packages/deepymod/model/sparse_estimators.py", line 77, in fit
coeffs = self.estimator.fit(X, y).coef_
File "/home/paulsaegert/.local/lib/python3.8/site-packages/sklearn/linear_model/_coordinate_descent.py", line 1216, in fit
X, y = self._validate_data(X, y,
File "/home/paulsaegert/.local/lib/python3.8/site-packages/sklearn/base.py", line 430, in _validate_data
X = check_array(X, **check_X_params)
File "/home/paulsaegert/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
return f(*args, **kwargs)
File "/home/paulsaegert/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 720, in check_array
_assert_all_finite(array,
File "/home/paulsaegert/.local/lib/python3.8/site-packages/sklearn/utils/validation.py", line 103, in _assert_all_finite
raise ValueError(
ValueError: Input contains NaN, infinity or a value too large for dtype('float32').
This is the python script I used:
import os
import torch
from deepymod import DeepMoD
from deepymod.data import Dataset, get_train_test_loader
from deepymod.data.burgers import burgers_delta
from deepymod.data.samples import Subsample_random
from deepymod.model.func_approx import NN
from deepymod.model.library import Library1D
from deepymod.model.constraint import LeastSquares
from deepymod.model.sparse_estimators import Threshold
from deepymod.training import train
from deepymod.training.sparsity_scheduler import TrainTestPeriodic
import time
import shutil
# Select the compute device: prefer the GPU whenever CUDA is available.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f'Using device: {device}')
# Start from a clean slate: remove any logs/checkpoints left over from a
# previous run so stale state cannot leak into this experiment.
# NOTE(review): the checkpoint/model paths concatenate the directory name
# without a separator (e.g. 'temp_logcheckpoint.pt') — presumably matching
# how deepymod builds its output paths; verify against the library.
temp_log_dir = 'temp_log'
if os.path.exists(temp_log_dir):
    shutil.rmtree(temp_log_dir)
for leftover in (f'{temp_log_dir}checkpoint.pt', f'{temp_log_dir}model.pt'):
    if os.path.exists(leftover):
        os.remove(leftover)
# Ground-truth Burgers coefficients (viscosity v, amplitude A) used both to
# generate the synthetic data and to judge the discovered equation.
result = {
    'true_coefficients': (0.1, 1.0)
}
# Experiment configuration for data generation, network topology, library
# construction, sparsity scheduling, convergence, and the optimizer.
parameters = {
    'x_min': -8,
    'x_max': 8,
    # BUG FIX: the burgers_delta initial condition is a delta peak, so the
    # analytic solution diverges at t = 0. Sampling at t = 0 produces
    # NaN/inf library terms that crash sklearn's estimator with
    # "Input contains NaN, infinity or a value too large for dtype('float32')".
    # Start sampling slightly after t = 0 so every value stays finite.
    't_min': 0.1,
    't_max': 10,
    'n_samples': 100,
    'x_resolution': 256,
    't_resolution': 101,
    'noise_level': 0.01,
    'train_size': None,  # filled in after the train/test split is known
    'network_topology': (2, [50, 50, 50, 50], 1),
    'lib_poly_order': 2,
    'lib_diff_order': 3,
    'sparse_threshold': 0.1,
    'sparsity_scheduler_periodicity': 50,
    'sparsity_scheduler_patience': 200,
    'sparsity_scheduler_delta': 1e-5,
    'write_iterations': 25,
    'max_iterations': 50_000,
    'converge_patience': 200,
    'converge_delta': 1e-4,
    'optimizer_learning_rate': 2e-3,
    'optimizer_beta1': 0.99,
    'optimizer_beta2': 0.99,
    'optimizer_amsgrad': True,
}
# Build the Burgers delta-peak dataset on the chosen device and randomly
# subsample a fixed number of space-time points for training.
v, A = result['true_coefficients']
x_grid = torch.linspace(parameters['x_min'], parameters['x_max'], parameters['x_resolution'])
t_grid = torch.linspace(parameters['t_min'], parameters['t_max'], parameters['t_resolution'])
dataset = Dataset(
    burgers_delta,
    load_kwargs={"x": x_grid, "t": t_grid, "v": v, "A": A},
    preprocess_kwargs={"noise_level": parameters['noise_level']},
    subsampler=Subsample_random,
    subsampler_kwargs={"number_of_samples": parameters['n_samples']},
    device=device,
)
# 80/20 train/test split of the subsampled points.
train_dataloader, test_dataloader = get_train_test_loader(
    dataset, train_test_split=0.8
)
# Record the actual number of training samples for bookkeeping.
parameters['train_size'] = len(train_dataloader[0][0])
# Assemble the DeepMoD model: a feed-forward approximator, a 1D library of
# candidate polynomial/derivative terms, a hard-threshold sparsity
# estimator, and a least-squares constraint, moved to the selected device.
network = NN(*parameters['network_topology'])
library = Library1D(
    poly_order=parameters['lib_poly_order'],
    diff_order=parameters['lib_diff_order'],
)
sparsity_estimator = Threshold(parameters['sparse_threshold'])
sparsity_scheduler = TrainTestPeriodic(
    periodicity=parameters['sparsity_scheduler_periodicity'],
    patience=parameters['sparsity_scheduler_patience'],
    delta=parameters['sparsity_scheduler_delta'],
)
constraint = LeastSquares()
model = DeepMoD(network, library, sparsity_estimator, constraint).to(device)
# Adam with non-default betas and AMSGrad, per the experiment configuration.
optimizer = torch.optim.Adam(
    model.parameters(),
    betas=(parameters['optimizer_beta1'], parameters['optimizer_beta2']),
    amsgrad=parameters['optimizer_amsgrad'],
    lr=parameters['optimizer_learning_rate'],
)
# Run DeepMoD training and remove the temporary log directory afterwards.
# NOTE(review): start_time is assigned but never read in this script —
# presumably a leftover from timing instrumentation.
start_time = time.time()
run_kwargs = dict(
    exp_ID="0.0",
    log_dir=temp_log_dir,
    write_iterations=parameters['write_iterations'],
    max_iterations=parameters['max_iterations'],
    delta=parameters['converge_delta'],
    patience=parameters['converge_patience'],
)
train(
    model,
    train_dataloader,
    test_dataloader,
    optimizer,
    sparsity_scheduler,
    **run_kwargs,
)
shutil.rmtree(temp_log_dir)
It seems like setting `x_min` and `x_max` to values closer to 0 helps, but I would like to specifically test the range from -8 to 8 as in your paper.
GJBoth commented
The issue here isn't in the x, but in the t. The initial condition is a delta peak, so the solution goes to infinity at t=0 — setting `t_min=0.1` or something similar should fix your issue. Let me know if it doesn't!
psaegert commented
Setting `t_min=0.1` fixed the issue. Thank you very much!