NaN output and loss value
afmsaif opened this issue · 0 comments
afmsaif commented
I am using the following training function and librispeech dataset. Every time the output of the model while training become Nan as a result the loss is also nan. What could be the possible issue.
class IterMeter(object):
"""keeps track of total iterations"""
def init(self):
self.val = 0
def step(self):
self.val += 1
def get(self):
return self.val
def train(model, device, train_loader, criterion, optimizer, scheduler, epoch):
model.train()
train_loss = 0
data_len = len(train_loader.dataset)
for batch_idx, _data in enumerate(train_loader):
spectrograms, labels, input_lengths, label_lengths = _data
spectrograms=torch.squeeze(spectrograms, dim=1)
spectrograms = spectrograms.transpose(1,2)
labels= torch.LongTensor(labels.long())
input_lengths=torch.LongTensor(input_lengths)
label_lengths=torch.LongTensor(label_lengths)
input_lengths = input_lengths.to(device)
label_lengths = label_lengths.to(device)
spectrograms, labels = spectrograms.to(device), labels.to(device)
print(spectrograms.type())
optimizer.zero_grad()
output, output_lengths = model(spectrograms,input_lengths) # (batch, time, n_class)
output = output.transpose(0, 1) # (time, batch, n_class)
loss = criterion(output, labels, output_lengths, label_lengths)
train_loss += loss.item() / len(train_loader)
loss.backward()
optimizer.step()
scheduler.step()
if batch_idx % 100 == 0 or batch_idx == data_len:
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, batch_idx * len(spectrograms), data_len,
100. * batch_idx / len(train_loader), loss.item()))
return train_loss
def test(model, device, test_loader, criterion, epoch,batch_size=20):
print('\nevaluating...')
model.eval()
test_loss = 0
test_cer, test_wer = [], []
n_classes = 29
if epoch%5==0:
with torch.no_grad():
for i, _data in enumerate(test_loader):
spectrograms, labels, input_lengths, label_lengths = _data
spectrograms=torch.squeeze(spectrograms)
spectrograms = spectrograms.transpose(1,2)
labels=labels.long()
input_lengths=torch.LongTensor(input_lengths)
label_lengths=torch.LongTensor(label_lengths)
input_lengths = input_lengths
label_lengths = label_lengths
spectrograms, labels = spectrograms.to(device), labels.to(device)
output, output_lengths = model(spectrograms,input_lengths) # (batch, time, n_class)
soft_max = torch.nn.functional.softmax(output,dim=2)
output = output.transpose(0, 1) # (time, batch, n_class)
loss = criterion(output, labels, output_lengths, label_lengths)
test_loss += loss.item() / len(test_loader)
decoder = CTCBeamDecoder(
[''] * (n_classes - 1) + [' '],
model_path=None,
alpha=0,
beta=0,
cutoff_top_n=40,
cutoff_prob=1.0,
beam_width=1000,
num_processes=4,
blank_id=28,
log_probs_input=False
)
beam_results, beam_scores, timesteps, out_lens = decoder.decode(soft_max, output_lengths)
b=[]
for i in range(batch_size):
b.append(beam_results[i][0][:out_lens[i][0]])
decoded_preds, decoded_targets = numtoword(b,out_lens,labels, label_lengths)
for j in range(len(decoded_preds)):
test_cer.append(cer(decoded_targets[j], decoded_preds[j]))
test_wer.append(wer(decoded_targets[j], decoded_preds[j]))
avg_cer = sum(test_cer)/len(test_cer)
avg_wer = sum(test_wer)/len(test_wer)
print('Test set: Average loss: {:.4f}, Average CER: {:4f} Average WER: {:.4f}\n'.format(test_loss, avg_cer, avg_wer))
return test_loss, avg_cer, avg_wer
else:
with torch.no_grad():
for i, _data in enumerate(test_loader):
spectrograms, labels, input_lengths, label_lengths = _data
spectrograms=torch.squeeze(spectrograms)
spectrograms = spectrograms.transpose(1,2)
labels=labels.long()
input_lengths=torch.LongTensor(input_lengths)
label_lengths=torch.LongTensor(label_lengths)
input_lengths = input_lengths.to(device)
label_lengths = label_lengths.to(device)
spectrograms, labels = spectrograms.to(device), labels.to(device)
output, output_lengths = model(spectrograms,input_lengths) # (batch, time, n_class)
soft_max = torch.nn.functional.softmax(output,dim=2)
output = output.transpose(0, 1) # (time, batch, n_class)
loss = criterion(output, labels, output_lengths, label_lengths)
test_loss += loss.item() / len(test_loader)
print('Test set: Average loss: {:.4f}\n'.format(test_loss))
return test_loss, 0 , 0
def main(learning_rate=5e-4, batch_size=20, epochs=10,
train_url="train-clean-100", test_url="test-clean"):
hparams = {
"n_class": 29,
"n_feats": 80,
"learning_rate": learning_rate,
"batch_size": batch_size,
"epochs": epochs
}
use_cuda = torch.cuda.is_available()
torch.manual_seed(7)
device = torch.device("cuda" if use_cuda else "cpu")
if not os.path.isdir("./data"):
os.makedirs("./data")
train_dataset = torchaudio.datasets.LIBRISPEECH("./data", url=train_url, download=True)
test_dataset = torchaudio.datasets.LIBRISPEECH("./data", url=test_url, download=True)
kwargs = {'num_workers': 4, 'pin_memory': True} if use_cuda else {}
train_loader = data.DataLoader(dataset=train_dataset,
batch_size=hparams['batch_size'],
shuffle=True,
collate_fn=lambda x: data_processing(x, 'train'),
**kwargs)
test_loader = data.DataLoader(dataset=test_dataset,
batch_size=hparams['batch_size'],
shuffle=False,
collate_fn=lambda x: data_processing(x, 'valid'),
**kwargs)
model = Conformer(num_classes=hparams['n_class'],
input_dim=hparams['n_feats'],
encoder_dim=512,
num_encoder_layers=1)
model = nn.DataParallel(model)
model.to(device)
print('Num Model Parameters', sum([param.nelement() for param in model.parameters()]))
optimizer = optim.AdamW(model.parameters(), hparams['learning_rate'])
criterion = nn.CTCLoss().to(device)
scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=hparams['learning_rate'],
steps_per_epoch=int(len(train_loader)),
epochs=hparams['epochs'],
anneal_strategy='linear')
train_loss=[]
test_loss=[]
cer=[]
wer=[]
for epoch in range(1, epochs + 1):
tra_loss = train(model, device, train_loader, criterion, optimizer, scheduler, epoch)
tes_loss, c, w = test(model, device, test_loader, criterion, epoch)
train_loss.append(tra_loss)
test_loss.append(tes_loss)
cer.append(c)
wer.append(w)
return train_loss, test_loss, cer, wer