locuslab/TCN

DDP training with TCN Model

asifmustafa87 opened this issue · 0 comments

Hi,

I am using a TCN model for time series forecasting. My model looks like this:

class Chomp1d(nn.Module):
        """Drop the last ``chomp_size`` entries along the third dimension.

        Used after a ``Conv1d`` whose ``padding`` equals ``chomp_size`` to cut
        the extra trailing outputs that the padding produces.

        Args:
            chomp_size: Number of trailing entries to remove. ``0`` leaves the
                input untouched.
        """

        def __init__(self, chomp_size):
            super(Chomp1d, self).__init__()
            self.chomp_size = chomp_size

        def forward(self, x):
            """Return a contiguous copy of ``x`` without its last ``chomp_size`` entries."""
            # ``x[:, :, :-0]`` is ``x[:, :, :0]`` (an empty tensor), so a zero
            # chomp -- e.g. kernel_size == 1, hence padding == 0 -- must be
            # handled explicitly instead of through the negative slice.
            if self.chomp_size == 0:
                return x.contiguous()
            return x[:, :, :-self.chomp_size].contiguous()

    class TemporalBlock(nn.Module):
        """Two dilated 1-D convolution stages with a residual connection.

        Each stage is ``Conv1d -> Chomp1d(padding) -> ReLU -> Dropout``. The
        block input is added to the output of the second stage (through a
        1x1 convolution when ``n_inputs != n_outputs``) and passed through a
        final ReLU.

        Args:
            n_inputs: Number of input channels.
            n_outputs: Number of output channels of both convolutions.
            kernel_size: Kernel size of both convolutions.
            stride: Stride of both convolutions.
            dilation: Dilation of both convolutions.
            padding: Padding of both convolutions; also the amount chomped
                after each of them.
            dropout: Dropout probability used after each stage.
        """

        def __init__(self, n_inputs, n_outputs, kernel_size, stride, dilation, padding, dropout=0.2):
            super().__init__()
            conv_kwargs = dict(stride=stride, padding=padding, dilation=dilation)

            # Stage 1: n_inputs -> n_outputs channels.
            self.conv1 = nn.Conv1d(n_inputs, n_outputs, kernel_size, **conv_kwargs)
            self.chomp1 = Chomp1d(padding)
            self.relu1 = nn.ReLU()
            self.dropout1 = nn.Dropout(dropout)

            # Stage 2: n_outputs -> n_outputs channels.
            self.conv2 = nn.Conv1d(n_outputs, n_outputs, kernel_size, **conv_kwargs)
            self.chomp2 = Chomp1d(padding)
            self.relu2 = nn.ReLU()
            self.dropout2 = nn.Dropout(dropout)

            self.net = nn.Sequential(
                self.conv1, self.chomp1, self.relu1, self.dropout1,
                self.conv2, self.chomp2, self.relu2, self.dropout2,
            )

            # A 1x1 convolution matches channel counts on the residual path;
            # it is only created when the counts actually differ.
            if n_inputs != n_outputs:
                self.downsample = nn.Conv1d(n_inputs, n_outputs, 1)
            else:
                self.downsample = None
            self.relu = nn.ReLU()
            self.init_weights()

        def init_weights(self):
            """Re-draw every convolution weight from a normal with mean 0 and std 0.01."""
            convs = [self.conv1, self.conv2]
            if self.downsample is not None:
                convs.append(self.downsample)
            for conv in convs:
                conv.weight.data.normal_(0, 0.01)

        def forward(self, x):
            """Apply both stages, add the residual path and return the ReLU of the sum."""
            hidden = self.net(x)
            if self.downsample is None:
                shortcut = x
            else:
                shortcut = self.downsample(x)
            return self.relu(hidden + shortcut)

    class TCN(nn.Module):
        """Stack of TemporalBlocks followed by two linear layers.

        Block ``i`` uses dilation ``2 ** i`` and padding
        ``(kernel_size - 1) * 2 ** i``. The output at the last index of the
        third dimension is fed through ``Linear(num_channels[-1], 64)`` and
        then ``Linear(64, output_size)``.

        Args:
            input_size: Number of input channels of the first block.
            output_size: Size of the final linear layer's output.
            num_channels: Output channel count of each block, in order.
            kernel_size: Kernel size shared by all blocks.
            dropout: Dropout probability shared by all blocks.
        """

        def __init__(self, input_size, output_size, num_channels, kernel_size, dropout):
            super().__init__()

            blocks = []
            in_channels = input_size
            for level, out_channels in enumerate(num_channels):
                dilation = 2 ** level
                blocks.append(
                    TemporalBlock(
                        in_channels,
                        out_channels,
                        kernel_size=kernel_size,
                        stride=1,
                        dilation=dilation,
                        padding=(kernel_size - 1) * dilation,
                        dropout=dropout,
                    )
                )
                # The next block consumes what this one produces.
                in_channels = out_channels
            self.tcn = nn.Sequential(*blocks)

            self.fc = nn.Linear(num_channels[-1], 64)  # intermediate fully connected layer
            self.linear = nn.Linear(64, output_size)  # output layer

        def forward(self, x):
            """Run the block stack and map its last-index output to ``output_size`` values."""
            features = self.tcn(x)
            last_step = features[:, :, -1]  # keep only the final index of dim 2
            return self.linear(self.fc(last_step))

Now I am trying to tune the hyperparameters (num_channels, kernel_size, dropout, learning_rate, batch_size, etc.) using Optuna together with DDP from Lightning Fabric. I am using 2 GPUs, but I am getting the following error:
RuntimeError: DDP expects the same model across all ranks, but Rank 1 has 22 params, while rank 0 has inconsistent 18 params.

What is going on here? Any suggestions please?