Keep-rate scheduling of DropBlock in a multi-GPU environment
kyleahn opened this issue · 2 comments
Hello,
I found an issue while trying to train your model.
In your code, the variable 'self.num_batches_tracked' is supposed to track training progress by being incremented every time the model is called.
But in a multi-GPU environment, the modification made inside forward() is lost, because DataParallel replicates the model onto each GPU and the replicas (together with any updates to their attributes) are discarded after forward(). So the variable just oscillates between 0 and 1 instead of growing.
I think this should be fixed. Thanks :)
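A minimal sketch of what goes wrong (the Counter module here is only an illustration, not the repo's code, and it assumes at least two CUDA devices are available):

import torch
import torch.nn as nn

class Counter(nn.Module):
    def __init__(self):
        super().__init__()
        self.num_batches_tracked = 0   # plain Python attribute, not a registered buffer

    def forward(self, x):
        self.num_batches_tracked += 1  # the update only happens on the per-forward replica
        return x

model = nn.DataParallel(Counter().cuda())
for _ in range(5):
    model(torch.randn(8, 3).cuda())

# With two or more GPUs the replicas are thrown away after each forward,
# so the counter on the wrapped module is still 0 here.
print(model.module.num_batches_tracked)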
Thank you for pointing this out. I will try to fix this problem.
This effectively deactivates the DropBlock regularization when using multiple GPUs, since keep_rate never decays from its initial value. State variables that are modified in the forward pass need to be registered as buffers, because DataParallel only preserves in-place changes made to registered buffers. Here is the fix:
import torch
import torch.nn as nn
import torch.nn.functional as F

# conv3x3, DropBlock and SELayer are the helpers already defined in the model file.

class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, drop_rate=0.0, drop_block=False,
                 block_size=1, use_se=False):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(inplanes, planes)
        self.bn1 = nn.BatchNorm2d(planes)
        self.relu = nn.LeakyReLU(0.1)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = nn.BatchNorm2d(planes)
        self.conv3 = conv3x3(planes, planes)
        self.bn3 = nn.BatchNorm2d(planes)
        self.maxpool = nn.MaxPool2d(stride)
        self.downsample = downsample
        self.stride = stride
        self.drop_rate = drop_rate
        # self.num_batches_tracked = 0  # DataParallel will only keep changes that were made to registered buffers
        self.register_buffer('num_batches_tracked', torch.tensor(0, dtype=torch.long))
        self.drop_block = drop_block
        self.block_size = block_size
        self.DropBlock = DropBlock(block_size=self.block_size)
        self.use_se = use_se
        if self.use_se:
            self.se = SELayer(planes, 4)

    def forward(self, x):
        self.num_batches_tracked += 1

        residual = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)
        if self.use_se:
            out = self.se(out)

        if self.downsample is not None:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)
        out = self.maxpool(out)

        if self.drop_rate > 0:
            if self.drop_block:
                feat_size = out.size()[2]
                # keep_rate decays linearly from 1.0 to 1.0 - drop_rate over the first 20 * 2000 tracked batches
                keep_rate = max(1.0 - self.drop_rate / (20 * 2000) * self.num_batches_tracked.item(),
                                1.0 - self.drop_rate)
                gamma = (1 - keep_rate) / self.block_size**2 * feat_size**2 / (feat_size - self.block_size + 1)**2
                out = self.DropBlock(out, gamma=gamma)
            else:
                out = F.dropout(out, p=self.drop_rate, training=self.training, inplace=True)

        return out
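Once the buffer is updated correctly, the schedule behaves as intended: keep_rate decays linearly from 1.0 down to 1.0 - drop_rate over the first 20 * 2000 = 40,000 tracked batches and is then clamped, while gamma is the usual DropBlock conversion from keep_rate to the Bernoulli rate for block centres. A quick sketch of the schedule with purely illustrative values (drop_rate=0.1, block_size=5, feat_size=10 are example numbers, not necessarily the repo's defaults):

drop_rate, block_size, feat_size = 0.1, 5, 10  # illustrative values only

for n in (0, 10_000, 40_000, 100_000):
    keep_rate = max(1.0 - drop_rate / (20 * 2000) * n, 1.0 - drop_rate)
    gamma = (1 - keep_rate) / block_size**2 * feat_size**2 / (feat_size - block_size + 1)**2
    print(f"batches={n:>7}  keep_rate={keep_rate:.3f}  gamma={gamma:.5f}")

# keep_rate goes 1.000 -> 0.975 -> 0.900 -> 0.900 (clamped at 1 - drop_rate),
# which is exactly the progression that was frozen at ~1.0 under DataParallel before the fix.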