How to train insgen with only 1 GPU

Thanks for sharing, but I only have 1 GPU, so the model cannot be trained.

I see that the reason multiple GPUs are needed is the ‘effect of disabling shuffle BN to MoCo’.

But I cannot understand why the batch data must be shuffled among all GPUs rather than only within a single GPU.

Could you provide a way to shuffle the batch data on 1 GPU, even if it does not give the same ‘effect’?

I ran this on 2 GPUs and it was much slower than the baseline StyleGAN2, taking nearly twice as long. Then I followed the solution in issue 1 and ran it on Colab. Again, it took twice as long.

If I apply this change I can run on a single GPU:

--- a/train.py
+++ b/train.py
@@ -413,7 +413,7 @@ def subprocess_fn(rank, args, temp_dir):
     dnnlib.util.Logger(file_name=os.path.join(args.run_dir, 'log.txt'), file_mode='a', should_flush=True)
 
     # Init torch.distributed.
-    if args.num_gpus > 1:
+    if args.num_gpus > 0:

The key to the above patch is that even with 1 GPU the following code needs to run to init process groups via the torch.distributed.init_process_group function

insgen/train.py

Lines 406 to 430 in 52bda7c

    
               # Init torch.distributed. 
        
               if args.num_gpus > 1: 
        
                   init_file = os.path.abspath(os.path.join(temp_dir, '.torch_distributed_init')) 
        
                   if os.name == 'nt': 
        
                       init_method = 'file:///' + init_file.replace('\\', '/') 
        
                       torch.distributed.init_process_group(backend='gloo', init_method=init_method, rank=rank, world_size=args.num_gpus) 
        
                   else: 
        
                       init_method = f'file://{init_file}' 
        
                       torch.distributed.init_process_group(backend='nccl', init_method=init_method, rank=rank, world_size=args.num_gpus) 
        
               # Init torch_utils. 
        
               sync_device = torch.device('cuda', rank) if args.num_gpus > 1 else None 
        
               training_stats.init_multiprocessing(rank=rank, sync_device=sync_device) 
        
               if rank != 0: 
        
                   custom_ops.verbosity = 'none' 
        
               # Execute training loop. 
        
               training_loop.training_loop(rank=rank, **args) 
        
           #---------------------------------------------------------------------------- 
        
           class CommaSeparatedList(click.ParamType): 
        
               name = 'list' 
        
               def convert(self, value, param, ctx):

I changed the > condition for brevity.

If I apply this change I can run on a single GPU:
--- a/train.py
+++ b/train.py
@@ -413,7 +413,7 @@ def subprocess_fn(rank, args, temp_dir):
     dnnlib.util.Logger(file_name=os.path.join(args.run_dir, 'log.txt'), file_mode='a', should_flush=True)
 
     # Init torch.distributed.
-    if args.num_gpus > 1:
+    if args.num_gpus > 0:
The key to the above patch is that even with 1 GPU the following code needs to run to init process groups via the torch.distributed.init_process_group function

insgen/train.py

Lines 406 to 430 in 52bda7c

# Init torch.distributed.

if args.num_gpus > 1:

init_file = os.path.abspath(os.path.join(temp_dir, '.torch_distributed_init'))

if os.name == 'nt':

init_method = 'file:///' + init_file.replace('\\', '/')

torch.distributed.init_process_group(backend='gloo', init_method=init_method, rank=rank, world_size=args.num_gpus)

else:

init_method = f'file://{init_file}'

torch.distributed.init_process_group(backend='nccl', init_method=init_method, rank=rank, world_size=args.num_gpus)

# Init torch_utils.

sync_device = torch.device('cuda', rank) if args.num_gpus > 1 else None

training_stats.init_multiprocessing(rank=rank, sync_device=sync_device)

if rank != 0:

custom_ops.verbosity = 'none'

# Execute training loop.

training_loop.training_loop(rank=rank, **args)

#----------------------------------------------------------------------------

class CommaSeparatedList(click.ParamType):

name = 'list'

def convert(self, value, param, ctx):

I changed the > condition for brevity.

This method does not work on my machine. Many different strange errors came up, and I don't know why.

@jkla139 I actually used the copy from https://github.com/Zhendong-Wang/Diffusion-GAN . I cloned from the main branch of this repo and got:

:$ python train.py --gpus=1 ...
...
  File "/home/giles/projects/insgen/training/training_loop.py", line 407, in training_loop
    module = module.module
  File "/home/giles/projects/stylegan2-ada-pytorch/.env/stylegan2-ada-pytorch/lib/python3.8/site-packages/torch/nn/modules/module.py", line 778, in __getattr__
    raise ModuleAttributeError("'{}' object has no attribute '{}'".format(
torch.nn.modules.module.ModuleAttributeError: 'CLHead' object has no attribute 'module'

I tracked this down to the following:

--- a/training/training_loop.py
+++ b/training/training_loop.py
@@ -403,8 +403,6 @@ def training_loop(
             snapshot_data = dict(training_set_kwargs=dict(training_set_kwargs))
             for name, module in [('G', G), ('D', D), ('G_ema', G_ema), ('augment_pipe', augment_pipe), ('D_ema', D_ema), ('DHead', DHead), ('GHead', GHead)]:
                 if module is not None:
-                    if name in ['DHead', 'GHead']:
-                        module = module.module
                     if num_gpus > 1:
                         misc.check_ddp_consistency(module, ignore_regex=r'.*\.w_avg')
                     module = copy.deepcopy(module).eval().requires_grad_(False).cpu()

diffusion-gan doesn't seem to have this, so I just removed those lines. It seems to be working.

@jkla139 I actually used the copy from https://github.com/Zhendong-Wang/Diffusion-GAN . I cloned from the main branch of this repo and got:

:$ python train.py --gpus=1 ...
...
  File "/home/giles/projects/insgen/training/training_loop.py", line 407, in training_loop
    module = module.module
  File "/home/giles/projects/stylegan2-ada-pytorch/.env/stylegan2-ada-pytorch/lib/python3.8/site-packages/torch/nn/modules/module.py", line 778, in __getattr__
    raise ModuleAttributeError("'{}' object has no attribute '{}'".format(
torch.nn.modules.module.ModuleAttributeError: 'CLHead' object has no attribute 'module'

I tracked this down to the following:

--- a/training/training_loop.py
+++ b/training/training_loop.py
@@ -403,8 +403,6 @@ def training_loop(
             snapshot_data = dict(training_set_kwargs=dict(training_set_kwargs))
             for name, module in [('G', G), ('D', D), ('G_ema', G_ema), ('augment_pipe', augment_pipe), ('D_ema', D_ema), ('DHead', DHead), ('GHead', GHead)]:
                 if module is not None:
-                    if name in ['DHead', 'GHead']:
-                        module = module.module
                     if num_gpus > 1:
                         misc.check_ddp_consistency(module, ignore_regex=r'.*\.w_avg')
                     module = copy.deepcopy(module).eval().requires_grad_(False).cpu()

diffusion-gan doesn't seem to have this, so I just removed those lines. It seems to be working.

Yes, these two lines need to be deleted; now it works.

	# Init torch.distributed.
	if args.num_gpus > 1:
	init_file = os.path.abspath(os.path.join(temp_dir, '.torch_distributed_init'))
	if os.name == 'nt':
	init_method = 'file:///' + init_file.replace('\\', '/')
	torch.distributed.init_process_group(backend='gloo', init_method=init_method, rank=rank, world_size=args.num_gpus)
	else:
	init_method = f'file://{init_file}'
	torch.distributed.init_process_group(backend='nccl', init_method=init_method, rank=rank, world_size=args.num_gpus)

	# Init torch_utils.
	sync_device = torch.device('cuda', rank) if args.num_gpus > 1 else None
	training_stats.init_multiprocessing(rank=rank, sync_device=sync_device)
	if rank != 0:
	custom_ops.verbosity = 'none'

	# Execute training loop.
	training_loop.training_loop(rank=rank, **args)

	#----------------------------------------------------------------------------

	class CommaSeparatedList(click.ParamType):
	name = 'list'

	def convert(self, value, param, ctx):