Training Script Error: AttributeError: 'ChameleonRMSNorm' object has no attribute 'reset_parameters'. Did you mean: 'get_parameter'?
[rank2]: Traceback (most recent call last):
[rank2]: File "/mnt/Lumina-mGPT/lumina_mgpt/finetune_solver.py", line 113, in <module>
[rank2]: solver = Solver(args)
[rank2]: File "/mnt/Lumina-mGPT/xllmx/solvers/finetune/finetune.py", line 97, in __init__
[rank2]: self.model, self.tokenizer, self.optimizer = self.build_model()
[rank2]: File "/mnt/hwfile/mllm/sunzeyi/Lumina-mGPT/xllmx/solvers/finetune/finetune.py", line 335, in build_model
[rank2]: model = self.setup_fsdp_sync(
[rank2]: File "/mnt/Lumina-mGPT/xllmx/solvers/finetune/finetune.py", line 373, in setup_fsdp_sync
[rank2]: model = FSDP(
[rank2]: File "/mnt/miniconda3/envs/lumina_mgpt/lib/python3.10/site-packages/torch/distributed/fsdp/fully_sharded_data_parallel.py", line 485, in __init__
[rank2]: _auto_wrap(
[rank2]: File "/mnt/miniconda3/envs/lumina_mgpt/lib/python3.10/site-packages/torch/distributed/fsdp/_wrap_utils.py", line 101, in _auto_wrap
[rank2]: _recursive_wrap(**recursive_wrap_kwargs, **root_kwargs) # type: ignore[arg-type]
[rank2]: AttributeError: 'ChameleonRMSNorm' object has no attribute 'reset_parameters'. Did you mean: 'get_parameter'?
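For context on why this fails: when FSDP materializes modules that were constructed on the meta device and no param_init_fn is supplied, it calls each wrapped module's reset_parameters(), and ChameleonRMSNorm (like most RMSNorm implementations) does not define that method. Below is a minimal workaround sketch, separate from the fix the maintainers suggest next, assuming ChameleonRMSNorm follows the usual RMSNorm layout with a single weight parameter initialized to ones; the import path assumes the class comes from Hugging Face transformers, so if Lumina-mGPT ships its own copy of the Chameleon modeling code, patch that class instead.

```python
import torch
from transformers.models.chameleon.modeling_chameleon import ChameleonRMSNorm


# Workaround sketch (assumption: standard RMSNorm layout): give ChameleonRMSNorm
# the reset_parameters() hook that FSDP's meta-device materialization expects.
# RMSNorm's only parameter, `weight`, is initialized to ones, so re-initializing
# it to ones is a safe default. Apply the patch before building/wrapping the model.
def _rmsnorm_reset_parameters(self) -> None:
    torch.nn.init.ones_(self.weight)


ChameleonRMSNorm.reset_parameters = _rmsnorm_reset_parameters
```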
Please try changing setup_fsdp_sync to the following. We will push a fix very soon.
def setup_fsdp_sync(self, model: nn.Module, data_parallel: str, precision: str, grad_precision: Optional[str]) -> FSDP:
    # Drop-in replacement for Solver.setup_fsdp_sync in xllmx/solvers/finetune/finetune.py.
    # Rank 0 holds the real (loaded) weights, so it needs no special init; the other
    # ranks only allocate empty GPU storage via to_empty(), which sync_module_states
    # then fills by broadcasting from rank 0. Supplying param_init_fn also bypasses
    # FSDP's default meta-device materialization, which calls reset_parameters() on
    # every wrapped module (the source of the error above).
    if self.dp_rank == 0:
        param_init_fn = None
    else:
        param_init_fn = lambda x: x.to_empty(device=torch.cuda.current_device(), recurse=False)
    model = FSDP(
        model,
        auto_wrap_policy=functools.partial(
            lambda_auto_wrap_policy,
            lambda_fn=lambda m: m in model.get_fsdp_wrap_module_list(),
        ),
        process_group=fs_init.get_data_parallel_group(),
        sharding_strategy={
            "fsdp": ShardingStrategy.FULL_SHARD,
            "sdp": ShardingStrategy.SHARD_GRAD_OP,
        }[data_parallel],
        mixed_precision=MixedPrecision(
            param_dtype={
                "fp32": torch.float,
                "tf32": torch.float,
                "bf16": torch.bfloat16,
                "fp16": torch.float16,
            }[precision],
            reduce_dtype={
                "fp32": torch.float,
                "tf32": torch.float,
                "bf16": torch.bfloat16,
                "fp16": torch.float16,
            }[grad_precision or precision],
        ),
        device_id=torch.cuda.current_device(),
        sync_module_states=True,
        limit_all_gathers=True,
        use_orig_params=True,
        param_init_fn=param_init_fn,
    )
    torch.cuda.synchronize()
    return model
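A hypothetical call site for the patched method, using only the argument keys the method itself accepts; the concrete values below are illustrative, not the repo's actual configuration.

```python
# Hypothetical usage sketch inside build_model(): wrap the assembled model.
# "sdp" selects SHARD_GRAD_OP and "fsdp" selects FULL_SHARD; the precision keys
# follow the dicts inside setup_fsdp_sync. Values here are illustrative only.
model = self.setup_fsdp_sync(
    model,
    data_parallel="sdp",
    precision="bf16",
    grad_precision="fp32",  # falls back to `precision` when None
)
```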
thanks!
Thank you, I tried this solution, but the problem still exists. Any other solutions?
It works, thank you for the help. Make sure model_parallel_size=1.
But in that case, will the model still be split across multiple GPUs?