nebuly-ai/optimate

[Chatllama] How to reduce the CUDA memory consumption of a LLaMA 7B model

balcklive opened this issue · 0 comments

As I mentioned in this issue: #314
I am training a LLaMA 7B model on 8 V100s, so in total I have 8 × 32 GB of GPU memory. But, as you can tell from the details in that issue, I still end up with a CUDA out of memory error.
Can anybody tell me if there is any other way to reduce the model's memory consumption on the GPU?
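For example, I was wondering whether something like activation (gradient) checkpointing would help here. A rough sketch of what I mean, assuming the model is a standard Hugging Face LlamaForCausalLM (I am not sure this matches how Chatllama actually loads the model, and the checkpoint path is just a placeholder):

```python
from transformers import LlamaForCausalLM

# Assumption: the model is a Hugging Face LlamaForCausalLM; the path below is a
# placeholder, not Chatllama's real loading code.
model = LlamaForCausalLM.from_pretrained("path/to/llama-7b")

# Recompute activations during the backward pass instead of keeping them all in
# GPU memory, trading extra compute for a large reduction in activation memory.
model.gradient_checkpointing_enable()
```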
My DeepSpeed config.json file is below:
```json
{
  "gradient_accumulation_steps": 1,
  "optimizer": {
    "type": "Adam",
    "params": {
      "lr": 0.00015
    }
  },
  "zero_force_ds_cpu_optimizer": false,
  "zero_optimization": {
    "stage": 3,
    "contiguous_gradients": true,
    "stage3_max_live_parameters": 0,
    "stage3_max_reuse_distance": 0,
    "stage3_prefetch_bucket_size": 0,
    "stage3_param_persistence_threshold": 1e2,
    "reduce_bucket_size": 1e2,
    "sub_group_size": 1e8,
    "offload_optimizer": {
      "device": "cpu",
      "pin_memory": true
    },
    "offload_param": {
      "device": "cpu",
      "pin_memory": true
    },
    "stage3_gather_16bit_weights_on_model_save": true
  },
  "fp16": {
    "enabled": true,
    "auto_cast": false,
    "loss_scale": 0,
    "initial_scale_power": 32,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "train_batch_size": 8,
  "train_micro_batch_size_per_gpu": 1,
  "wall_clock_breakdown": false
}
```
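For completeness, this is roughly how the config gets wired into training. A minimal sketch only: the tiny Linear layer and the random tensor stand in for the real LLaMA 7B module and my actual data pipeline.

```python
import torch
import deepspeed

# Placeholder model standing in for the LLaMA 7B module.
model = torch.nn.Linear(1024, 1024)

engine, optimizer, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config="config.json",  # the DeepSpeed config shown above
)

# Placeholder batch standing in for the real training data.
batch = torch.randn(1, 1024, device=engine.device, dtype=torch.half)
loss = engine(batch).float().mean()  # forward pass through the DeepSpeed engine
engine.backward(loss)                # DeepSpeed handles fp16 loss scaling
engine.step()                        # optimizer step + ZeRO-3 partitioning bookkeeping
```

This is launched with the DeepSpeed launcher (e.g. `deepspeed --num_gpus 8 train.py`) so that the world size of 8 matches train_batch_size = 8 with train_micro_batch_size_per_gpu = 1 and gradient_accumulation_steps = 1.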