THUDM/CogView

CUDA Memory Error when finetuning the cogview-caption model

yukeyiniupi opened this issue · 2 comments

I run the model on a machine with 8 NVIDIA Tesla V100 GPUs (16 GB memory each).
Here is my script: cogview
#! /bin/bash
# Finetune the CogView caption model via the DeepSpeed launcher.
# Usage: ./this_script.sh [extra pretrain_gpt2.py options]
# Requires: deepspeed on PATH, ds_config_zero.json next to this script,
#           pretrained weights under pretrained/cogview/cogview-caption/.

set -u

# Change for multinode config
NUM_WORKERS=1
NUM_GPUS_PER_WORKER=8
MP_SIZE=1

# Resolve paths relative to this script so it works from any CWD.
script_path=$(realpath "$0")
script_dir=$(dirname "$script_path")
main_dir=$(dirname "$script_dir")

# NOTE(review): the original script assigned a full IB/socket tuning string
# here and then immediately overwrote it below, so those settings never took
# effect. Kept commented for reference; uncomment (and merge) if your cluster
# needs InfiniBand / specific NIC settings.
# OPTIONS_NCCL="NCCL_DEBUG=info NCCL_IB_DISABLE=0 NCCL_SOCKET_IFNAME=bond0 NCCL_IB_GID_INDEX=3 NCCL_NET_GDR_LEVEL=0"
OPTIONS_NCCL="NCCL_DEBUG=info"
HOST_FILE_PATH="hostfile_single"

config_json="$script_dir/ds_config_zero.json"

# Model/training options passed through to pretrain_gpt2.py.
gpt_options="
--experiment-name cogview-caption
--img-tokenizer-num-tokens 8192
--dataset-type CompactBinaryDataset
--model-parallel-size ${MP_SIZE}
--num-layers 48
--hidden-size 2560
--num-attention-heads 40
--save $main_dir/data/checkpoints
--train-iters 200
--resume-dataloader
--train-data ./data/merge.bin
--split 949,50,1
--distributed-backend nccl
--lr-decay-style constant
--warmup .1
--load pretrained/cogview/cogview-caption/
--finetune
--checkpoint-activations
--deepspeed-activation-checkpointing
--max-position-embeddings 1089
--max-memory-length 0
--fp16
--txt-loss-scale 5
"

gpt_options="${gpt_options}
--deepspeed
--deepspeed_config ${config_json}
"

# eval is required so the NCCL env-var prefix and the multi-line option
# string are re-split into words before launching.
run_cmd="${OPTIONS_NCCL} deepspeed --num_nodes ${NUM_WORKERS} --num_gpus ${NUM_GPUS_PER_WORKER} --hostfile ${HOST_FILE_PATH} pretrain_gpt2.py $* ${gpt_options}"
echo "${run_cmd}"
eval "${run_cmd}"

DeepSpeed configuration (ds_config_zero.json):
{
"train_micro_batch_size_per_gpu": 1,
"gradient_accumulation_steps": 1,
"steps_per_print": 1,
"gradient_clipping": 0.1,
"zero_optimization": {
"stage":2,
"cpu_offload": false,
"contiguous_gradients": false,
"overlap_comm": true,
"reduce_scatter": true,
"reduce_bucket_size": 100000000,
"allgather_bucket_size": 1000000000
},
"zero_allow_untested_optimizer": true,
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 400,
"hysteresis": 2,
"min_loss_scale": 1
},
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00005,
"betas": [
0.9,
0.95
],
"eps": 1e-8,
"weight_decay": 4e-2
}
},
"activation_checkpointing": {
"partition_activations": false,
"contiguous_memory_optimization": false
},
"wall_clock_breakdown": false
}
@Sleepychord

你的bin文件怎么生成的啊,我四个16g v100 也爆了

@dh12306 可以加我下qq咱们私聊下1362212294

Hello, would you please tell me how to organize the raw text-image dataset and then how to use the cogdata toolkit to generate the target bin file?