Pretraining fails with "no valid `self._rcvd_idx` is found" (StopIteration)
yt7589 opened this issue · 3 comments
yt7589 commented
Following the Pretrain section, I tried to run pretraining. First I prepared a dataset text file at ./datasets/zjkj/zjkj_wd.jsonl with the following content:
{"id": 1, "dataType": "百科", "title": "转睛科技的业务", "content": "上海转睛科技有限公司,是一家以视频分析为核心、专业从事自动驾驶车路协同及其关联技术的公司。"}
Then I defined a new corpus in data_utils/corpora.py:
```python
class ZjkjCorpus(PromptReader):
    PATH = "./datasets/zjkj"
    is_json = False
    reserve_punct = True
    split_row = True

    def process_line(self, item, tokenizer, tokenize):
        item = json.loads(item)
        prompts, texts = [], []
        text = ""
        title = item.get("title", None)
        content = item.get("content", None)
        if title:
            text += title.strip() + " "
        if content:
            text += content
        if len(text) > 100:
            prompt, text = self.process_sample("", tokenizer, tokenize), self.process_sample(text, tokenizer, tokenize)
            prompts.append(prompt)
            texts.append(text)
        return prompts, texts


NAMED_CORPORA = {
    ......,
    'zjkj': ZjkjCorpus
}
```
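Note that `process_line` keeps a record only when the combined title plus content exceeds 100 characters, so the effective sample count can be below the 18 entries in the file. A standalone sketch that mirrors this filter (same JSONL path as above; this is not GLM code):

```python
import json

kept = 0
with open("./datasets/zjkj/zjkj_wd.jsonl", encoding="utf-8") as f:
    for line in f:
        item = json.loads(line)
        text = ""
        if item.get("title"):
            text += item["title"].strip() + " "
        if item.get("content"):
            text += item["content"]
        if len(text) > 100:  # same threshold as ZjkjCorpus.process_line
            kept += 1
print(f"{kept} records survive the length filter")
```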
Pretraining launches normally with `bash scripts/ds_pretrain_nvidia.sh config/ds_block_large.sh`, but after about 200 epochs (my dataset has only 18 entries) it fails while fetching a batch from the dataset; with `args.iterations=250` it raises the following error:
```
│ /mnt/10t/yantao/adev/a00/GLM/pretrain_glm.py:682 in <module> │
│ │
│ 679 │
│ 680 │
│ 681 if __name__ == "__main__": │
│ ❱ 682 │ main() │
│ 683 │
│ │
│ /mnt/10t/yantao/adev/a00/GLM/pretrain_glm.py:655 in main │
│ │
│ 652 │ │ │ │ │ save_checkpoint(args_.iteration, model_, optimizer_, lr_scheduler_, │
│ 653 │ │ │ │ │
│ 654 │ │ │ │ # stack.callback(save_on_exit, args, model, optimizer, lr_scheduler) │
│ ❱ 655 │ │ │ │ iteration, skipped = train(model, optimizer, │
│ 656 │ │ │ │ │ │ │ │ │ │ lr_scheduler, │
│ 657 │ │ │ │ │ │ │ │ │ │ (train_data_iterator, multi_train_iterator), │
│ 658 │ │ │ │ │ │ │ │ │ │ (val_data_iterator, multi_val_iterator), │
│ │
│ /mnt/10t/yantao/adev/a00/GLM/pretrain_glm.py:340 in train │
│ │
│ 337 │ mems = [] │
│ 338 │ while args.iteration < args.train_iters: │
│ 339 │ │ │
│ ❱ 340 │ │ lm_loss, skipped_iter, mems = train_step(train_data_iterator, │
│ 341 │ │ │ │ │ │ │ │ │ │ │ │ model, │
│ 342 │ │ │ │ │ │ │ │ │ │ │ │ optimizer, │
│ 343 │ │ │ │ │ │ │ │ │ │ │ │ lr_scheduler, │
│ │
│ /mnt/10t/yantao/adev/a00/GLM/train_utils.py:335 in train_step │
│ │
│ 332 │ │ skipped_iter, complete = 0, False │
│ 333 │ │ # Forward model for one step. │
│ 334 │ │ timers('forward').start() │
│ ❱ 335 │ │ lm_loss, mems, _ = forward_step_func(data_iterator, model, args, timers, mems) │
│ 336 │ │ timers('forward').stop() │
│ 337 │ │ # print_rank_0("Forward step") │
│ 338 │ │ if not args.deepspeed: │
│ │
│ /mnt/10t/yantao/adev/a00/GLM/pretrain_glm.py:222 in forward_step │
│ │
│ 219 │ │ data = next(data_iterator[1]) if data_iterator[1] else None │
│ 220 │ │ data["mode"] = "multi-task" │
│ 221 │ else: │
│ ❱ 222 │ │ data = next(data_iterator[0]) if data_iterator[0] else None │
│ 223 │ # print_rank_0("data iterator") │
│ 224 │ timers('data loader').stop() │
│ 225 │ tokens, labels, loss_mask, attention_mask, position_ids = get_batch(data, args) │
│ │
│ /mnt/10t/yantao/software/anaconda3/envs/ssp/lib/python3.8/site-packages/torch/utils/data/dataloa │
│ der.py:681 in __next__ │
│ │
│ 678 │ │ │ if self._sampler_iter is None: │
│ 679 │ │ │ │ # TODO(https://github.com/pytorch/pytorch/issues/76750) │
│ 680 │ │ │ │ self._reset() # type: ignore[call-arg] │
│ ❱ 681 │ │ │ data = self._next_data() │
│ 682 │ │ │ self._num_yielded += 1 │
│ 683 │ │ │ if self._dataset_kind == _DatasetKind.Iterable and \ │
│ 684 │ │ │ │ │ self._IterableDataset_len_called is not None and \ │
│ │
│ /mnt/10t/yantao/software/anaconda3/envs/ssp/lib/python3.8/site-packages/torch/utils/data/dataloa │
│ der.py:1349 in _next_data │
│ │
│ 1346 │ │ │ │ # no valid `self._rcvd_idx` is found (i.e., didn't break) │
│ 1347 │ │ │ │ if not self._persistent_workers: │
│ 1348 │ │ │ │ │ self._shutdown_workers() │
│ ❱ 1349 │ │ │ │ raise StopIteration │
│ 1350 │ │ │ │
│ 1351 │ │ │ # Now `self._rcvd_idx` is the batch index we want to fetch │
│ 1352
```
My deepspeed version is 0.9.0, PyTorch is 1.12, and GLM is the latest master branch. Configuration: NUM_WORKERS=1, NUM_GPUS_PER_WORKER=1, MP_SIZE=1.
What is causing this, and how can I fix it?
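For context, the `StopIteration` at `dataloader.py:1349` means the PyTorch DataLoader ran out of batches while `forward_step` kept calling `next()` on it. One common pattern in training loops that draw batches with `next()` is an iterator that restarts the epoch instead of running dry; a minimal sketch (the name `infinite_iterator` is hypothetical, this is not a patch from the GLM repo):

```python
def infinite_iterator(dataloader):
    """Yield batches forever, re-entering the DataLoader at each epoch boundary."""
    while True:
        for batch in dataloader:
            yield batch

# Usage sketch: wrap the loader before the training loop so that
# next(train_data_iterator) never raises StopIteration.
# train_data_iterator = infinite_iterator(train_loader)
```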
1160300419 commented
I ran into the same problem.
chenbintao-tal commented
+1