vahidk/tfrecord

num_workers affects the number of training steps

SunnyGhj opened this issue · 1 comments

class Data:
    """Builds a PyTorch DataLoader over gzip-compressed TFRecord files.

    NOTE(review): as reported in this issue, running the DataLoader with
    ``num_workers > 0`` while the TFRecord dataset has ``index_pattern=None``
    makes every worker process iterate the FULL dataset (tfrecord cannot
    shard without index files), so the number of steps per epoch is
    multiplied by the worker count. Generate an index file per TFRecord
    file and pass its pattern via ``index_pattern`` for correct sharding.
    """

    def __init__(self, config: "DataConfig"):
        # Annotation is quoted (lazy) so this module can be imported even
        # when the project-local DataConfig symbol is not in scope.
        self.config = config

    def read_and_decode(self, batch):
        """Collate a list of raw records into ``(features, labels)`` tensors.

        Args:
            batch: list of dicts with keys ``'labels'`` (columns 1-3 are
                click/add/order; column 0 is ignored here — presumably a
                sample id or weight, TODO confirm) and ``'features'`` (a flat
                numeric vector whose layout is described by
                ``config.fea_config['features']``).

        Returns:
            Tuple ``(features, labels)`` of dicts of torch tensors. Each
            feature is sliced out of the flat vector using its configured
            ``size`` and keyed by its ``prefix``.
        """
        raw_labels = np.array([raw['labels'] for raw in batch])
        raw_feats = np.array([raw['features'] for raw in batch])
        labels = {
            # Columns 1..3 of the label matrix; column 0 is not used.
            'click_label': torch.as_tensor(data=raw_labels[:, 1], dtype=torch.float),
            'add_label': torch.as_tensor(data=raw_labels[:, 2], dtype=torch.float),
            'order_label': torch.as_tensor(data=raw_labels[:, 3], dtype=torch.float),
        }
        offset, features = 0, {}
        for feat in self.config.fea_config['features']:
            feat_name = feat['prefix']
            feat_size = feat['size']
            # Slice this feature's contiguous span out of the flat vector.
            features[feat_name] = torch.as_tensor(
                data=raw_feats[:, offset:offset + feat_size], dtype=torch.int)
            offset += feat_size
        return features, labels

    def data_input_fn_torch(self, index_pattern=None, num_workers=16):
        """Create the training DataLoader.

        Args:
            index_pattern: pattern for per-file tfrecord index files.
                REQUIRED for correct sharding when ``num_workers > 0``;
                with ``None`` every worker reads the entire dataset and
                the step count is multiplied by ``num_workers`` (the bug
                reported in this issue).
            num_workers: number of DataLoader worker processes. The default
                preserves the original behavior.

        Returns:
            A ``torch.utils.data.DataLoader`` yielding ``(features, labels)``
            batches produced by :meth:`read_and_decode`.
        """
        import warnings
        if num_workers > 0 and index_pattern is None:
            # Surface the duplication problem instead of silently training
            # on num_workers copies of each record.
            warnings.warn(
                "num_workers > 0 with index_pattern=None: each worker will "
                "read the full dataset and records will be duplicated. "
                "Create an index file per TFRecord file and pass "
                "index_pattern for correct sharding.")
        # Equal sampling weight for every data file.
        splits = {a: 1 for a in self.config.data_file}
        print(splits)
        record = MultiTFRecordDataset(data_pattern='{}',
                                      compression_type='gzip',
                                      splits=splits,
                                      index_pattern=index_pattern,
                                      shuffle_queue_size=self.config.shuffle_pool,
                                      infinite=False)
        dataloader = DataLoader(dataset=record,
                                # shuffle=True,
                                num_workers=num_workers,
                                prefetch_factor=2,
                                pin_memory=True,
                                batch_size=self.config.batch_size,
                                collate_fn=self.read_and_decode,
                                drop_last=True,
                                pin_memory_device='cuda:0')
        return dataloader

The number of training steps per epoch grows in direct proportion to num_workers, which is obviously wrong — the step count should not depend on the worker count.

It's recommended to create an index file for each TFRecord file. An index file must be provided when using multiple workers; otherwise the loader may return duplicate records, since each worker reads the full dataset instead of its own shard.