vahidk/tfrecord

num_workers affects the number of training steps

SunnyGhj opened this issue · 1 comments

class Data:
    """Builds a PyTorch DataLoader over gzip-compressed TFRecord files.

    NOTE(review): as reported in this issue, running the DataLoader with
    ``num_workers > 0`` while the TFRecord dataset has ``index_pattern=None``
    makes every worker process iterate the FULL dataset (tfrecord cannot
    shard without index files), so the number of steps per epoch is
    multiplied by the worker count. Generate an index file per TFRecord
    file and pass its pattern via ``index_pattern`` for correct sharding.
    """

    def __init__(self, config: "DataConfig"):
        # Annotation is quoted (lazy) so this module can be imported even
        # when the project-local DataConfig symbol is not in scope.
        self.config = config

    def read_and_decode(self, batch):
        """Collate a list of raw records into ``(features, labels)`` tensors.

        Args:
            batch: list of dicts with keys ``'labels'`` (columns 1-3 are
                click/add/order; column 0 is ignored here — presumably a
                sample id or weight, TODO confirm) and ``'features'`` (a flat
                numeric vector whose layout is described by
                ``config.fea_config['features']``).

        Returns:
            Tuple ``(features, labels)`` of dicts of torch tensors. Each
            feature is sliced out of the flat vector using its configured
            ``size`` and keyed by its ``prefix``.
        """
        raw_labels = np.array([raw['labels'] for raw in batch])
        raw_feats = np.array([raw['features'] for raw in batch])
        labels = {
            # Columns 1..3 of the label matrix; column 0 is not used.
            'click_label': torch.as_tensor(data=raw_labels[:, 1], dtype=torch.float),
            'add_label': torch.as_tensor(data=raw_labels[:, 2], dtype=torch.float),
            'order_label': torch.as_tensor(data=raw_labels[:, 3], dtype=torch.float),
        }
        offset, features = 0, {}
        for feat in self.config.fea_config['features']:
            feat_name = feat['prefix']
            feat_size = feat['size']
            # Slice this feature's contiguous span out of the flat vector.
            features[feat_name] = torch.as_tensor(
                data=raw_feats[:, offset:offset + feat_size], dtype=torch.int)
            offset += feat_size
        return features, labels

    def data_input_fn_torch(self, index_pattern=None, num_workers=16):
        """Create the training DataLoader.

        Args:
            index_pattern: pattern for per-file tfrecord index files.
                REQUIRED for correct sharding when ``num_workers > 0``;
                with ``None`` every worker reads the entire dataset and
                the step count is multiplied by ``num_workers`` (the bug
                reported in this issue).
            num_workers: number of DataLoader worker processes. The default
                preserves the original behavior.

        Returns:
            A ``torch.utils.data.DataLoader`` yielding ``(features, labels)``
            batches produced by :meth:`read_and_decode`.
        """
        import warnings
        if num_workers > 0 and index_pattern is None:
            # Surface the duplication problem instead of silently training
            # on num_workers copies of each record.
            warnings.warn(
                "num_workers > 0 with index_pattern=None: each worker will "
                "read the full dataset and records will be duplicated. "
                "Create an index file per TFRecord file and pass "
                "index_pattern for correct sharding.")
        # Equal sampling weight for every data file.
        splits = {a: 1 for a in self.config.data_file}
        print(splits)
        record = MultiTFRecordDataset(data_pattern='{}',
                                      compression_type='gzip',
                                      splits=splits,
                                      index_pattern=index_pattern,
                                      shuffle_queue_size=self.config.shuffle_pool,
                                      infinite=False)
        dataloader = DataLoader(dataset=record,
                                # shuffle=True,
                                num_workers=num_workers,
                                prefetch_factor=2,
                                pin_memory=True,
                                batch_size=self.config.batch_size,
                                collate_fn=self.read_and_decode,
                                drop_last=True,
                                pin_memory_device='cuda:0')
        return dataloader

The number of training steps per epoch grows in direct proportion to num_workers, which is obviously wrong — the step count should not depend on the worker count.

It's recommended to create an index file for each TFRecord file. An index file must be provided when using multiple workers; otherwise the loader may return duplicate records, since each worker reads the full dataset instead of its own shard.