num_workers affects the number of train steps
SunnyGhj opened this issue · 1 comment
SunnyGhj commented
```python
import numpy as np
import torch
from torch.utils.data import DataLoader
from tfrecord.torch.dataset import MultiTFRecordDataset


class Data:
    def __init__(self, config: DataConfig):
        self.config = config

    def read_and_decode(self, batch):
        # Collate: stack the raw label/feature arrays from the worker batch.
        raw_labels = np.array([raw['labels'] for raw in batch])
        raw_feats = np.array([raw['features'] for raw in batch])
        click_label = torch.as_tensor(data=raw_labels[:, 1], dtype=torch.float)
        add_label = torch.as_tensor(data=raw_labels[:, 2], dtype=torch.float)
        order_label = torch.as_tensor(data=raw_labels[:, 3], dtype=torch.float)
        labels = {'click_label': click_label, 'add_label': add_label, 'order_label': order_label}
        # Slice the flat feature vector into one tensor per configured feature.
        offset, features = 0, {}
        fea_config = self.config.fea_config
        for feat in fea_config['features']:
            feat_name = feat['prefix']
            feat_size = feat['size']
            features[feat_name] = torch.as_tensor(data=raw_feats[:, offset:offset + feat_size], dtype=torch.int)
            offset += feat_size
        return features, labels

    def data_input_fn_torch(self):
        # Equal sampling weight for every TFRecord file.
        splits = {a: 1 for a in self.config.data_file}
        print(splits)
        record = MultiTFRecordDataset(data_pattern='{}',
                                      compression_type='gzip',
                                      splits=splits,
                                      index_pattern=None,
                                      shuffle_queue_size=self.config.shuffle_pool,
                                      infinite=False)
        dataloader = DataLoader(dataset=record,
                                # shuffle=True,
                                num_workers=16,
                                prefetch_factor=2,
                                pin_memory=True,
                                batch_size=self.config.batch_size,
                                collate_fn=self.read_and_decode,
                                drop_last=True,
                                pin_memory_device='cuda:0')
        return dataloader
```
The larger num_workers is, the more training steps I get per epoch; the step count is roughly proportional to num_workers, which is obviously wrong. It looks as if every DataLoader worker reads the full dataset instead of its own shard.
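For context, this matches the standard behavior of iterable-style datasets in PyTorch: each worker process receives its own copy of the dataset, so unless the dataset shards itself per worker, every record is yielded num_workers times. A minimal, self-contained sketch of the mechanism (independent of TFRecord, names are illustrative only):

```python
import torch
from torch.utils.data import IterableDataset, DataLoader

class Range(IterableDataset):
    # A naive iterable dataset with no per-worker sharding.
    def __iter__(self):
        return iter(range(8))

# With 2 workers, each worker iterates the full range, so the loader
# yields 16 items instead of 8 -- the same effect that inflates the
# step count reported above.
loader = DataLoader(Range(), num_workers=2, batch_size=4)
print(sum(batch.numel() for batch in loader))  # 16
```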
SunnyGhj commented
It's recommended to create an index file for each TFRecord file. An index file must be provided when using multiple workers; otherwise the loader may return duplicate records.
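A minimal sketch of that fix, assuming the `create_index` helper from `tfrecord.tools.tfrecord2idx` (the same tool is also available on the command line as `python3 -m tfrecord.tools.tfrecord2idx <tfrecord path> <index path>`); the `.index` suffix is just a naming convention chosen here, `config` is the same DataConfig instance as above, and whether the indexer handles gzip-compressed files should be verified against the library version in use:

```python
from tfrecord.tools.tfrecord2idx import create_index
from tfrecord.torch.dataset import MultiTFRecordDataset

# 1) Build one index file per TFRecord (run once, offline).
for data_file in config.data_file:
    create_index(data_file, data_file + '.index')

# 2) Point index_pattern at the generated files so each DataLoader
#    worker reads its own shard instead of a full copy of every file.
record = MultiTFRecordDataset(data_pattern='{}',
                              index_pattern='{}.index',
                              splits={f: 1 for f in config.data_file},
                              compression_type='gzip',
                              shuffle_queue_size=config.shuffle_pool,
                              infinite=False)
```

With the index in place, the number of steps per epoch should no longer scale with num_workers.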