How to prepare Multi Modal Dataset
Opened this issue · 3 comments
Thanks for your amazing work.
Error
[[{{node ParseSingleSequenceExample/ParseSequenceExample/ParseSequenceExampleV2}}]] [Op:IteratorGetNext]
2022-04-15 13:19:55.258721: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at example_parsing_ops.cc:480 : INVALID_ARGUMENT: Name: <unknown>, Key: image/encoded, Index: 0. Number of values != expected. values size: 100 but output shape: []
2022-04-15 13:19:55.259783: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at example_parsing_ops.cc:480 : INVALID_ARGUMENT: Name: <unknown>, Key: image/encoded, Index: 0. Number of values != expected. values size: 100 but output shape: []
2022-04-15 13:19:55.261186: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at example_parsing_ops.cc:480 : INVALID_ARGUMENT: Name: <unknown>, Key: image/encoded, Index: 0. Number of values != expected. values size: 100 but output shape: []
2022-04-15 13:19:55.263445: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at example_parsing_ops.cc:480 : INVALID_ARGUMENT: Name: <unknown>, Key: image/encoded, Index: 0. Number of values != expected. values size: 100 but output shape: []
Using the hmdb example, I was creating a dmvr multimodal dataset with images, audio, text, and labels. However, I am facing the same errors raised by other users earlier (google-research/google-research#925 (comment)). I can also run the provided hmdb video example, but I run into trouble once audio and text are added.
I generated a single tf-record using generate_from_csv.py
script provided in examples by using 4 videos with captions.
Attaching code for
dataset constructor class
from dmvr import tokenizers
import os
from dmvr import modalities
from dmvr import video_dataset
class ToyFactory2(video_dataset.BaseVideoDatasetFactory):
    """Toy DMVR dataset factory combining image, audio, text and label modalities.

    The factory reads a single shard file from `base_dir` and registers the
    four modalities on the builders provided by `BaseVideoDatasetFactory`.
    """

    _SUBSETS = ('train', 'test', 'valid')
    _SPLITS = (1, 2, 3)
    _NUM_CLASSES = 4
    _NUM_SHARDS = {'train': 59, 'test': 39, 'valid': 1}

    def __init__(
            self,
            base_dir: str = 'generated_dataset',
            subset: str = 'valid',
            split: int = 1):
        """Constructor of ToyFactory2.

        Args:
            base_dir: Directory that contains the shard file.
            subset: One of the names in `_SUBSETS`.
            split: One of the values in `_SPLITS`.

        Raises:
            ValueError: If `subset` or `split` is not an allowed value.
        """
        if subset not in ToyFactory2._SUBSETS:
            raise ValueError('Invalid subset "{}". The available subsets are: {}'
                             .format(subset, ToyFactory2._SUBSETS))

        if split not in ToyFactory2._SPLITS:
            raise ValueError('Invalid split "{}". The available splits are: {}'
                             .format(split, ToyFactory2._SPLITS))

        # NOTE: `subset` and `split` are only validated; the shard file name is
        # fixed, so every subset/split currently reads the same single shard.
        super().__init__(
            shards=[os.path.join(base_dir, 'kinetics400_val-00000-of-00001')])

    def _build(self,
               is_training: bool = True,
               # Video related parameters.
               num_frames: int = 32,
               stride: int = 1,
               num_test_clips: int = 1,
               min_resize: int = 256,
               crop_size: int = 224,
               multi_crop: bool = False,
               crop_resize_style: str = 'Inception',
               min_aspect_ratio: float = 0.5,
               max_aspect_ratio: float = 2,
               min_area_ratio: float = 0.08,
               max_area_ratio: float = 1.0,
               zero_centering_image: bool = False,
               color_augmentation: bool = True,
               # Text related parameters.
               max_num_words: int = 16,
               max_context_sentences: int = 1,
               tokenizer: str = 'howto100m_en',
               prepend_bos: bool = False,
               append_eos: bool = False,
               keep_raw_string: bool = False,
               # Audio related parameters.
               num_samples: int = 153600,  # 48000 (Hz) * 32 / 10 (fps)
               audio_stride: int = 1,
               sync_audio_and_image: bool = True,
               # Label related parameters.
               one_hot_label: bool = True,
               output_label_string: bool = False,
               add_label_name: bool = False,
               **kwargs):
        """Default build for this dataset.

        Args:
            is_training: Whether or not in training mode.
            num_frames: Number of frames per subclip. For single images, use 1.
            stride: Temporal stride to sample frames.
            num_test_clips: Number of test clips (1 by default). If more than 1,
                this will sample multiple linearly spaced clips within each
                video at test time. If 1, then a single clip in the middle of
                the video is sampled. The clips are aggregated in the batch
                dimension.
            min_resize: Frames are resized so that `min(height, width)` is
                `min_resize`.
            crop_size: Final size of the frame after cropping the resized
                frames. Both height and width are the same.
            multi_crop: Accepted but not used by this build.
            crop_resize_style: Accepted but not used by this build.
            min_aspect_ratio: Accepted but not used by this build.
            max_aspect_ratio: Accepted but not used by this build.
            min_area_ratio: Accepted but not used by this build.
            max_area_ratio: Accepted but not used by this build.
            zero_centering_image: If `True`, frames are normalized to values in
                [-1, 1]. If `False`, values in [0, 1].
            color_augmentation: Accepted but not used by this build.
            max_num_words: Forwarded to `add_text` as `max_num_tokens`.
            max_context_sentences: Accepted but not used by this build; the
                number of captions is fixed to 1.
            tokenizer: Base name of the vocabulary file; the file
                `./misc/<tokenizer>.txt` is loaded by the word tokenizer.
            prepend_bos: Forwarded to `add_text`.
            append_eos: Forwarded to `add_text`.
            keep_raw_string: Forwarded to `add_text`.
            num_samples: Forwarded to `add_audio` as the number of audio
                samples.
            audio_stride: Forwarded to `add_audio` as its `stride`.
            sync_audio_and_image: Accepted but not used by this build; audio
                and text are added with `sync_random_state=False`.
            one_hot_label: Return labels as one hot tensors.
            output_label_string: Accepted but not used by this build.
            add_label_name: Also return the name of the label.
            **kwargs: Ignored.
        """
        modalities.add_image(
            parser_builder=self.parser_builder,
            sampler_builder=self.sampler_builder,
            decoder_builder=self.decoder_builder,
            preprocessor_builder=self.preprocessor_builder,
            postprocessor_builder=self.postprocessor_builder,
            is_training=is_training,
            num_frames=num_frames,
            stride=stride,
            num_test_clips=num_test_clips,
            min_resize=min_resize,
            crop_size=crop_size,
            zero_centering_image=zero_centering_image,
            input_feature_name="image/encoded")

        # No preprocessor builder is handed to the audio modality here, as in
        # the original snippet.
        modalities.add_audio(
            parser_builder=self.parser_builder,
            sampler_builder=self.sampler_builder,
            postprocessor_builder=self.postprocessor_builder,
            input_feature_name="WAVEFORM/feature/floats",
            output_feature_name='audio',
            is_training=is_training,
            num_samples=num_samples,
            # Use the audio-specific stride rather than the video frame stride.
            stride=audio_stride,
            num_test_clips=num_test_clips,
            sync_random_state=False,
        )

        # Honour the `tokenizer` argument instead of a hard-coded vocabulary.
        self.tokenizer = tokenizers.WordTokenizer(
            os.path.join('./misc', tokenizer + ".txt"))
        self.tokenizer.initialize()

        modalities.add_text(
            parser_builder=self.parser_builder,
            decoder_builder=self.decoder_builder,
            preprocessor_builder=self.preprocessor_builder,
            tokenizer=self.tokenizer,
            is_training=is_training,
            input_feature_name="caption/string",
            output_raw_string_name='text_string',
            output_feature_name='text',
            prepend_bos=prepend_bos,
            append_eos=append_eos,
            keep_raw_string=keep_raw_string,
            max_num_captions=1,
            # Honour `max_num_words` instead of a hard-coded 16.
            max_num_tokens=max_num_words,
            sync_random_state=False,
        )

        modalities.add_label(
            parser_builder=self.parser_builder,
            decoder_builder=self.decoder_builder,
            preprocessor_builder=self.preprocessor_builder,
            one_hot_label=one_hot_label,
            num_classes=ToyFactory2._NUM_CLASSES,
            add_label_name=add_label_name)
loading dmvr dataset
import tensorflow as tf
# Create the toy factory with its default arguments, then configure it for
# training mode.
factory = ToyFactory2()
factory = factory.configure(is_training=True)

# Build the dataset with one example per batch and fetch the first batch.
ds = factory.make_dataset(batch_size=1)
batch_iterator = iter(ds)
data = next(batch_iterator)

# Show the available feature keys followed by the full batch contents.
print(data.keys(), data)
Since VATT and google-scenic use dmvr for their data-loading step, it would be very helpful for everyone if a multimodal example were added to the docs.
Thanks
can you finish multimodal example
Thanks for your amazing work.
Error
[[{{node ParseSingleSequenceExample/ParseSequenceExample/ParseSequenceExampleV2}}]] [Op:IteratorGetNext] 2022-04-15 13:19:55.258721: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at example_parsing_ops.cc:480 : INVALID_ARGUMENT: Name: <unknown>, Key: image/encoded, Index: 0. Number of values != expected. values size: 100 but output shape: [] 2022-04-15 13:19:55.259783: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at example_parsing_ops.cc:480 : INVALID_ARGUMENT: Name: <unknown>, Key: image/encoded, Index: 0. Number of values != expected. values size: 100 but output shape: [] 2022-04-15 13:19:55.261186: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at example_parsing_ops.cc:480 : INVALID_ARGUMENT: Name: <unknown>, Key: image/encoded, Index: 0. Number of values != expected. values size: 100 but output shape: [] 2022-04-15 13:19:55.263445: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at example_parsing_ops.cc:480 : INVALID_ARGUMENT: Name: <unknown>, Key: image/encoded, Index: 0. Number of values != expected. values size: 100 but output shape: []
Using the hmdb example, I was creating a dmvr multimodal dataset with images, audio, text, and labels. However, I am facing the same errors raised by other users earlier (google-research/google-research#925 (comment)). I can also run the provided hmdb video example, but I run into trouble once audio and text are added.
I generated a single tf-record using
generate_from_csv.py
script provided in the examples, using 4 videos with captions. Attaching the code for the dataset constructor class:
from dmvr import tokenizers
import os
from dmvr import modalities
from dmvr import video_dataset


class ToyFactory2(video_dataset.BaseVideoDatasetFactory):
    """Toy DMVR dataset factory combining image, audio, text and label modalities.

    The factory reads a single shard file from `base_dir` and registers the
    four modalities on the builders provided by `BaseVideoDatasetFactory`.
    """

    _SUBSETS = ('train', 'test', 'valid')
    _SPLITS = (1, 2, 3)
    _NUM_CLASSES = 4
    _NUM_SHARDS = {'train': 59, 'test': 39, 'valid': 1}

    def __init__(
            self,
            base_dir: str = 'generated_dataset',
            subset: str = 'valid',
            split: int = 1):
        """Constructor of ToyFactory2.

        Args:
            base_dir: Directory that contains the shard file.
            subset: One of the names in `_SUBSETS`.
            split: One of the values in `_SPLITS`.

        Raises:
            ValueError: If `subset` or `split` is not an allowed value.
        """
        if subset not in ToyFactory2._SUBSETS:
            raise ValueError('Invalid subset "{}". The available subsets are: {}'
                             .format(subset, ToyFactory2._SUBSETS))

        if split not in ToyFactory2._SPLITS:
            raise ValueError('Invalid split "{}". The available splits are: {}'
                             .format(split, ToyFactory2._SPLITS))

        # NOTE: `subset` and `split` are only validated; the shard file name is
        # fixed, so every subset/split currently reads the same single shard.
        super().__init__(
            shards=[os.path.join(base_dir, 'kinetics400_val-00000-of-00001')])

    def _build(self,
               is_training: bool = True,
               # Video related parameters.
               num_frames: int = 32,
               stride: int = 1,
               num_test_clips: int = 1,
               min_resize: int = 256,
               crop_size: int = 224,
               multi_crop: bool = False,
               crop_resize_style: str = 'Inception',
               min_aspect_ratio: float = 0.5,
               max_aspect_ratio: float = 2,
               min_area_ratio: float = 0.08,
               max_area_ratio: float = 1.0,
               zero_centering_image: bool = False,
               color_augmentation: bool = True,
               # Text related parameters.
               max_num_words: int = 16,
               max_context_sentences: int = 1,
               tokenizer: str = 'howto100m_en',
               prepend_bos: bool = False,
               append_eos: bool = False,
               keep_raw_string: bool = False,
               # Audio related parameters.
               num_samples: int = 153600,  # 48000 (Hz) * 32 / 10 (fps)
               audio_stride: int = 1,
               sync_audio_and_image: bool = True,
               # Label related parameters.
               one_hot_label: bool = True,
               output_label_string: bool = False,
               add_label_name: bool = False,
               **kwargs):
        """Default build for this dataset.

        Args:
            is_training: Whether or not in training mode.
            num_frames: Number of frames per subclip. For single images, use 1.
            stride: Temporal stride to sample frames.
            num_test_clips: Number of test clips (1 by default). If more than 1,
                this will sample multiple linearly spaced clips within each
                video at test time. If 1, then a single clip in the middle of
                the video is sampled. The clips are aggregated in the batch
                dimension.
            min_resize: Frames are resized so that `min(height, width)` is
                `min_resize`.
            crop_size: Final size of the frame after cropping the resized
                frames. Both height and width are the same.
            multi_crop: Accepted but not used by this build.
            crop_resize_style: Accepted but not used by this build.
            min_aspect_ratio: Accepted but not used by this build.
            max_aspect_ratio: Accepted but not used by this build.
            min_area_ratio: Accepted but not used by this build.
            max_area_ratio: Accepted but not used by this build.
            zero_centering_image: If `True`, frames are normalized to values in
                [-1, 1]. If `False`, values in [0, 1].
            color_augmentation: Accepted but not used by this build.
            max_num_words: Forwarded to `add_text` as `max_num_tokens`.
            max_context_sentences: Accepted but not used by this build; the
                number of captions is fixed to 1.
            tokenizer: Base name of the vocabulary file; the file
                `./misc/<tokenizer>.txt` is loaded by the word tokenizer.
            prepend_bos: Forwarded to `add_text`.
            append_eos: Forwarded to `add_text`.
            keep_raw_string: Forwarded to `add_text`.
            num_samples: Forwarded to `add_audio` as the number of audio
                samples.
            audio_stride: Forwarded to `add_audio` as its `stride`.
            sync_audio_and_image: Accepted but not used by this build; audio
                and text are added with `sync_random_state=False`.
            one_hot_label: Return labels as one hot tensors.
            output_label_string: Accepted but not used by this build.
            add_label_name: Also return the name of the label.
            **kwargs: Ignored.
        """
        modalities.add_image(
            parser_builder=self.parser_builder,
            sampler_builder=self.sampler_builder,
            decoder_builder=self.decoder_builder,
            preprocessor_builder=self.preprocessor_builder,
            postprocessor_builder=self.postprocessor_builder,
            is_training=is_training,
            num_frames=num_frames,
            stride=stride,
            num_test_clips=num_test_clips,
            min_resize=min_resize,
            crop_size=crop_size,
            zero_centering_image=zero_centering_image,
            input_feature_name="image/encoded")

        # No preprocessor builder is handed to the audio modality here, as in
        # the original snippet.
        modalities.add_audio(
            parser_builder=self.parser_builder,
            sampler_builder=self.sampler_builder,
            postprocessor_builder=self.postprocessor_builder,
            input_feature_name="WAVEFORM/feature/floats",
            output_feature_name='audio',
            is_training=is_training,
            num_samples=num_samples,
            # Use the audio-specific stride rather than the video frame stride.
            stride=audio_stride,
            num_test_clips=num_test_clips,
            sync_random_state=False,
        )

        # Honour the `tokenizer` argument instead of a hard-coded vocabulary.
        self.tokenizer = tokenizers.WordTokenizer(
            os.path.join('./misc', tokenizer + ".txt"))
        self.tokenizer.initialize()

        modalities.add_text(
            parser_builder=self.parser_builder,
            decoder_builder=self.decoder_builder,
            preprocessor_builder=self.preprocessor_builder,
            tokenizer=self.tokenizer,
            is_training=is_training,
            input_feature_name="caption/string",
            output_raw_string_name='text_string',
            output_feature_name='text',
            prepend_bos=prepend_bos,
            append_eos=append_eos,
            keep_raw_string=keep_raw_string,
            max_num_captions=1,
            # Honour `max_num_words` instead of a hard-coded 16.
            max_num_tokens=max_num_words,
            sync_random_state=False,
        )

        modalities.add_label(
            parser_builder=self.parser_builder,
            decoder_builder=self.decoder_builder,
            preprocessor_builder=self.preprocessor_builder,
            one_hot_label=one_hot_label,
            num_classes=ToyFactory2._NUM_CLASSES,
            add_label_name=add_label_name)
loading dmvr dataset
import tensorflow as tf

# Create the toy factory with its default arguments, then configure it for
# training mode.
factory = ToyFactory2()
factory = factory.configure(is_training=True)

# Build the dataset with one example per batch and fetch the first batch.
ds = factory.make_dataset(batch_size=1)
batch_iterator = iter(ds)
data = next(batch_iterator)

# Show the available feature keys followed by the full batch contents.
print(data.keys(), data)
Since VATT and google-scenic use dmvr for their data-loading step, it would be very helpful for everyone if a multimodal example were added to the docs.
Thanks
can you finish
google-scenic uses dmvr as da
Have you fixed it? I also ran into the same problem, and it must be a dataset problem.