google-deepmind/dmvr

How to prepare Multi Modal Dataset

Opened this issue · 3 comments

Thanks for your amazing work.

Error

         [[{{node ParseSingleSequenceExample/ParseSequenceExample/ParseSequenceExampleV2}}]] [Op:IteratorGetNext]
2022-04-15 13:19:55.258721: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at example_parsing_ops.cc:480 : INVALID_ARGUMENT: Name: <unknown>, Key: image/encoded, Index: 0.  Number of values != expected.  values size: 100 but output shape: []
2022-04-15 13:19:55.259783: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at example_parsing_ops.cc:480 : INVALID_ARGUMENT: Name: <unknown>, Key: image/encoded, Index: 0.  Number of values != expected.  values size: 100 but output shape: []
2022-04-15 13:19:55.261186: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at example_parsing_ops.cc:480 : INVALID_ARGUMENT: Name: <unknown>, Key: image/encoded, Index: 0.  Number of values != expected.  values size: 100 but output shape: []
2022-04-15 13:19:55.263445: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at example_parsing_ops.cc:480 : INVALID_ARGUMENT: Name: <unknown>, Key: image/encoded, Index: 0.  Number of values != expected.  values size: 100 but output shape: []

Using the hmdb example, I was creating a dmvr multimodal dataset with images, audio, text, and labels. However, I am facing the same errors raised by users earlier (google-research/google-research#925 (comment)). I can run the provided video-only hmdb example, but I run into trouble once audio and text are added.

I generated a single TFRecord using the generate_from_csv.py script provided in the examples, using 4 videos with captions.
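For what it's worth, the error ("values size: 100 but output shape: []" for image/encoded) suggests a mismatch between how image/encoded is stored in the record and the shape the parser declares for it. Below is a minimal inspection sketch (my own addition, assuming the single shard path used in the factory code further down) that prints which keys end up in the SequenceExample context vs. the feature lists:

# Sketch for inspecting the generated shard; the path is an assumption taken
# from the factory code in this issue.
import tensorflow as tf

path = 'generated_dataset/kinetics400_val-00000-of-00001'
for raw in tf.data.TFRecordDataset([path]).take(1):
  seq = tf.train.SequenceExample.FromString(raw.numpy())
  print('context keys:', sorted(seq.context.feature.keys()))
  for key, feat_list in seq.feature_lists.feature_list.items():
    print('feature_list key:', key, '-> length', len(feat_list.feature))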

Attaching code for the dataset constructor class:

import os

from dmvr import modalities
from dmvr import tokenizers
from dmvr import video_dataset


class ToyFactory2(video_dataset.BaseVideoDatasetFactory):
  """Toy factory for a multimodal (image, audio, text, label) dataset."""

  _SUBSETS = ('train', 'test', 'valid')
  _SPLITS = (1, 2, 3)
  _NUM_CLASSES = 4
  _NUM_SHARDS = {'train': 59, 'test': 39, 'valid': 1}

  def __init__(self,
               base_dir: str = 'generated_dataset',
               subset: str = 'valid',
               split: int = 1):
    """Constructor of ToyFactory2."""
    if subset not in ToyFactory2._SUBSETS:
      raise ValueError('Invalid subset "{}". The available subsets are: {}'
                       .format(subset, ToyFactory2._SUBSETS))

    if split not in ToyFactory2._SPLITS:
      raise ValueError('Invalid split "{}". The available splits are: {}'
                       .format(split, ToyFactory2._SPLITS))

    num_shards = self._NUM_SHARDS[subset]  # Currently unused: a single shard is hard-coded below.
    super().__init__(
        shards=[os.path.join(base_dir, 'kinetics400_val-00000-of-00001')])

  def _build(self,
             is_training=True,
             # Video related parameters.
             num_frames=32,
             stride=1,
             num_test_clips=1,
             min_resize=256,
             crop_size=224,
             multi_crop=False,
             crop_resize_style='Inception',
             min_aspect_ratio=0.5,
             max_aspect_ratio=2,
             min_area_ratio=0.08,
             max_area_ratio=1.0,
             zero_centering_image=False,
             color_augmentation=True,
             # Text related parameters.
             max_num_words=16,
             max_context_sentences=1,
             tokenizer='howto100m_en',
             prepend_bos=False,
             append_eos=False,
             keep_raw_string=False,
             # Audio related parameters.
             num_samples=153600,  # 48000 (Hz) * 32 / 10 (fps).
             audio_stride=1,
             sync_audio_and_image=True,
             # Label related parameters.
             one_hot_label=True,
             output_label_string=False,
             add_label_name=False,
             **kwargs):
    """Default build for this dataset.

    Args:
      is_training: Whether or not in training mode.
      num_frames: Number of frames per subclip. For single images, use 1.
      stride: Temporal stride to sample frames.
      num_test_clips: Number of test clips (1 by default). If more than 1,
        this will sample multiple linearly spaced clips within each video at
        test time. If 1, then a single clip in the middle of the video is
        sampled. The clips are aggregated in the batch dimension.
      min_resize: Frames are resized so that `min(height, width)` is
        `min_resize`.
      crop_size: Final size of the frame after cropping the resized frames.
        Both height and width are the same.
      zero_centering_image: If `True`, frames are normalized to values in
        [-1, 1]. If `False`, values in [0, 1].
      one_hot_label: Return labels as one hot tensors.
      add_label_name: Also return the name of the label.
    """
    modalities.add_image(
        parser_builder=self.parser_builder,
        sampler_builder=self.sampler_builder,
        decoder_builder=self.decoder_builder,
        preprocessor_builder=self.preprocessor_builder,
        postprocessor_builder=self.postprocessor_builder,
        is_training=is_training,
        num_frames=num_frames, stride=stride,
        num_test_clips=num_test_clips,
        min_resize=min_resize, crop_size=crop_size,
        zero_centering_image=zero_centering_image,
        input_feature_name='image/encoded')

    modalities.add_audio(
        parser_builder=self.parser_builder,
        sampler_builder=self.sampler_builder,
        postprocessor_builder=self.postprocessor_builder,
        # preprocessor_builder=self.preprocessor_builder,
        input_feature_name='WAVEFORM/feature/floats',
        output_feature_name='audio',
        is_training=is_training,
        num_samples=num_samples,
        stride=audio_stride,  # Audio-specific stride (both stride values default to 1 here).
        num_test_clips=num_test_clips,
        sync_random_state=False,
    )

    self.tokenizer = tokenizers.WordTokenizer(
        os.path.join('./misc', 'howto100m_en' + '.txt'))
    self.tokenizer.initialize()

    modalities.add_text(
        parser_builder=self.parser_builder,
        decoder_builder=self.decoder_builder,
        preprocessor_builder=self.preprocessor_builder,
        tokenizer=self.tokenizer,
        is_training=is_training,
        input_feature_name='caption/string',
        output_raw_string_name='text_string',
        output_feature_name='text',
        prepend_bos=prepend_bos,
        append_eos=append_eos,
        keep_raw_string=keep_raw_string,
        max_num_captions=1,
        max_num_tokens=16,
        sync_random_state=False,
    )

    modalities.add_label(
        parser_builder=self.parser_builder,
        decoder_builder=self.decoder_builder,
        preprocessor_builder=self.preprocessor_builder,
        one_hot_label=one_hot_label,
        num_classes=ToyFactory2._NUM_CLASSES,
        add_label_name=add_label_name)

Loading the dmvr dataset:

import tensorflow as tf
factory = ToyFactory2().configure(is_training=True)
ds = factory.make_dataset(batch_size=1)
data = next(iter(ds))
print(data.keys(), data)
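
One more small check that can help (my own addition, not from the original post): look at `ds.element_spec` before pulling a batch, so the expected feature names, dtypes, and shapes are visible even if parsing later fails.

# Inspect the pipeline structure before iterating, then print per-feature shapes.
ds = factory.make_dataset(batch_size=1)
print(ds.element_spec)  # Feature names, dtypes and static shapes.
batch = next(iter(ds))
for name, tensor in batch.items():
  print(name, tensor.shape, tensor.dtype)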

Since VATT and google-scenic use dmvr for their data loading, it would be very helpful for everyone if a multimodal example were added to the docs.

Thanks

Can you finish the multimodal example?

Have you fixed it? I also met the same problem, and it must be a dataset problem.