Doodleverse/segmentation_gym

Rewrite augmentation pipeline

ebgoldstein opened this issue · 14 comments

In the TF docs from 2.9 on, tf.keras.preprocessing has a deprecation warning:
https://www.tensorflow.org/versions/r2.10/api_docs/python/tf/keras/preprocessing

This will impact the make_data script, which relies on this suite of tools (i.e., tf.keras.preprocessing.image.ImageDataGenerator) to make the augmented imagery. See here:

print("Creating augmented files")

# Settings for the augmenting generators. The image and mask generators are
# both built from this one dict so they are configured identically.
data_gen_args = {
    'featurewise_center': False,
    'featurewise_std_normalization': False,
    'rotation_range': AUG_ROT,
    'width_shift_range': AUG_WIDTHSHIFT,
    'height_shift_range': AUG_HEIGHTSHIFT,
    'fill_mode': 'reflect',  # alternative: 'nearest'
    'zoom_range': AUG_ZOOM,
    'horizontal_flip': AUG_HFLIP,
    'vertical_flip': AUG_VFLIP,
}

# "Null" settings: the same keys, with rotation, shifts, zoom and flips
# all switched off.
null_data_gen_args = dict(
    data_gen_args,
    rotation_range=0,
    width_shift_range=0,
    height_shift_range=0,
    zoom_range=0,
    horizontal_flip=False,
    vertical_flip=False,
)

# image dimensions
NX, NY = TARGET_SIZE[0], TARGET_SIZE[1]

# augmenting generators -- important that each band has the same image generator
image_datagen = tf.keras.preprocessing.image.ImageDataGenerator(**data_gen_args)
mask_datagen = tf.keras.preprocessing.image.ImageDataGenerator(**data_gen_args)

# non-augmenting counterparts
null_image_datagen = tf.keras.preprocessing.image.ImageDataGenerator(**null_data_gen_args)
null_mask_datagen = tf.keras.preprocessing.image.ImageDataGenerator(**null_data_gen_args)
def _collect_into_images_subfolder(folder):
    """Move the .png/.jpg files lying directly in ``folder`` into ``folder/images``.

    Does nothing when ``folder`` holds no loose .png/.jpg files (for example
    on a rerun, after they have already been moved).

    Raises:
        OSError: if the ``images`` subfolder cannot be created. This is
            deliberately not swallowed: ``shutil.move`` onto a destination
            that is not an existing directory renames the file to that path,
            so carrying on could overwrite images.
    """
    loose_files = glob(folder+os.sep+'*.png') + glob(folder+os.sep+'*.jpg')
    if not loose_files:
        return

    subfolder = folder+os.sep+'images'
    # exist_ok makes a pre-existing subfolder a non-event without hiding
    # genuine failures such as missing permissions
    os.makedirs(subfolder, exist_ok=True)

    for file in loose_files:
        try:
            shutil.move(file, subfolder)
        except (shutil.Error, OSError) as err:
            # best effort, as before (e.g. a same-named file is already in
            # the subfolder), but report the skip instead of hiding it
            print("Could not move {} into {}: {}".format(file, subfolder, err))


## put images in subfolders
for w in W:
    _collect_into_images_subfolder(w)

## put label images in subfolders
_collect_into_images_subfolder(label_data_path)

# number of label images; used below to size the generator batches
n_im = len(glob(label_data_path+os.sep+'images'+os.sep+'*.*'))
#### make training generators directly, and in advance
train_generators = []
null_train_generators = []

for counter, w in enumerate(W):
    # every generator draws int(n_im/AUG_LOOPS) files per batch, in directory
    # order (shuffle=False), with the same seed
    per_loop = int(n_im/AUG_LOOPS)
    image_flow = dict(target_size=(NX, NY), batch_size=per_loop,
                      class_mode=None, seed=SEED, shuffle=False)
    # label images are additionally read as single-band, nearest-neighbour resized
    mask_flow = dict(image_flow, color_mode="grayscale", interpolation="nearest")

    print("folder: {}".format(w.split(os.sep)[-1]))
    img_generator = image_datagen.flow_from_directory(w, **image_flow)
    null_img_generator = null_image_datagen.flow_from_directory(w, **image_flow)

    print("folder: {}".format(label_data_path.split(os.sep)[-1]))
    # the seed must be the same as for the training set to get the same images
    mask_generator = mask_datagen.flow_from_directory(label_data_path, **mask_flow)
    null_mask_generator = null_mask_datagen.flow_from_directory(label_data_path, **mask_flow)

    # each entry is [image generator, mask generator, iterator over (image, mask) batches]
    train_generator = zip(img_generator, mask_generator)
    train_generators.append([img_generator, mask_generator, train_generator])

    null_train_generator = zip(null_img_generator, null_mask_generator)
    null_train_generators.append([null_img_generator, null_mask_generator, null_train_generator])
######################## generate and print files
# Draws batches from every entry of train_generators, stacks the per-folder
# images into one multi-band array, one-hot encodes the label, and writes each
# sample to a compressed .npz file.
# NOTE(review): indentation was lost in this paste; nesting must be confirmed
# against make_dataset.py.

# If a REMAP_CLASSES mapping is defined, NCLASSES is reset to the number of
# distinct values the mapping produces.
if 'REMAP_CLASSES' in locals():
NCLASSES = len(np.unique([REMAP_CLASSES[k] for k in REMAP_CLASSES]))
# running count of samples written; appended to every output filename
i = 0
# AUG_COPIES passes, each drawing AUG_LOOPS batches from every generator
for copy in tqdm(range(AUG_COPIES)):
for k in range(AUG_LOOPS):
# print("Working on copy number {} out of {}".format(copy,AUG_COPIES))
# print("Working on loop {} out of {}".format(k,AUG_LOOPS))
# print("Starting from augmented sample {}".format(i))
# per-folder image batches (X), label batches (Y) and filename lists (F)
X=[]; Y=[]; F=[]
for counter,train_generator in enumerate(train_generators):
#grab a batch of images and label images
# each train_generator entry is [img_generator, mask_generator, zipped iterator];
# [-1] is the zipped iterator that yields (image batch, label batch)
x, y = next(train_generator[-1])
y = np.round(y)
# recover the filenames of the batch just drawn from the image generator's
# batch_index and batch_size (index of the previous batch, floored at 0)
idx = np.maximum((train_generator[0].batch_index - 1) * train_generator[0].batch_size, 0)
filenames = train_generator[0].filenames[idx : idx + train_generator[0].batch_size]
X.append(x)
del x
Y.append(y)
del y
F.append(filenames)
del filenames
## for 1-band inputs, the generator will make 3-band inputs.
## that means for 3+ band inputs where the extra files encode just 1 band each,
## single bands are triplicated and the following code removes the redundancy:
## check for bands 0 and 1 being the same and if so, use only band 0
X3 = []
for x in X:
x3=[]
for im in x:
if np.all(im[:,:,0]==im[:,:,1]):
im = im[:,:,0]
x3.append(im)
X3.append(x3)
del X
# only the label batch drawn alongside the first entry of train_generators is kept
Y = Y[0]
# write them to file and increment the counter
for counter,lab in enumerate(Y):
# stack the counter-th image from every folder along the last axis
im = np.dstack([x[counter] for x in X3])
# the matching filename from every folder
files = np.dstack([x[counter] for x in F])
##============================================ label
if NCLASSES==1:
lab=lab.squeeze()
#lab[lab>0]=1
# integer label image: the whole (squeezed) array for NCLASSES==1,
# otherwise the first band only
if NCLASSES==1:
l = lab.astype(np.uint8)
else:
l = np.round(lab[:,:,0]).astype(np.uint8)
# NOTE(review): the remap below is applied in place, one (old, new) pair at a
# time, so a value produced by an earlier pair is remapped again if it equals
# a later pair's old value. It also rebinds the name k used by the AUG_LOOPS
# loop (harmless for the range iteration itself). Confirm both are intended.
if 'REMAP_CLASSES' in locals():
for k in REMAP_CLASSES.items():
l[l==int(k[0])] = int(k[1])
else:
# without a remap, values above NCLASSES are clipped to NCLASSES
l[l>NCLASSES]=NCLASSES
# only one class present: fill that class's plane with ones
if len(np.unique(l))==1:
nx,ny = l.shape
if NCLASSES==1:
lstack = np.zeros((nx,ny,NCLASSES+1))
else:
lstack = np.zeros((nx,ny,NCLASSES))
lstack[:,:,np.unique(l)[0]]=np.ones((nx,ny))
else:
# several classes present: one-hot encode by comparing each pixel value with
# np.arange(number of planes); 1+l[...,None]-1 is just l[...,None]
nx,ny = l.shape
if NCLASSES==1:
lstack = np.zeros((nx,ny,NCLASSES+1))
lstack[:,:,:NCLASSES+1] = (np.arange(NCLASSES+1) == 1+l[...,None]-1).astype(int) #one-hot encode
else:
lstack = np.zeros((nx,ny,NCLASSES))
lstack[:,:,:NCLASSES] = (np.arange(NCLASSES) == 1+l[...,None]-1).astype(int) #one-hot encode
# optional clean-up of every one-hot plane; np.pi*(FILTER_VALUE**2) is the
# size threshold passed to the two filters
if FILTER_VALUE>1:
for kk in range(lstack.shape[-1]):
#l = median(lstack[:,:,kk], disk(FILTER_VALUE))
# NOTE(review): the remove_small_objects result is overwritten by the next
# line, which reads lstack again rather than l, so only the
# remove_small_holes output is stored. Confirm whether the two filters
# were meant to be chained.
l = remove_small_objects(lstack[:,:,kk].astype('uint8')>0, np.pi*(FILTER_VALUE**2))
l = remove_small_holes(lstack[:,:,kk].astype('uint8')>0, np.pi*(FILTER_VALUE**2))
lstack[:,:,kk] = np.round(l).astype(np.uint8)
# NOTE(review): l is deleted here and named again in the final del below; if
# this branch runs, that later del would raise NameError unless l is rebound
# in between. Confirm against the real file.
del l
# contents of the output .npz: image, one-hot label, band count, source filenames
datadict={}
datadict['arr_0'] = im.astype(np.uint8)
datadict['arr_1'] = np.squeeze(lstack).astype(np.uint8)
datadict['num_bands'] = im.shape[-1]
try:
datadict['files'] = [fi.split(os.sep)[-1] for fi in files.squeeze()]
except:
# fallback when the basename extraction above fails: store the filename array as is
datadict['files'] = [files]
# NOTE(review): the counter i is appended after a fixed '000000' rather than
# zero-padded, so filenames grow longer as i gains digits.
np.savez_compressed(output_data_path+os.sep+ROOT_STRING+'_aug_nd_data_000000'+str(i),
**datadict)
del lstack, l, im
i += 1

In light of this, it seems wise to think/plan/prepare for the moment when we need to convert the augmentation routines to the recommended workflow using tf.keras.utils. The relevant links in the TF documentation can be found in the link above.

note that this has been discussed: #60

https://albumentations.ai/docs/api_reference/augmentations/ seems best, especially because we are concerned with environmental imagery, and the functional augs include sun glint, snow, and fog https://albumentations.ai/docs/api_reference/augmentations/functional/

2024 and this is still a christmas wish

I think I could take this on this year and would base it around

# Load every image under `folder` as an unlabelled, unshuffled tf.data.Dataset.
# labels=None / label_mode=None: the images carry no class labels (the
# segmentation label is a separate image), matching class_mode=None in the
# existing flow_from_directory calls. With labels='inferred' a class label
# would be inferred from the subfolder name and returned with every batch.
# NOTE: for label images use interpolation="nearest", as the existing mask
# generators do, so resizing does not invent new class values.
dataset = tf.keras.utils.image_dataset_from_directory(
    folder,
    labels=None,
    label_mode=None,
    class_names=None,
    batch_size=32,
    image_size=TARGET_SIZE,
    shuffle=False,
    seed=None,
    validation_split=None,
    subset=None,
    interpolation="bilinear"
)

Question: so I am guessing these augmentations get done at the time of training, and new images are not actually saved?
I think it would be easier (at least for me) to integrate albumentations by actually saving the augmented images with the rest of the dataset.

Correct. Gym works by preparing your dataset for you and making batched tensors of augmented data. This is deliberately done so you always know what data is used for training and what for validation. Importantly only the training data is augmented.

I would recommend we eventually modify the make_dataset.py function with an albumentations-based workflow. But yes, for now you could trial model training by augmenting the imagery first. But note that would be suboptimal in the long term because it needlessly duplicates image files. So let's put a basic workflow together and then ideally wrap that into the existing Gym workflow.

Just so we are all on the same page - make_datasets actually creates the augmented images, which are saved as npz files. Then train_model uses those (augmented) images (which are npz) to train the model. So images are not augmented 'on the fly' like in many workflows (i.e., preprocessing layers in the model, data generators, etc.), but rather pre-augmented. I recall the biggest reason we did this was for efficiency (GPU utilization is always near 100% for me, compared with many 'on the fly' augmentation strategies where GPU utilization is lower, at the expense of more CPU).

@mlundine - i agree that albumentations is the correct way to go.
@dbuscombe-usgs - I agree that we don't want to duplicate/save augmented images

Yes, that's a good summary. Pre-augmentation (as opposed to on the fly) has reproducibility benefits too, in the sense that the augmented data are saved in the "GPU-ready" npz format, and it would in theory be possible to assess the distributions of augmented data post hoc rather than in a non-reproducible, ad hoc way.

I think we're all interested in albumentations and I'm keen to get it at least as an option in the gym workflow

@mlundine - just looping back to getting Albumentations working w/o rewriting the augmentation pipeline:

Since we use the deprecated/old-style keras generators, the easiest method is to add a preprocessing function (https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image/ImageDataGenerator) in 3 easy steps:

  1. Add an import:
#import albumentations
import albumentations as A
  2. Define a preprocessing function with your chosen albumentations augs:
#preprocessing function with albumentations.. example with channel shuffle
def albumentize(image):
    """Run one image through an albumentations pipeline and return the result.

    Example preprocessing function; the pipeline here holds a single
    ChannelShuffle transform.
    """
    pipeline = A.Compose([A.ChannelShuffle()])
    return pipeline(image=image)['image']
  3. Add a call to the preprocessing function on lines 719-739 of make_dataset.py

so add

preprocessing_function = albumentize, under fill_mode='reflect',

for both generators

hope this helps as a quick way to get Albumentations working!

# Settings for the augmenting generators. The image and mask generators are
# both built from this one dict so they are configured identically.
data_gen_args = {
    'featurewise_center': False,
    'featurewise_std_normalization': False,
    'rotation_range': AUG_ROT,
    'width_shift_range': AUG_WIDTHSHIFT,
    'height_shift_range': AUG_HEIGHTSHIFT,
    'fill_mode': 'reflect',  # alternative: 'nearest'
    'zoom_range': AUG_ZOOM,
    'horizontal_flip': AUG_HFLIP,
    'vertical_flip': AUG_VFLIP,
}

# "Null" settings: the same keys, with rotation, shifts, zoom and flips
# all switched off.
null_data_gen_args = dict(
    data_gen_args,
    rotation_range=0,
    width_shift_range=0,
    height_shift_range=0,
    zoom_range=0,
    horizontal_flip=False,
    vertical_flip=False,
)

# image dimensions
NX, NY = TARGET_SIZE[0], TARGET_SIZE[1]

# augmenting generators -- important that each band has the same image generator
image_datagen = tf.keras.preprocessing.image.ImageDataGenerator(**data_gen_args)
mask_datagen = tf.keras.preprocessing.image.ImageDataGenerator(**data_gen_args)

# non-augmenting counterparts
null_image_datagen = tf.keras.preprocessing.image.ImageDataGenerator(**null_data_gen_args)
null_mask_datagen = tf.keras.preprocessing.image.ImageDataGenerator(**null_data_gen_args)

Clarifying that more: we don't want duplicates (original image and
augmented) in the training set? Or do we want a big training set with all
original images plus each augmentation?

The way we wrote it, the training split will be all augmentations, and the validation split is all non-augmented images. That being said, all the augmentations are random, so there is a possibility of getting non-augmented (or weakly augmented) images in the training split.

note also that in the config, AUG_COPIES will oversample your training split, so you can give it a bunch of different augmented copies of the training data...

I suggest if you want an albumentation version of Gym, feel free to create a branch (locally or on GH)... you could hard code it all in for your personal needs, but it would be awesome if you added variables to the config so that they can be turned on/off globally for everyone eventually

I agree with Evan. It seems the change he is suggesting here #81 (comment) is simple enough that it could be incorporated in the existing workflow easily (on a new branch). Doodleverse is definitely designed with a broad range of users and use-cases in mind. Perhaps it could be passed a list of albumentations-style augmentations you'd like. And if the list is empty (default), it just defaults to the status quo.

And yes, I have noticed that models tend to train better when presented with original plus augmented training data. There is no data leakage because the validation files are stored in a separate folder and are not augmented. If you wish to test this yourself,

  1. run make_datasets.py, then train_model.py to train a model
  2. delete all the non-augmented data (the files say 'noaug' in the name), then train_model.py again
  3. compare the 2 models

If you wish, you could add a config file parameter that suppresses the use of original imagery in training, but I recommend keeping original+augmentation by default