Trampoline detection using a CNN

Intro

This repo contains some code used in my research project on neighborhood effects in consumption. You can download a working paper on my personal website.

Trampolines are popular among Swedish families. Thanks to their size and distinct shape, they can be detected in aerial photos. To collect data on trampoline ownership, I apply an Inception-ResNet model to a large set of aerial photos of Swedish neighborhoods taken between 2006 and 2018.

Image preprocessing

Each raw image is a 10,000 by 10,000 pixel GeoTIFF. As my data contains several thousand images, importing all of them into QGIS is not feasible. In geotiff_coverage.py, I extract metadata from each image and its corresponding JSON metadata file and collect the geographic extent of each image in a shapefile:

import json
import os, os.path

import fiona
from shapely.geometry import mapping, shape, box, Point
from osgeo import gdal

# read the affine geotransform of the GeoTIFF and derive its bounding box
data = gdal.Open(picFolder+file+'.tif', gdal.GA_ReadOnly)
geoTransform = data.GetGeoTransform()
minx = geoTransform[0]
maxy = geoTransform[3]
maxx = minx + geoTransform[1] * data.RasterXSize
miny = maxy + geoTransform[5] * data.RasterYSize
geom = box(minx, miny, maxx, maxy)

# image id and capture year are encoded in the file name
img_id = file[0:11]
img_year = file[12:]
yeard[picFolder+file+'.tif'] = int(img_year)  # yeard: dict mapping image path to year, defined earlier in the script
metapath = metaFolder+img_id+'_flygbild_'+img_year+'.json'

with open(metapath) as metafile:
    metadata = json.load(metafile)
    try:
        img_date = metadata['features'][0]['properties']['tidpunkt'][0:10]
        img_camera = metadata['features'][0]['properties']['kamera']
        # dest: fiona sink for the coverage shapefile, opened earlier in the script
        dest.write({'geometry': mapping(geom), 'properties': {'FILENAME': file,
                                                              'ID': img_id,
                                                              'DATE': img_date,
                                                              'YEAR': img_year,
                                                              'CAMERA': img_camera}})
    except (KeyError, IndexError):
        # metadata file is missing the expected fields; error handling elided in this excerpt
        pass
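
The excerpt assumes that dest has already been opened as a Fiona sink with a matching schema. A minimal sketch of how that could be set up (the output path and CRS are assumptions, not taken from the repo):

from fiona.crs import from_epsg

coverage_schema = {
    'geometry': 'Polygon',
    'properties': {'FILENAME': 'str', 'ID': 'str', 'DATE': 'str',
                   'YEAR': 'str', 'CAMERA': 'str'}}

# output path and CRS are assumed (EPSG:3006, SWEREF 99 TM, is the standard Swedish grid)
dest = fiona.open('geotiff_coverage.shp', 'w', driver='ESRI Shapefile',
                  crs=from_epsg(3006), schema=coverage_schema)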

Using GDAL, I then create virtual mosaics (VRT) containing all the photos from a particular year (a short sketch of this step appears after the shapefile constants below). To slice each image into chips, each containing a single-family property, I use administrative shapefiles containing the centroid coordinates of every building and polygons for each property parcel. From shp_preprocess.py:

BUILDING_SHP = 'by_all.shp'
LOT_SHP = 'ay_all.shp'
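
As an aside, the per-year virtual mosaics can be built with a single call to gdal.BuildVRT; a minimal sketch, with the folder layout assumed:

import glob
from osgeo import gdal

# assumed folder layout: one subfolder of GeoTIFFs per year
tif_paths = glob.glob('photos/2018/*.tif')
vrt = gdal.BuildVRT('mosaic_2018.vrt', tif_paths)
vrt = None  # setting the dataset to None flushes the VRT to disk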

Using the building code field in by_all.shp, I select the subset of land parcels that intersect with a single-family home:

import fiona
from rtree import index
from shapely.geometry import shape, mapping

TARGET_SHP = outFolder+'Lots_all_10000.shp'

maxArea = 10000  # discard parcels larger than 10,000 m^2

source = fiona.open(LOT_SHP, 'r', encoding='iso-8859-1')

polygons = [pol for pol in source]
# POINTS_SHP: shapefile with the single-family building points, defined earlier in the script
points = [pt for pt in fiona.open(POINTS_SHP)]

# build a spatial index over the parcel polygons
idx = index.Index()
for pos, poly in enumerate(polygons):
    idx.insert(pos, shape(poly['geometry']).bounds)

# iterate through points and keep, for each point, the smallest parcel that contains it
with fiona.open(TARGET_SHP, 'w', driver=source.driver, crs=source.crs, schema=source.schema) as dest:
    for i, pt in enumerate(points):
        point = shape(pt['geometry'])
        minArea = maxArea
        minpoly = {}
        # candidate parcels from the spatial index
        for j in idx.intersection(point.coords[0]):
            geom = shape(polygons[j]["geometry"])
            area = geom.area
            if point.within(geom) and area < maxArea:
                # only keep the smallest matching property for each point
                if area < minArea:
                    minArea = area
                    minpoly = polygons[j]
        if len(minpoly) > 0:
            dest.write(minpoly)
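
The attribute filter on the building code itself is not excerpted above; conceptually it is a simple pass over the building points, along these lines (the field name and code value are hypothetical, not the actual ones in by_all.shp):

import fiona

SINGLE_FAMILY_CODE = 130  # hypothetical building code for single-family homes

with fiona.open(BUILDING_SHP, 'r', encoding='iso-8859-1') as buildings, \
     fiona.open('sf_points.shp', 'w', driver=buildings.driver,
                crs=buildings.crs, schema=buildings.schema) as sf_points:
    for b in buildings:
        if b['properties'].get('ANDAMAL_1') == SINGLE_FAMILY_CODE:  # hypothetical field name
            sf_points.write(b)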

With the relevant properties selected, cut_imgs.py slices the images into 300 by 300 pixel JPEGs (about 75 by 75 meters on the ground). Iterating over the features in the TARGET_SHP created above, I apply gdalwarp to the photos, with each property polygon outlining the cut.

I also set a minimum dimension to discard lots that are too small (30 pixels corresponds to about 7.5 meters). The script also includes a simple rotation algorithm that attempts to fit large properties into a 300 by 300 bounding box by rotating them (the angle argument of the toJpeg function); a sketch of that idea follows the code excerpt below.

import fiona
from shapely.geometry import shape, box, Point
from shapely.affinity import rotate
import os, os.path
from PIL import Image
from osgeo import gdal

def toJpeg(im, filename, max_side_dim, outFolder, angle):
    # paste the (possibly rotated) tile onto a black 300x300 canvas and export it as a JPEG
    size = (max_side_dim, max_side_dim)
    if angle == 0:
        background = Image.new('RGB', size, (0, 0, 0))
        background.paste(im, (int((size[0] - im.size[0]) / 2), int((size[1] - im.size[1]) / 2)))
        if background.getbbox():
            background.save(outFolder+filename+'.jpg', 'JPEG', quality=95)
            print("Jpeg exported.")
            return 1
        else:
            print("All black!")
            return 2
    ...

min_side_dim = 30
max_side_dim = 300

with fiona.open(SOURCE_SHP, 'r') as source:
    for idx, feat in enumerate(source):
        # write the current property polygon to a temporary cutline shapefile
        with fiona.open(path=inFolder+"tempfile.shp", mode='w', driver=source.driver, schema=source.schema, crs=source.crs) as tempshp:
            tempshp.write(feat)
        # newfile: output name derived from the feature; SOURCE_GTIFF: path to the photo/VRT (both set elsewhere in the script)
        outtile = gdal.Warp(outFolder+newfile+".tif", SOURCE_GTIFF, format='GTiff',
                            cutlineDSName=inFolder+"tempfile.shp", cropToCutline=True)
        outtile = None  # flush the warped tile to disk
        im = Image.open(outFolder+newfile+".tif", 'r')
        w, h = im.size
        if min(h, w) > min_side_dim and max(h, w) < max_side_dim:
            r = 0
            cut = toJpeg(im, newfile, max_side_dim, outFolder, r)
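
The rotation branch of toJpeg is elided above. As a rough illustration of the idea only (a hypothetical sketch, not the code from cut_imgs.py), a diagonally oriented lot can be rotated and re-cropped to check whether it then fits the 300 by 300 canvas:

def try_rotation(im, max_side_dim, angle):
    # rotate the warped tile, crop away the black border, and check whether it now fits
    rotated = im.rotate(angle, expand=True)  # expand=True keeps the full rotated content
    bbox = rotated.getbbox()                 # bounding box of the non-black pixels
    if bbox is None:
        return None
    cropped = rotated.crop(bbox)
    if max(cropped.size) < max_side_dim:
        return cropped                       # caller can paste this onto the 300x300 canvas
    return None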

Using the exported images, I manually label a set of training images (N = 22,435). For ambiguous cases, I create a script that asks the Google Maps Static API for a reference photo of the same coordinates:

import pickle
import time
import requests

def second_opinion(id, coords, topath):
    # request a high-resolution reference image from the Google Maps Static API
    GOOGLE_MAPS_API_URL = 'https://maps.googleapis.com/maps/api/staticmap'

    # coords is a (lon, lat) tuple; the API expects "lat,lng"
    coords = str(coords[1]) + "," + str(coords[0])

    params = {
            'key': API_KEY,  # API_KEY: Google Maps Static API key, defined elsewhere in the script
            'size': '512x512',
            'maptype': 'satellite',
            'zoom': '20',
            'format': 'jpg',
            'center': coords
        }
    response = requests.get(GOOGLE_MAPS_API_URL, params=params)
    print(response.status_code)
    time.sleep(1)

    filepath = topath + '_gmaps.jpg'
    print(filepath)

    with open(filepath, 'wb') as f:
        f.write(response.content)

# pickled dictionary with unique property id and centroid coordinate (WGS84)
ay_dict = pickle.load(open("ay_point_wgs.p", "rb"))

second_opinion(id_dict, ay_dict.get(id_dict), topath)  # id_dict: property id, topath: output path prefix

Since the Google Maps image is often of higher resolution than my source images, it facilitates labeling in ambiguous cases.

Training

For training, I use the Keras implementation of Inception-ResNet v2, which I instantiate with pre-trained ImageNet weights and a TensorFlow GPU backend. All training is done on an NVIDIA GTX 1080 Ti.

    
from keras import Model
from keras import backend as K
from keras.applications.inception_resnet_v2 import InceptionResNetV2
from keras.applications.inception_resnet_v2 import preprocess_input
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import Callback
from keras.callbacks import ModelCheckpoint
from keras.layers.core import Dense
from keras.models import Sequential
from keras.optimizers import Adam

in_shape = (300, 300, 3)

#parameters
EPOCHS = 30
BATCH_SIZE = 32
LR = 0.001
DECAY = 0.001
MOMENTUM = 0.9
BETA_1 = 0.9
BETA_2 = 0.999
EPSILON = 1e-08

#model
base_model = InceptionResNetV2(include_top=False, weights='imagenet', input_tensor=None, input_shape=in_shape, pooling='avg')

top_model = Sequential()
top_model.add(Dense(1, input_shape=(base_model.output_shape[1:]), activation='sigmoid'))
model = Model(inputs=base_model.input, outputs= top_model(base_model.output))

adam = Adam(lr=LR, beta_1=BETA_1, beta_2=BETA_2, epsilon=EPSILON, decay=0.0, amsgrad=False)
# pass the configured optimizer instance rather than the string 'adam'
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

I split my training data 80-10-10. To address the trampoline/no trampoline imbalance, I manually oversample the trampoline class to achieve a 50-50 balance for training and validation (see the sketch after the generator code below). I use the flow_from_directory method and standard augmentation techniques (flips, shifts and rotations):

train_datagen = ImageDataGenerator(
    width_shift_range=0.1,
    height_shift_range=0.1,
    vertical_flip=True,
    rotation_range=90,
    horizontal_flip=True,
    samplewise_center=False,
    samplewise_std_normalization=False,
    preprocessing_function=preprocess_input
    )

# train_dir holds one subfolder per class; img_size = (300, 300), matching the network input
train_generator = train_datagen.flow_from_directory(
    train_dir,
    target_size=img_size,
    batch_size=BATCH_SIZE,
    shuffle=True,
    class_mode='binary')
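
The manual oversampling mentioned above can be as simple as duplicating minority-class files until the two class folders are balanced; a minimal sketch, with the folder names assumed:

import math
import os
import shutil

# assumed directory layout: one subfolder per class under the training directory
pos_dir = 'train/trampoline'
neg_dir = 'train/no_trampoline'

pos = os.listdir(pos_dir)
neg = os.listdir(neg_dir)
copies = max(math.ceil(len(neg) / len(pos)) - 1, 0)  # extra copies needed per trampoline image

for fname in pos:
    base, ext = os.path.splitext(fname)
    for k in range(copies):
        shutil.copy(os.path.join(pos_dir, fname),
                    os.path.join(pos_dir, '{}_dup{}{}'.format(base, k, ext)))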

I also create callback objects for logging and for gradually reducing the learning rate as the validation accuracy stops improving. The LrReducer class allows the user to set parameters that determine the learning rate reduction and early stopping conditions.

class LrReducer(Callback):
    def __init__(self, patience=1, reduce_rate=0.5, reduce_nb=6, verbose=1):
        super(LrReducer, self).__init__()
        self.patience = patience
        self.wait = 0
        self.best_score = -1.
        self.reduce_rate = reduce_rate
        self.current_reduce_nb = 0
        self.reduce_nb = reduce_nb
        self.verbose = verbose

    def on_epoch_end(self, epoch, logs={}):
        current_score = logs.get('val_acc')
        if current_score > self.best_score:
            self.best_score = current_score
            self.wait = 0
            if self.verbose > 0:
                print('---current best val accuracy: %.3f' % current_score)
        else:
            if self.wait >= self.patience:
                self.current_reduce_nb += 1
                if self.current_reduce_nb <= self.reduce_nb:
                    # scale down the current learning rate
                    lr = K.get_value(self.model.optimizer.lr)
                    K.set_value(self.model.optimizer.lr, lr * self.reduce_rate)
                    new_lr = K.get_value(self.model.optimizer.lr)
                    if self.verbose > 0:
                        print('---LR reduced to: %f' % new_lr)
                else:
                    # maximum number of reductions reached: stop training
                    if self.verbose > 0:
                        print("Epoch %d: early stopping" % (epoch))
                    self.model.stop_training = True
            self.wait += 1

logCallback = EndCallback(timestr)  # EndCallback: custom logging callback defined elsewhere in the script
lrReduce = LrReducer()

filepath = model_path+"/ResNet-{epoch:02d}-{val_acc:.3f}.h5"  # unique file name that will include the epoch and the validation acc for that epoch
Checkpoint = ModelCheckpoint(filepath, monitor='val_acc', verbose=1, save_best_only=True, mode='max') # saves only the best ones

model.fit_generator(
    train_generator,
    steps_per_epoch=int(train_size/BATCH_SIZE),
    epochs=EPOCHS,
    validation_data=validation_generator,
    validation_steps=int(val_size/BATCH_SIZE),
    callbacks=[logCallback, lrReduce, Checkpoint])

model.save_weights("models/{}_weights.h5".format(NAME))
model.save("models/{}.h5".format(NAME))

Testing

In validation.py, I evaluate final model performance on the test set of manually labeled images (N = 1,819).
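
validation.py is not excerpted here, but a Keras evaluation of this kind looks roughly like the following sketch (the model file name and test_dir are assumptions):

from keras.models import load_model
from keras.preprocessing.image import ImageDataGenerator
from keras.applications.inception_resnet_v2 import preprocess_input

# assumed paths; the actual names in validation.py may differ
model = load_model('models/ResNet-best.h5')
test_dir = 'data/test'

test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)
test_generator = test_datagen.flow_from_directory(
    test_dir,                 # one subfolder per class, as for training
    target_size=(300, 300),
    batch_size=32,
    shuffle=False,            # keep order so predictions align with filenames
    class_mode='binary')

loss, acc = model.evaluate_generator(test_generator, steps=len(test_generator))
print('Test accuracy: %.3f' % acc)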

Visualizing the result

To visualize the predictions, I can overlay the predicted class for each property on the original photo (red = 'No trampoline', blue = 'Trampoline').
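
One way to produce such an overlay (a sketch only; the shapefile name and the 'PRED' field holding the predicted class are assumptions) is to plot the property polygons on top of the photo in map coordinates:

import fiona
import matplotlib.pyplot as plt
from osgeo import gdal
from shapely.geometry import shape

RESULT_SHP = 'lots_with_predictions.shp'  # assumed: parcel polygons with a 'PRED' attribute (1 = trampoline)
PHOTO_TIF = 'mosaic_2018.tif'             # assumed: the photo covering the area

ds = gdal.Open(PHOTO_TIF)
gt = ds.GetGeoTransform()
extent = [gt[0], gt[0] + gt[1] * ds.RasterXSize,   # [minx, maxx, miny, maxy] for imshow
          gt[3] + gt[5] * ds.RasterYSize, gt[3]]
img = ds.ReadAsArray().transpose(1, 2, 0)          # bands-last for matplotlib

fig, ax = plt.subplots(figsize=(10, 10))
ax.imshow(img, extent=extent)
with fiona.open(RESULT_SHP) as lots:
    for lot in lots:
        x, y = shape(lot['geometry']).exterior.xy
        color = 'blue' if lot['properties']['PRED'] == 1 else 'red'
        ax.plot(x, y, color=color, linewidth=1)
fig.savefig('prediction_overlay.jpg', dpi=300)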