speaksee: A Python repository from AImageLab

Speaksee is a Python package that provides utilities for working with Visual-Semantic data, developed at AImageLab.

Installation

To have a working installation, make sure you have Python 3.5+. You can then install speaksee via pip:

pip install speaksee

From source

You can also install speaksee from source with:

git clone https://github.com/aimagelab/speaksee
cd speaksee
pip install -e .

You can then obtain fresh upgrades without reinstalling, simply by running:

git pull

Example(s)

Pre-processing visual data

from speaksee.data import ImageField, TextField
from speaksee.data.pipeline import EncodeCNN
from speaksee.data.dataset import COCO
from torchvision.models import resnet101
# ToTensor is used in the transform pipeline below and must be imported too
from torchvision.transforms import Compose, Normalize, ToTensor
from torch import nn
import torch
from tqdm import tqdm

device = torch.device('cuda')

# Preprocess with some fancy cnn and transformation.
# The pretrained ResNet-101 is turned into a feature extractor:
#  - the average-pooling forward is replaced by a mean over the last two dimensions
#  - the final classifier is replaced by an empty Sequential, which returns its input unchanged
cnn = resnet101(pretrained=True).to(device)
cnn.avgpool.forward = lambda x : x.mean(-1).mean(-1)
cnn.fc = nn.Sequential()

# Convert images to tensors, then normalize each channel with the given means and standard deviations
transforms = Compose([
    ToTensor(),
    Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# NOTE(review): precomp_path presumably is the HDF5 file where the encoded features are cached
# (see the image_field.precomp call in the dataset example) - confirm against ImageField
prepro_pipeline = EncodeCNN(cnn, transforms)
image_field = ImageField(preprocessing=prepro_pipeline, precomp_path='/nas/houston/lorenzo/fc2k_coco.hdf5')

Pre-processing textual data

# Text pipeline: spaCy tokenization, lower-casing and punctuation removal enabled,
# with '<eos>' configured as the end-of-sequence token
text_field = TextField(
    tokenize='spacy',
    lower=True,
    remove_punctuation=True,
    eos_token='<eos>',
)

Calling a dataset

# Build the COCO dataset from the two fields, the image folder and the
# annotation folder (the same annotation path is passed for both arguments)
dataset = COCO(
    image_field,
    text_field,
    '/tmp/coco/images/',
    '/nas/houston/lorenzo/vse/data/coco/annotations',
    '/nas/houston/lorenzo/vse/data/coco/annotations',
)
(train_dataset, val_dataset, test_dataset) = dataset.splits
#image_field.precomp(dataset)  # do this once, or to refresh cache (we might change this in the near future)

# The vocabulary is built from the training and validation splits only, with min_freq=5
text_field.build_vocab(train_dataset, val_dataset, min_freq=5)

Training a model

from speaksee.models import FC
from speaksee.data import DataLoader
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
from torch.nn import NLLLoss

model = FC(len(text_field.vocab), 2048, 512, 512, dropout_prob_lm=0).to(device)

dataloader_train = DataLoader(train_dataset, batch_size=16, shuffle=True)
dataloader_val = DataLoader(val_dataset, batch_size=16)

optim = Adam(model.parameters(), lr=5e-4)
# Multiply the learning rate by 0.8 every 3 scheduler steps (one step per epoch, see below)
scheduler = StepLR(optim, step_size=3, gamma=.8)
# Padding positions are excluded from the loss
loss_fn = NLLLoss(ignore_index=text_field.vocab.stoi['<pad>'])

for e in range(50):
    # Training
    model.train()
    running_loss = .0
    with tqdm(desc='Epoch %d - train' % e, unit='it', total=len(dataloader_train)) as pbar:
        for it, (images, captions) in enumerate(dataloader_train):
            images, captions = images.to(device), captions.to(device)
            out = model(images, captions)
            optim.zero_grad()
            # Flatten predictions to (N, vocabulary size) and targets to (N,) for NLLLoss
            loss = loss_fn(out.view(-1, len(text_field.vocab)), captions.view(-1))
            loss.backward()
            optim.step()

            # The progress bar shows the running mean of the loss over the epoch
            running_loss += loss.item()
            pbar.set_postfix(loss=running_loss / (it+1))
            pbar.update()

    # Advance the learning-rate schedule once per epoch, after the optimizer updates;
    # without this call the StepLR decay is never applied
    scheduler.step()

    # Every third epoch, raise model.ss_prob by 0.05 while it is still below 0.25
    # NOTE(review): ss_prob is presumably the scheduled-sampling probability - confirm against FC
    if e % 3 == 0 and model.ss_prob < .25:
        model.ss_prob += .05

    # Validation (no gradients are needed, so autograd tracking is disabled)
    model.eval()
    running_loss = .0
    with tqdm(desc='Epoch %d - val' % e, unit='it', total=len(dataloader_val)) as pbar, torch.no_grad():
        for it, (images, captions) in enumerate(dataloader_val):
            images, captions = images.to(device), captions.to(device)
            out = model(images, captions)
            loss = loss_fn(out.view(-1, len(text_field.vocab)), captions.view(-1))

            running_loss += loss.item()
            pbar.set_postfix(loss=running_loss / (it+1))
            pbar.update()

    # Serialize model: one checkpoint per epoch, with the mean validation loss
    torch.save({
        'epoch': e,
        'val_loss': running_loss / len(dataloader_val),
        'state_dict': model.state_dict(),
        'optimizer': optim.state_dict(),
    }, '/nas/houston/lorenzo/fc_epoch_%03d.pth' % e)

Evaluating a model

# RawField is needed below to read the ground-truth captions
from speaksee.data import RawField
from speaksee.evaluation import Cider
from speaksee.evaluation import PTBTokenizer

# Make sure the model is in evaluation mode before generating captions
model.eval()

# Validation set served as (image, captions) pairs, with captions read through a RawField
dict_dataset_val = val_dataset.image_dictionary({'image': image_field, 'text': RawField()})
dict_dataloader_val = DataLoader(dict_dataset_val, batch_size=16)
gen = {}
gts = {}
with tqdm(desc='Validation', unit='it', total=len(dict_dataloader_val)) as pbar:
    for it, (images, caps_gt) in enumerate(dict_dataloader_val):
        images = images.to(device)
        with torch.no_grad():
            # NOTE(review): 50 and 2 are presumably the maximum length and the beam size - confirm against beam_search
            out = model.beam_search(images, 50, text_field.vocab.stoi['<eos>'], 2, out_size=1)
        caps_gen = text_field.decode(out)
        # Key every sample as '<batch index>_<position in batch>' so generated and ground-truth entries match
        for i, (gts_i, gen_i) in enumerate(zip(caps_gt, caps_gen)):
            gen['%d_%d' % (it, i)] = [gen_i, ]
            gts['%d_%d' % (it, i)] = gts_i
        pbar.update()

# Both dictionaries go through the same tokenizer before scoring
gts = PTBTokenizer.tokenize(gts)
gen = PTBTokenizer.tokenize(gen)
val_cider, _ = Cider().compute_score(gts, gen)
print("CIDEr is %f" % val_cider)

Model zoo

Model	CIDEr	Download
FC-2k (beam)	93.8	Download
Bottomup Topdown with sentinel	117.8	Download

The team

Speaksee is currently maintained by Lorenzo Baraldi, Marcella Cornia and Matteo Stefanini.