/BERT_with_keras

A Keras version of Google's BERT model

Primary language: Python. License: MIT.

BERT_with_keras

This is an implementation of BERT (Bidirectional Encoder Representations from Transformers) with Keras.

The backend of Keras must be tensorflow.

Usage

Here is a quick-start example that preprocesses raw data for pre-training and then fine-tunes a text classifier. For more details, see Predicting Movie Review Sentiment with BERT.

Data

Let's use Stanford's Large Movie Review Dataset for BERT pre-training and fine-tuning. The code below, which downloads, extracts, and imports the dataset, is borrowed from this TensorFlow tutorial. The dataset consists of IMDB movie reviews labeled by positivity from 1 to 10.

import os
import re
import tensorflow as tf
import pandas as pd

# Matches IMDB review file names of the form "<id>_<rating>.txt".
_REVIEW_FILENAME_RE = re.compile(r"\d+_(\d+)\.txt")


# Load all files from a directory in a DataFrame.
def load_directory_data(directory):
    """Read every review file in ``directory`` into a DataFrame.

    File names are expected to look like ``<id>_<rating>.txt``; the rating is
    taken from the file name. Entries whose name does not match that pattern
    (for example hidden files) are skipped instead of raising.

    Args:
        directory: Path of a local directory containing the review files.

    Returns:
        A ``pandas.DataFrame`` with a ``sentence`` column (the file contents)
        and a ``sentiment`` column (the rating, kept as a string).
    """
    data = {"sentence": [], "sentiment": []}
    # Sort so that the row order does not depend on the file system.
    for file_name in sorted(os.listdir(directory)):
        match = _REVIEW_FILENAME_RE.match(file_name)
        if match is None:
            continue
        # os.listdir already limits this function to local paths, so the
        # built-in open() is sufficient and avoids the TF1-only tf.gfile API.
        # newline="" keeps line endings exactly as stored in the file.
        file_path = os.path.join(directory, file_name)
        with open(file_path, "r", encoding="utf-8", newline="") as f:
            data["sentence"].append(f.read())
        data["sentiment"].append(match.group(1))
    return pd.DataFrame.from_dict(data)

# Merge positive and negative examples, add a polarity column and shuffle.
def load_dataset(directory):
    """Load the ``pos`` and ``neg`` sub-directories as one shuffled DataFrame.

    Args:
        directory: Directory that contains the ``pos`` and ``neg`` folders.

    Returns:
        A DataFrame holding the rows of both folders in random order, with a
        ``polarity`` column (1 for ``pos`` rows, 0 for ``neg`` rows) and a
        fresh 0..n-1 index.
    """
    frames = []
    for folder, polarity in (("pos", 1), ("neg", 0)):
        frame = load_directory_data(os.path.join(directory, folder))
        frame["polarity"] = polarity
        frames.append(frame)

    combined = pd.concat(frames)
    # sample(frac=1) returns every row exactly once, in random order.
    shuffled = combined.sample(frac=1)
    return shuffled.reset_index(drop=True)

# Download and process the dataset files.
def download_and_load_datasets(force_download=False):
    """Fetch the IMDB review archive and load its train and test splits.

    Args:
        force_download: Accepted as a parameter, but the body does not
            currently read it.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames built by ``load_dataset``.
    """
    archive_path = tf.keras.utils.get_file(
        fname="aclImdb.tar.gz",
        origin="http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
        extract=True,
    )

    # Both splits are read from the "aclImdb" directory next to the archive.
    dataset_root = os.path.join(os.path.dirname(archive_path), "aclImdb")
    train_df = load_dataset(os.path.join(dataset_root, "train"))
    test_df = load_dataset(os.path.join(dataset_root, "test"))
    return train_df, test_df
 
# Load the train and test splits as DataFrames (the archive is fetched via get_file).
train, test = download_and_load_datasets()

Pre-training

Let's train a BERT pre-training model.

import os
import spacy
from const import bert_data_path,bert_model_path
from preprocess import create_pretraining_data_from_docs
from pretraining import bert_pretraining
# spaCy's English pipeline is used here only to split reviews into sentences.
nlp = spacy.load('en')

# use IMDB movie reviews as pretraining data
texts = train['sentence'].tolist() + test['sentence'].tolist()

# One list of sentence strings per review. nlp.pipe streams the texts through
# the pipeline in batches, which spaCy recommends over calling nlp(text) once
# per document when processing many texts.
sentences_texts = [[s.text for s in doc.sents] for doc in nlp.pipe(texts)]

# Vocabulary file and output file both live under bert_data_path.
vocab_path = os.path.join(bert_data_path, 'vocab.txt')
pretraining_data_path = os.path.join(bert_data_path, 'pretraining_data.npz')

# Build the pre-training data from the sentence-split reviews and save it
# to pretraining_data_path.
create_pretraining_data_from_docs(
    sentences_texts,
    vocab_path=vocab_path,
    save_path=pretraining_data_path,
    token_method='wordpiece',
    language='en',
    dupe_factor=10,
)

# Inputs of the pre-training run: the .npz file written by
# create_pretraining_data_from_docs and the BERT configuration file.
train_data_path = os.path.join(bert_data_path, 'pretraining_data.npz')
bert_config_file = os.path.join(bert_data_path, 'bert_config.json')

# Run pre-training; bert_model_path is passed as the save location together
# with the file names for the pre-training model and the encoder.
bert_pretraining(
    train_data_path=train_data_path,
    bert_config_file=bert_config_file,
    save_path=bert_model_path,
    batch_size=32,
    seq_length=128,
    max_predictions_per_seq=20,
    val_batch_size=32,
    multi_gpu=0,
    num_warmup_steps=1000,
    checkpoints_interval_steps=1000,
    pretraining_model_name='bert_pretraining.h5',
    encoder_model_name='bert_encoder.h5',
)

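After this, the pre-training data can be found at the `save_path` given to `create_pretraining_data_from_docs`, and the pre-trained models in the `save_path` given to `bert_pretraining`.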

Fine-tuning

You can use the pre-trained model as the starting point for your own NLP model. For example, you can use it to initialize a classifier model.

import os
import keras
import numpy as np
from const import bert_data_path, bert_model_path
from modeling import BertConfig
from classifier import SingleSeqDataProcessor, convert_examples_to_features, Text_Classifier, save_features, TextSequence
from tokenization import FullTokenizer
from optimization import AdamWeightDecayOpt
from checkpoint import StepModelCheckpoint

# data preprocessing: wrap the raw sentences and polarity labels as examples
train_examples = SingleSeqDataProcessor.get_train_examples(
    train_data=train['sentence'].tolist(),
    labels=train['polarity'].tolist(),
)
dev_examples = SingleSeqDataProcessor.get_dev_examples(
    dev_data=test['sentence'].tolist(),
    labels=test['polarity'].tolist(),
)

# the word-piece tokenizer needs a prepared vocabulary file.
vocab_path = os.path.join(bert_data_path, 'vocab.txt')

# load vocab to tokenizer
tokenizer = FullTokenizer(vocab_path, do_lower_case=True)

# convert the train and dev examples to features
train_features = convert_examples_to_features(
    train_examples,
    label_list=[0, 1],
    max_seq_length=128,
    tokenizer=tokenizer,
)
dev_features = convert_examples_to_features(
    dev_examples,
    label_list=[0, 1],
    max_seq_length=128,
    tokenizer=tokenizer,
)

# convert features to a dictionary of numpy arrays.
train_features_array_dict = save_features(features=train_features)
dev_features_array_dict = save_features(features=dev_features)

# get train and validation data: three model inputs plus one-hot labels
input_keys = ('input_ids', 'input_mask', 'segment_ids')
train_x = [train_features_array_dict[key] for key in input_keys]
train_y = keras.utils.to_categorical(train_features_array_dict['label_ids'], 2)
val_x = [dev_features_array_dict[key] for key in input_keys]
val_y = keras.utils.to_categorical(dev_features_array_dict['label_ids'], 2)

# load the BERT configuration file
config_path = os.path.join(bert_data_path, 'bert_config.json')
config = BertConfig.from_json_file(config_path)

epochs = 3
num_gpus = None
# if you run into an OOM problem, reduce the batch size.
batch_size = 16

# total number of training steps = batches per epoch * number of epochs.
num_train_samples = len(train_features_array_dict['input_ids'])
steps_per_epoch = int(np.ceil(num_train_samples / batch_size))
num_train_steps = steps_per_epoch * epochs
print("number of train steps: {}".format(num_train_steps))

# Use the weight-decay Adam optimizer. It is slightly different from Keras's
# standard Adam optimizer; for details, see the source of AdamWeightDecayOpt.
adam = AdamWeightDecayOpt(
    lr=5e-5,
    num_train_steps=num_train_steps,
    num_warmup_steps=100,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-6,
    weight_decay_rate=0.01,
    exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"],
)

# This checkpoint evaluates the BERT model on batch end and saves the best one.
# NOTE(review): period/start_step presumably mean "every 100 steps, starting at
# step 100" -- confirm against StepModelCheckpoint.
checkpoint = StepModelCheckpoint(
    filepath=os.path.join(bert_model_path, 'imdb_classifer_model.h5'),
    verbose=1,
    monitor='val_acc',
    save_best_only=True,
    xlen=3,
    period=100,
    start_step=100,
    val_batch_size=128,
)

# create a model
# The encoder file name matches the encoder_model_name ('bert_encoder.h5')
# that the pre-training step was called with, so the weights produced by the
# pre-training example are the ones loaded here.
classifier = Text_Classifier(
    bert_config=config,
    pretrain_model_path=os.path.join(bert_model_path, 'bert_encoder.h5'),
    batch_size=batch_size,
    seq_length=128,
    optimizer=adam,
    num_classes=2,
    metrics=['acc'],
    multi_gpu=num_gpus,
)

# When using multiple GPUs, the parallel BERT model can't be used to
# evaluate/predict; only the CPU-built model can, so hand it to the checkpoint.
if num_gpus is not None:
    checkpoint.single_gpu_model = classifier.model

# train model: feed the training arrays in batches through a TextSequence
generator = TextSequence(x=train_x, y=train_y, batch_size=batch_size)
validation_data = (val_x, val_y)
callbacks = [checkpoint]

history = classifier.fit_generator(
    generator=generator,
    epochs=epochs,
    shuffle=True,
    callbacks=callbacks,
    validation_data=validation_data,
)