keras-team/keras-tuner

`ValueError: y_true must be of shape [batch_size, 1]` for summarisation NLP model.

ktunk opened this issue · 0 comments

ktunk commented

Hi, I am working on a summarisation task and I have used the Hugging Face 'google/mt5-small' pre-trained model for it. I am getting the ValueError below about the y_label shape while running the hyperparameter tuning code shown further down.

ValueError: y_true must be of shape [batch_size, 1]. Found shape: (None, 128)
My y_label data is the summary text, and I am passing its token encodings to the tuner.
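
For reference, this is roughly how the encoded summary labels end up shaped (a minimal check of my own, assuming the same tokenizer and max_target_length as in the code below); this is where the (None, 128) in the error comes from:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('google/mt5-small')
# one row of token ids per summary, padded to 128 positions
sample = tokenizer(["a short example summary"], truncation=True, max_length=128, padding="max_length", return_tensors='tf')
print(sample['input_ids'].shape)  # (1, 128), batched later as (None, 128)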

code:

import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split

import keras_nlp
import keras_tuner as kt
from keras_tuner import HyperModel
import transformers
from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer

model_checkpoint = 'google/mt5-small'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

max_input_length = 1024
max_target_length = 128

# data: a DataFrame with 'document' and 'summary' text columns (not shown here)
train_texts, val_texts, train_labels, val_labels = train_test_split(data.document,
                                                                    data.summary,
                                                                    test_size=0.2,
                                                                    random_state=42)

#create encodings
#encoding for x (document) data
x_train_encodings = tokenizer(train_texts.tolist(), truncation=True, max_length=max_input_length, padding="max_length", return_tensors='tf')
x_val_encodings = tokenizer(val_texts.tolist(), truncation=True, max_length=max_input_length, padding="max_length", return_tensors='tf')
print(x_train_encodings)

#encoding for y (summary) data
y_train_encodings = tokenizer(train_labels.tolist(), truncation=True, max_length=max_target_length, padding="max_length", return_tensors='tf')
y_val_encodings = tokenizer(val_labels.tolist(), truncation=True, max_length=max_target_length, padding="max_length", return_tensors='tf')
print(y_val_encodings)

BATCH_SIZE = 8

# create dataset
x_inputs = dict(x_train_encodings)
x_inputs['labels'] = y_train_encodings['input_ids']
train_tf_dataset = tf.data.Dataset.from_tensor_slices(x_inputs)
# shuffle buffer sized to the number of training examples
train_tf_dataset = train_tf_dataset.shuffle(len(train_texts)).batch(BATCH_SIZE)

eval_inputs = dict(x_val_encodings)
eval_inputs['labels'] = y_val_encodings['input_ids']
eval_tf_dataset = tf.data.Dataset.from_tensor_slices(eval_inputs) 
eval_tf_dataset = eval_tf_dataset.batch(BATCH_SIZE) 

# tf_dataset will have these columns: 'input_ids', 'attention_mask', 'labels'
# x_input: 'input_ids', 'attention_mask',
# y_label: 'labels'
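
# A quick shape check (optional sketch) confirming what each batch carries:
# the labels come out with shape (BATCH_SIZE, 128), matching the
# "(None, 128)" reported in the error message.
for batch in train_tf_dataset.take(1):
    print(batch['labels'].shape)      # (8, 128)
    print(batch['input_ids'].shape)   # (8, 1024)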


# HyperModel class
class MyHyperModel(HyperModel):
    def __init__(self, model_checkpoint):
        super().__init__()
        self.model = TFAutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
        
    def build(self, hp):
        hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4, 2e-5])
        hp_decay = hp.Int('decay', min_value=10, max_value=100, step=10 )
            
        rouge_n = keras_nlp.metrics.RougeN()
        self.model.compile(optimizer=keras.optimizers.Adam(learning_rate=hp_learning_rate),  # , decay=hp_decay
                           metrics=['accuracy', rouge_n])
        return self.model
        return self.model


hyperModel = MyHyperModel(model_checkpoint)

tuner = kt.Hyperband(hyperModel,
                     objective=kt.Objective("rouge", direction="max"),
                     max_epochs=20,
                     factor=3,
                     overwrite=True,
                     directory="my_dir",
                     project_name="tune_hypermodel")


tuner.search(train_tf_dataset,
             epochs=25,
             batch_size=BATCH_SIZE,
             validation_data=eval_tf_dataset,
             verbose=2)