[example/nlp] Replace tokenizer from pretrain in keras_nlp_tokenizers got error
Opened this issue · 3 comments
Issue Type
Bug
Source
source
Keras Version
Keras 2.13.1
Custom Code
Yes
OS Platform and Distribution
Linux Ubuntu 20.04.5 LTS
Python version
3.8.18
GPU model and memory
CPU/378GB
Current Behavior?
After I run the example GPT text generation from scratch with KerasNLP in my computer, I want to replace the tokenizer in the scipt from keras_nlp/tokenizers to realize my design. However, I got error;
I also check the size of the train dataset using:
features = iter(train_ds).next()
features[0].shape
all output of them was TensorShape([64, 128])
, when batch_size was set to 64.
What I want to do is replacing the tokenizer in the script with the pretrain tokenizer.
I could not solve this problem by myself, so I open this issue for help.
Standalone code to reproduce the issue or tutorial link
import os
import keras_nlp
import keras
import tensorflow.data as tf_data
import tensorflow.strings as tf_strings
# Data
BATCH_SIZE = 64
MIN_STRING_LEN = 512 # Strings shorter than this will be discarded
SEQ_LEN = 128 # Length of training sequences, in tokens
# Model
EMBED_DIM = 256
FEED_FORWARD_DIM = 128
NUM_HEADS = 3
NUM_LAYERS = 2
# VOCAB_SIZE = 5000 # Limits parameters in model.
# Training
EPOCHS = 5
# Inference
NUM_TOKENS_TO_GENERATE = 80
keras.utils.get_file(
origin="https://dldata-public.s3.us-east-2.amazonaws.com/simplebooks.zip",
extract=True,
)
dir = os.path.expanduser("~/.keras/datasets/simplebooks/")
# Load simplebooks-92 train set and filter out short lines.
raw_train_ds = (
tf_data.TextLineDataset(dir + "simplebooks-92-raw/train.txt")
.filter(lambda x: tf_strings.length(x) > MIN_STRING_LEN)
.batch(BATCH_SIZE)
.shuffle(buffer_size=256)
)
# Load simplebooks-92 validation set and filter out short lines.
raw_val_ds = (
tf_data.TextLineDataset(dir + "simplebooks-92-raw/valid.txt")
.filter(lambda x: tf_strings.length(x) > MIN_STRING_LEN)
.batch(BATCH_SIZE)
)
tokenizer = keras_nlp.models.GPT2Tokenizer.from_preset("gpt2_base_en")
vocab = tokenizer.get_vocabulary()
vocab_size = len(vocab)
gpt2_preprocessor = keras_nlp.models.GPT2Preprocessor(tokenizer, sequence_length=SEQ_LEN, add_start_token=False, add_end_token=False)
VOCAB_SIZE = len(vocab)
start_packer = keras_nlp.layers.StartEndPacker(
sequence_length=SEQ_LEN,
start_value=tokenizer.token_to_id("<|endoftext|>"),
)
def preprocess(inputs):
outputs = tokenizer(inputs)
features = start_packer(outputs)
labels = outputs
return features, labels
# Tokenize and split into train and label sequences.
train_ds = raw_train_ds.map(preprocess, num_parallel_calls=tf_data.AUTOTUNE).prefetch(
tf_data.AUTOTUNE
)
val_ds = raw_val_ds.map(preprocess, num_parallel_calls=tf_data.AUTOTUNE).prefetch(
tf_data.AUTOTUNE
)
inputs = keras.layers.Input(shape=(None,), dtype="int32")
# Embedding.
embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding(
vocabulary_size=VOCAB_SIZE,
sequence_length=SEQ_LEN,
embedding_dim=EMBED_DIM,
mask_zero=True,
)
x = embedding_layer(inputs)
# Transformer decoders.
for _ in range(NUM_LAYERS):
decoder_layer = keras_nlp.layers.TransformerDecoder(
num_heads=NUM_HEADS,
intermediate_dim=FEED_FORWARD_DIM,
)
x = decoder_layer(x) # Giving one argument only skips cross-attention.
# Output.
outputs = keras.layers.Dense(VOCAB_SIZE)(x)
model = keras.Model(inputs=inputs, outputs=outputs)
loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
perplexity = keras_nlp.metrics.Perplexity(from_logits=True, mask_token_id=tokenizer.token_to_id("!"))
model.compile(optimizer="adam", loss=loss_fn, metrics=[perplexity])
model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS)
Relevant log output
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
/home/limingbo/projects/GPT2/text_genration_gpt_pretrain_tokenizer.ipynb 单元格 16 line 9
95 perplexity = keras_nlp.metrics.Perplexity(from_logits=True, mask_token_id=tokenizer.token_to_id("!"))
96 model.compile(optimizer="adam", loss=loss_fn, metrics=[perplexity])
---> 98 model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS)
File ~/anaconda3/envs/limingbo_sf/lib/python3.8/site-packages/keras/src/utils/traceback_utils.py:70, in filter_traceback.<locals>.error_handler(*args, **kwargs)
67 filtered_tb = _process_traceback_frames(e.__traceback__)
68 # To get the full stack trace, call:
69 # `tf.debugging.disable_traceback_filtering()`
---> 70 raise e.with_traceback(filtered_tb) from None
71 finally:
72 del filtered_tb
File /tmp/__autograph_generated_fileguxsxbh3.py:15, in outer_factory.<locals>.inner_factory.<locals>.tf__train_function(iterator)
13 try:
14 do_return = True
---> 15 retval_ = ag__.converted_call(ag__.ld(step_function), (ag__.ld(self), ag__.ld(iterator)), None, fscope)
16 except:
17 do_return = False
AttributeError: in user code:
File "/home/limingbo/anaconda3/envs/limingbo_sf/lib/python3.8/site-packages/keras/src/engine/training.py", line 1338, in train_function *
return step_function(self, iterator)
File "/home/limingbo/anaconda3/envs/limingbo_sf/lib/python3.8/site-packages/keras/src/engine/training.py", line 1322, in step_function **
outputs = model.distribute_strategy.run(run_step, args=(data,))
File "/home/limingbo/anaconda3/envs/limingbo_sf/lib/python3.8/site-packages/keras/src/engine/training.py", line 1303, in run_step **
outputs = model.train_step(data)
File "/home/limingbo/anaconda3/envs/limingbo_sf/lib/python3.8/site-packages/keras/src/engine/training.py", line 1081, in train_step
loss = self.compute_loss(x, y, y_pred, sample_weight)
File "/home/limingbo/anaconda3/envs/limingbo_sf/lib/python3.8/site-packages/keras/src/engine/training.py", line 1139, in compute_loss
return self.compiled_loss(
File "/home/limingbo/anaconda3/envs/limingbo_sf/lib/python3.8/site-packages/keras/src/engine/compile_utils.py", line 265, in __call__
loss_value = loss_obj(y_t, y_p, sample_weight=sw)
File "/home/limingbo/anaconda3/envs/limingbo_sf/lib/python3.8/site-packages/keras/src/losses.py", line 142, in __call__
losses = call_fn(y_true, y_pred)
File "/home/limingbo/anaconda3/envs/limingbo_sf/lib/python3.8/site-packages/keras/src/losses.py", line 268, in call **
return ag_fn(y_true, y_pred, **self._fn_kwargs)
File "/home/limingbo/anaconda3/envs/limingbo_sf/lib/python3.8/site-packages/keras/src/losses.py", line 2385, in _ragged_tensor_sparse_categorical_crossentropy
return _ragged_tensor_apply_loss(fn, y_true, y_pred, y_pred_extra_dim=True)
File "/home/limingbo/anaconda3/envs/limingbo_sf/lib/python3.8/site-packages/keras/src/losses.py", line 1686, in _ragged_tensor_apply_loss
nested_splits_list = [rt.nested_row_splits for rt in (y_true, y_pred)]
File "/home/limingbo/anaconda3/envs/limingbo_sf/lib/python3.8/site-packages/keras/src/losses.py", line 1686, in <listcomp>
nested_splits_list = [rt.nested_row_splits for rt in (y_true, y_pred)]
AttributeError: 'Tensor' object has no attribute 'nested_row_splits'
I was able to reproduce the error, attaching the Gist here for reference. Thanks!
Thaks you for replying and review this issue! I have attached the Gist for reference. Please check your Colab notebook again.
@mattdangerw Would you like to take a look?
Thanks!