worldbank/REaLTabFormer

Crash using model.predict() to predict a column

I-Mamalikidis opened this issue · 0 comments

I'm using Jupyter Notebook 6.5.7 with REaLTabFormer 0.1.7, pandas 2.2.2, and numpy 1.26.3 on Windows 11 23H2.

I have a table which is the result of fully joining a SQL schema.

import pandas as pd

# Identifier columns of the joined tables. They are dropped from the frame
# before the model is fit (see the `fit` call further down).
primary_keys = ["customer.id", "contract.id", "invoice.id", "payment.id"]

# Toy stand-in for the fully joined schema: 10 rows, one per payment
# ("payment.id" is unique), with the customer / contract / invoice values
# repeated across the rows that belong to them.
data = {
    "customer.id": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
    "customer.income": [110, 110, 110, 110, 110, 110, 110, 110, 110, 110],
    "contract.id": [1, 1, 1, 1, 1, 2, 2, 2, 2, 2],
    "contract.income": [20, 20, 20, 20, 20, 40, 40, 40, 40, 40],
    "invoice.id": [1, 1, 1, 2, 2, 3, 3, 3, 3, 4],
    "invoice.greater_than_contract.income": [111, 111, 111, 70, 70, 70, 35, 35, 35, 10],
    "payment.id": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    "payment.amount": [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    "payment.random": [10, 3432, 564, 34, 5432, 564564, 34, 432, 65, 7564345]
}
source_join_df = pd.DataFrame(data)
# Bare expression: shows the frame as the notebook cell's output.
source_join_df

I train a tabular model and then generate some synthetic data:

import torch
import shutil
from realtabformer import REaLTabFormer
def _get_device():
    """Return the torch device name to run on.

    Returns 'cpu' when ``torch.cuda.device_count()`` reports no devices,
    otherwise 'cuda'.
    """
    gpu_count = torch.cuda.device_count()
    if gpu_count == 0:
        return 'cpu'
    return 'cuda'
def _sample_parent_model(model, n_samples, gen_batch, device):
    """Draw synthetic rows from ``model`` by delegating to ``model.sample``.

    Args:
        model: Object exposing ``sample(n_samples=..., gen_batch=..., device=...)``.
        n_samples: Forwarded as the ``n_samples`` keyword.
        gen_batch: Forwarded as the ``gen_batch`` keyword.
        device: Forwarded as the ``device`` keyword.

    Returns:
        Whatever ``model.sample`` returns.
    """
    sampling_kwargs = {
        "n_samples": n_samples,
        "gen_batch": gen_batch,
        "device": device,
    }
    return model.sample(**sampling_kwargs)


# Training hyper-parameters, grouped under the "table_training" key.
# NOTE(review): early_stopping_patience (5) is larger than n_epochs (3), so
# early stopping presumably never triggers in this reproduction -- confirm.
training_execution_params = {
   "table_training": {
      "n_epochs": 3,
      "batch_size": 8,
      "n_gradient_accumulation_steps": 1,
      "train_size": 0.8,
      "early_stopping_patience": 5,
      "early_stopping_threshold": 0.0
   }
}

table_training_params = training_execution_params['table_training']

# Build the tabular model from the parameters above; logging, evaluation and
# checkpoint saving are all configured per epoch, and checkpoints are written
# under "t0_checkpoints".
parent_model = REaLTabFormer(model_type="tabular",
                             batch_size=table_training_params['batch_size'],
                             epochs=table_training_params['n_epochs'],
                             gradient_accumulation_steps=table_training_params['n_gradient_accumulation_steps'],
                             logging_strategy="epoch",
                             evaluation_strategy="epoch",
                             save_strategy="epoch",
                             train_size=table_training_params['train_size'],
                             early_stopping_patience=table_training_params['early_stopping_patience'],
                             early_stopping_threshold=table_training_params['early_stopping_threshold'],
                             checkpoints_dir = f't0_checkpoints')

# Fit on every column except the primary keys.
# NOTE(review): n_critic=0 presumably skips the critic-based stopping step --
# confirm against the REaLTabFormer documentation.
trainer = parent_model.fit(df=source_join_df.drop(columns = primary_keys),
                           n_critic=0,
                           device=_get_device())

# Best-effort removal of any previously saved model directory before saving
# again. ignore_errors=True tolerates a missing directory (and other removal
# failures) without a bare `except:`, which would also swallow
# KeyboardInterrupt and SystemExit.
shutil.rmtree('green_model', ignore_errors=True)
# Persist the trained model under "green_model".
parent_model.save(f"green_model")

# Generation settings: sample as many rows as the source frame has, in
# batches of 8.
generation_execution_params = {
   "table_generation": {
      "batch_size": 8,
      "main_entity_table_n_samples": len(source_join_df)
   }
}

table_generation_params = generation_execution_params['table_generation']
# Draw the synthetic rows from the trained model via the helper defined above.
lr_synth_data = _sample_parent_model(model = parent_model,
                                      n_samples=table_generation_params['main_entity_table_n_samples'],
                                      gen_batch=table_generation_params['batch_size'],
                                      device=_get_device())

I use the newly created synthetic data of the fully joined table to patch values in an original table; the resulting dataframe is patch_v1. For simplicity, let's say that everything is patched, so patch_v1 is equal to the synthetic data:

# Work on a copy so the sampled frame itself is left untouched. The string
# cast and the SpecialTokens.UNK masking below are left commented out, i.e.
# they are NOT applied in this reproduction.
patch_v1 = lr_synth_data.copy()#.astype(str)
# patch_v1["customer.income"] = SpecialTokens.UNK
# patch_v1["contract.income"] = SpecialTokens.UNK
# Show the frame and its column dtypes in the notebook.
display(patch_v1)
display(patch_v1.dtypes)

Ideally, I want to be able to keep multiple columns frozen whilst predicting values for multiple other columns, but for simplicity let's say I want to predict 1 column and so I use the target_col argument of model.predict().

# Predict the "customer.income" column for the rows of patch_v1, in batches.
# This is the call that raises the AttributeError shown in the traceback below.
patch_v1 = parent_model.predict(patch_v1, target_col = "customer.income", batch = table_generation_params['batch_size'], device = _get_device())
display(patch_v1)

I'm getting AttributeError: Can only use .str accessor with string values! from process_data in realtabformer.data_utils.


AttributeError Traceback (most recent call last)
Cell In[10], line 1
----> 1 patch_v1 = parent_model.predict(patch_v1, target_col = "customer.income", batch = table_generation_params['batch_size'], device = _get_device())
2 display(patch_v1)

File C:\A3\envs\k2view\lib\site-packages\realtabformer\realtabformer.py:1376, in REaLTabFormer.predict(self, data, target_col, target_pos_val, batch, obs_sample, fillunk, device, disable_progress_bar, **generate_kwargs)
1357 tabular_sampler = TabularSampler.sampler_from_model(self, device=device)
1359 # TabularSampler(
1360 # model_type=self.model_type,
1361 # model=self.model,
(...)
1373 # device=device,
1374 # )
-> 1376 return tabular_sampler.predict(
1377 data=data,
1378 target_col=target_col,
1379 target_pos_val=target_pos_val,
1380 batch=batch,
1381 obs_sample=obs_sample,
1382 fillunk=fillunk,
1383 device=device,
1384 disable_progress_bar=disable_progress_bar,
1385 **generate_kwargs,
1386 )

File C:\A3\envs\k2view\lib\site-packages\realtabformer\rtf_sampler.py:722, in TabularSampler.predict(self, data, target_col, target_pos_val, batch, obs_sample, fillunk, device, disable_progress_bar, **generate_kwargs)
719 datasets.utils.disable_progress_bar()
721 for i in range(0, len(data), batch):
--> 722 seed_data = self._process_seed_input(data.iloc[i : i + batch])
723 if fillunk:
724 mode = seed_data.mode(dim=0).values

File C:\A3\envs\k2view\lib\site-packages\realtabformer\rtf_sampler.py:584, in TabularSampler._process_seed_input(self, seed_input)
580 seed_input = pd.DataFrame.from_dict({0: seed_input}, orient="index")
582 seed_input = seed_input[valid_cols]
--> 584 seed_data, _ = process_data(
585 df=seed_input, col_transform_data=self.col_transform_data
586 )
587 seed_data = make_dataset(seed_data, self.vocab, mask_rate=0, affix_eos=False)
589 generated = torch.tensor(seed_data["input_ids"])

File C:\A3\envs\k2view\lib\site-packages\realtabformer\data_utils.py:538, in process_data(df, numeric_max_len, numeric_precision, numeric_nparts, first_col_type, col_transform_data, target_col)
523 processed_df = pd.concat(
524 [
525 processed_df,
(...)
533 axis=1,
534 )
536 # Get the different sets of column types
537 cat_cols = processed_df.columns[
--> 538 processed_df.columns.str.contains(ColDataType.CATEGORICAL)
539 ]
540 numeric_cols = processed_df.columns[
541 ~processed_df.columns.str.contains(ColDataType.CATEGORICAL)
542 ]
544 if first_col_type == ColDataType.CATEGORICAL:

File C:\A3\envs\k2view\lib\site-packages\pandas\core\accessor.py:224, in CachedAccessor.__get__(self, obj, cls)
221 if obj is None:
222 # we're accessing the attribute of the class, i.e., Dataset.geo
223 return self._accessor
--> 224 accessor_obj = self._accessor(obj)
225 # Replace the property with the accessor object. Inspired by:
226 # https://www.pydanny.com/cached-property.html
227 # We need to use object.__setattr__ because we overwrite __setattr__ on
228 # NDFrame
229 object.__setattr__(obj, self._name, accessor_obj)

File C:\A3\envs\k2view\lib\site-packages\pandas\core\strings\accessor.py:191, in StringMethods.__init__(self, data)
188 def __init__(self, data) -> None:
189 from pandas.core.arrays.string_ import StringDtype
--> 191 self._inferred_dtype = self._validate(data)
192 self._is_categorical = isinstance(data.dtype, CategoricalDtype)
193 self._is_string = isinstance(data.dtype, StringDtype)

File C:\A3\envs\k2view\lib\site-packages\pandas\core\strings\accessor.py:245, in StringMethods._validate(data)
242 inferred_dtype = lib.infer_dtype(values, skipna=True)
244 if inferred_dtype not in allowed_types:
--> 245 raise AttributeError("Can only use .str accessor with string values!")
246 return inferred_dtype

AttributeError: Can only use .str accessor with string values!

What would be the correct way, given a trained model and a dataframe with identical schema to the training data, to predict 1 (or more) column(s) (simultaneously)?