skrub-data/skrub

adding `make_learner` to create a default pipeline for a given predictor

Closed this issue · 0 comments

Problem Description

As discussed IRL, it might be useful to have a function to help configure a simple pipeline that chains a TableVectorizer with a predictor chosen by the user.
The function would choose appropriate TableVectorizer parameters depending on the predictor — for example, MinHashEncoder is suitable for HistGradientBoostingClassifier but not for LogisticRegression.

Rough outline of the intended behavior:

from sklearn.base import BaseEstimator
from sklearn import ensemble
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder

from skrub import TableVectorizer, MinHashEncoder, ToCategorical


def make_learner(predictor, n_jobs=None):
    vectorizer = TableVectorizer(n_jobs=n_jobs)
    match predictor:
        case "classifier":
            return make_learner(
                ensemble.HistGradientBoostingClassifier(
                    categorical_features="from_dtype"
                ),
                n_jobs=n_jobs,
            )
        case "regressor":
            return make_learner(
                ensemble.HistGradientBoostingRegressor(
                    categorical_features="from_dtype"
                ),
                n_jobs=n_jobs,
            )
        case ensemble.HistGradientBoostingClassifier(
            categorical_features="from_dtype"
        ) | ensemble.HistGradientBoostingRegressor(categorical_features="from_dtype"):
            vectorizer.set_params(
                low_cardinality_transformer=ToCategorical(),
                high_cardinality_transformer=MinHashEncoder(),
            )
        case (
            ensemble.HistGradientBoostingClassifier()
            | ensemble.HistGradientBoostingRegressor()
            | ensemble.RandomForestClassifier()
            | ensemble.RandomForestRegressor()
        ):
            vectorizer.set_params(
                low_cardinality_transformer=OrdinalEncoder(),
                high_cardinality_transformer=MinHashEncoder(),
            )
        case BaseEstimator():
            pass
        case cls if issubclass(cls, BaseEstimator):
            raise TypeError("pass an estimator instance not the class")
        case _:
            raise TypeError("pass a scikit-learn estimator")

    # TODO: should we add an imputer when the model does not have built-in
    # support for missing values?
    return make_pipeline(vectorizer, predictor)

Usage

>>> make_learner("regressor")
Pipeline(steps=[('tablevectorizer',
                 TableVectorizer(high_cardinality_transformer=MinHashEncoder(),
                                 low_cardinality_transformer=ToCategorical())),
                ('histgradientboostingregressor',
                 HistGradientBoostingRegressor(categorical_features='from_dtype'))])

>>> from sklearn.ensemble import HistGradientBoostingClassifier
>>> make_learner(HistGradientBoostingClassifier(), n_jobs=4)
Pipeline(steps=[('tablevectorizer',
                 TableVectorizer(high_cardinality_transformer=MinHashEncoder(),
                                 low_cardinality_transformer=OrdinalEncoder(),
                                 n_jobs=4)),
                ('histgradientboostingclassifier',
                 HistGradientBoostingClassifier())])

>>> from sklearn.linear_model import Ridge
>>> make_learner(Ridge())
Pipeline(steps=[('tablevectorizer', TableVectorizer()), ('ridge', Ridge())])

Feature Description

...

Alternative Solutions

No response

Additional Context

No response