topepo/caret

(1) preProcess not working inside train? (2) make preProcess transform response inside train.

Opened this issue · 0 comments

train, preProcess
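Bug? Categorical (factor) predictors are handled differently by `preProcess` inside `train()` than by a standalone `preProcess()` call: standalone, the factor is reported as "ignored"; inside `train()`, it is counted as centered and scaled.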
Bug? `train()` reports that the predictors were centered and scaled, but the `trainingData` it returns still contains the untransformed values.
Enhancement: Allow `preProcess` to also transform the response (not only the predictors) inside `train()`.
Thanks for caret and tidymodels!

library(caret)
#> Loading required package: ggplot2
#> Loading required package: lattice

# Build a three-column working copy of mtcars: the response (mpg), one
# numeric predictor (wt) and one predictor that is turned into a factor (am).
df <- mtcars[c("mpg", "wt", "am")]
# Change am to factor ("0" or "1") so it is a categorical predictor
# rather than a numeric 0/1 column.
df$am <- factor(df$am)
# Show one example row for each am level (row 1 has am = 1, row 4 has am = 0).
df[c(1, 4), ]
#>                 mpg    wt am
#> Mazda RX4      21.0 2.620  1
#> Hornet 4 Drive 21.4 3.215  0
# Confirm the column types: mpg and wt are numeric, am is a two-level factor.
str(df, nchar.max = 20)
#> 'data.frame':    32 obs. of  3 variables:
#>  $ mpg: num  21 21| __truncated__ ...
#>  $ wt : num  2.62 | __truncated__ ...
#>  $ am : Factor w/ 2 levels "0","1": 2 2 2 1 1 1 1 1 1 1 ...

# Baseline: estimate centering/scaling outside train() and apply it by hand.
# preProcess only transforms predictors inside train, not response.
# Exclude response (mpg) to make preProcesses of two models similar.
preProcValues <- caret::preProcess(df[c("wt", "am")], method = c("center", "scale"))
# Apply the estimated transformation to the full data frame. mpg was not part
# of the preProcess() input, and the rows printed from
# preproc_out$trainingData further down show it with its original values.
trainTransformed <- predict(preProcValues, df)
# Single formula shared by both train() calls so the fits differ only in
# where the pre-processing happens.
model_formula <- formula(mpg ~ wt + am)

# transformed outside train
# Model A: receives the already centred/scaled data and no preProcess
# argument, so train() is not asked to transform anything itself.
preproc_out <- caret::train(
  form = model_formula,
  data = trainTransformed,
  method = "lm"
)

# transformed inside train
# Model B: receives the raw data and asks train() to centre and scale
# through its preProcess argument.
# NOTE(review): no seed is set before either train() call; if train()
# resamples randomly by default, the resampling results of the two fits are
# not directly comparable -- confirm against the caret documentation.
preproc_in <- caret::train(
  form = model_formula,
  data = df,
  method = "lm",
  preProcess = c("center", "scale")
)

# Compare the two pre-processing objects.
# Standalone preProcess(): the factor column am is reported as "ignored",
# so only one variable (wt) is centred and scaled.
preProcValues # Categorical variable ignored outside train
#> Created from 32 samples and 2 variables
#> 
#> Pre-processing:
#>   - centered (1)
#>   - ignored (1)
#>   - scaled (1)
# preProcess stored by train(): both variables are reported as centred and
# scaled and none as ignored.
# NOTE(review): with the formula interface, am has presumably been expanded
# into a numeric dummy column before preProcess runs, which would explain
# the count of 2 -- confirm against the caret documentation.
preproc_in$preProcess # Categorical variable not ignored inside train
#> Created from 32 samples and 2 variables
#> 
#> Pre-processing:
#>   - centered (2)
#>   - ignored (0)
#>   - scaled (2)
# Compare the training data stored on each fit (same two example rows).
# Model A stores the data it was given, which was already transformed:
# wt is centred/scaled, am is unchanged.
preproc_out$trainingData[c(1, 4), ] # centred and scaled outside train
#>                .outcome           wt am
#> Mazda RX4          21.0 -0.610399567  1
#> Hornet 4 Drive     21.4 -0.002299538  0
# Model B stores wt on its original scale even though preProcess was requested.
# NOTE(review): trainingData may by design hold the data as supplied, with the
# pre-processing applied only when the model is fit or used for prediction;
# comparing coefficients or predictions would show whether the transformation
# was actually used -- confirm against the caret documentation.
preproc_in$trainingData[c(1, 4), ] # raw values stored inside train despite preProcess being reported
#>                .outcome    wt am
#> Mazda RX4          21.0 2.620  1
#> Hornet 4 Drive     21.4 3.215  0

# Strict object-level comparisons; all three come back FALSE.
# The pre-processing objects differ, consistent with the summaries above.
identical(preProcValues, preproc_in$preProcess)
#> [1] FALSE
# The stored training data differ, consistent with the rows printed above.
identical(preproc_out$trainingData, preproc_in$trainingData)
#> [1] FALSE
# NOTE(review): identical() on fitted model objects presumably also compares
# stored data and other bookkeeping, so FALSE here does not by itself show
# that the fits disagree -- comparing coefficients or predictions would be
# more informative.
identical(preproc_out$finalModel, preproc_in$finalModel)
#> [1] FALSE

Created on 2023-05-06 with reprex v2.0.2

Session info
sessioninfo::session_info()
#> ─ Session info ───────────────────────────────────────────────────────────────
#>  setting  value
#>  version  R version 4.2.3 (2023-03-15)
#>  os       macOS Ventura 13.3.1
#>  system   aarch64, darwin20
#>  ui       X11
#>  language (EN)
#>  collate  en_US.UTF-8
#>  ctype    en_US.UTF-8
#>  tz       America/Toronto
#>  date     2023-05-06
#>  pandoc   2.19.2 @ /Applications/RStudio.app/Contents/Resources/app/quarto/bin/tools/ (via rmarkdown)
#> 
#> ─ Packages ───────────────────────────────────────────────────────────────────
#>  package      * version    date (UTC) lib source
#>  caret        * 6.0-94     2023-03-21 [1] CRAN (R 4.2.0)
#>  class          7.3-22     2023-05-03 [1] CRAN (R 4.2.3)
#>  cli            3.6.1      2023-03-23 [1] CRAN (R 4.2.0)
#>  codetools      0.2-19     2023-02-01 [1] CRAN (R 4.2.3)
#>  colorspace     2.1-0      2023-01-23 [1] CRAN (R 4.2.0)
#>  data.table     1.14.8     2023-02-17 [1] CRAN (R 4.2.0)
#>  digest         0.6.31     2022-12-11 [1] CRAN (R 4.2.0)
#>  dplyr          1.1.2      2023-04-20 [1] CRAN (R 4.2.0)
#>  evaluate       0.21       2023-05-05 [1] CRAN (R 4.2.0)
#>  fansi          1.0.4      2023-01-22 [1] CRAN (R 4.2.0)
#>  fastmap        1.1.1      2023-02-24 [1] CRAN (R 4.2.0)
#>  foreach        1.5.2      2022-02-02 [1] CRAN (R 4.2.0)
#>  fs             1.6.2      2023-04-25 [1] CRAN (R 4.2.0)
#>  future         1.32.0     2023-03-07 [1] CRAN (R 4.2.0)
#>  future.apply   1.10.0     2022-11-05 [1] CRAN (R 4.2.0)
#>  generics       0.1.3      2022-07-05 [1] CRAN (R 4.2.0)
#>  ggplot2      * 3.4.2      2023-04-03 [1] CRAN (R 4.2.0)
#>  globals        0.16.2     2022-11-21 [1] CRAN (R 4.2.0)
#>  glue           1.6.2      2022-02-24 [1] CRAN (R 4.2.0)
#>  gower          1.0.1      2022-12-22 [1] CRAN (R 4.2.0)
#>  gtable         0.3.3      2023-03-21 [1] CRAN (R 4.2.0)
#>  hardhat        1.3.0      2023-03-30 [1] CRAN (R 4.2.3)
#>  htmltools      0.5.5      2023-03-23 [1] CRAN (R 4.2.0)
#>  ipred          0.9-14     2023-03-09 [1] CRAN (R 4.2.0)
#>  iterators      1.0.14     2022-02-05 [1] CRAN (R 4.2.0)
#>  knitr          1.42       2023-01-25 [1] CRAN (R 4.2.0)
#>  lattice      * 0.21-8     2023-04-05 [1] CRAN (R 4.2.0)
#>  lava           1.7.2.1    2023-02-27 [1] CRAN (R 4.2.0)
#>  lifecycle      1.0.3      2022-10-07 [1] CRAN (R 4.2.0)
#>  listenv        0.9.0      2022-12-16 [1] CRAN (R 4.2.0)
#>  lubridate      1.9.2      2023-02-10 [1] CRAN (R 4.2.0)
#>  magrittr       2.0.3      2022-03-30 [1] CRAN (R 4.2.0)
#>  MASS           7.3-60     2023-05-04 [1] CRAN (R 4.2.0)
#>  Matrix         1.5-4      2023-04-04 [1] CRAN (R 4.2.0)
#>  ModelMetrics   1.2.2.2    2020-03-17 [1] CRAN (R 4.2.0)
#>  munsell        0.5.0      2018-06-12 [1] CRAN (R 4.2.0)
#>  nlme           3.1-162    2023-01-31 [1] CRAN (R 4.2.3)
#>  nnet           7.3-19     2023-05-03 [1] CRAN (R 4.2.3)
#>  parallelly     1.35.0     2023-03-23 [1] CRAN (R 4.2.0)
#>  pillar         1.9.0      2023-03-22 [1] CRAN (R 4.2.0)
#>  pkgconfig      2.0.3      2019-09-22 [1] CRAN (R 4.2.0)
#>  plyr           1.8.8      2022-11-11 [1] CRAN (R 4.2.0)
#>  pROC           1.18.0     2021-09-03 [1] CRAN (R 4.2.0)
#>  prodlim        2023.03.31 2023-04-02 [1] CRAN (R 4.2.0)
#>  purrr          1.0.1      2023-01-10 [1] CRAN (R 4.2.0)
#>  R.cache        0.16.0     2022-07-21 [1] CRAN (R 4.2.0)
#>  R.methodsS3    1.8.2      2022-06-13 [1] CRAN (R 4.2.0)
#>  R.oo           1.25.0     2022-06-12 [1] CRAN (R 4.2.0)
#>  R.utils        2.12.2     2022-11-11 [1] CRAN (R 4.2.0)
#>  R6             2.5.1      2021-08-19 [1] CRAN (R 4.2.0)
#>  Rcpp           1.0.10     2023-01-22 [1] CRAN (R 4.2.0)
#>  recipes        1.0.6      2023-04-25 [1] CRAN (R 4.2.0)
#>  reprex         2.0.2      2022-08-17 [1] CRAN (R 4.2.0)
#>  reshape2       1.4.4      2020-04-09 [1] CRAN (R 4.2.0)
#>  rlang          1.1.1      2023-04-28 [1] CRAN (R 4.2.0)
#>  rmarkdown      2.21       2023-03-26 [1] CRAN (R 4.2.3)
#>  rpart          4.1.19     2022-10-21 [1] CRAN (R 4.2.3)
#>  rstudioapi     0.14       2022-08-22 [1] CRAN (R 4.2.0)
#>  scales         1.2.1      2022-08-20 [1] CRAN (R 4.2.0)
#>  sessioninfo    1.2.2      2021-12-06 [1] CRAN (R 4.2.0)
#>  stringi        1.7.12     2023-01-11 [1] CRAN (R 4.2.0)
#>  stringr        1.5.0      2022-12-02 [1] CRAN (R 4.2.0)
#>  styler         1.9.1      2023-03-04 [1] CRAN (R 4.2.0)
#>  survival       3.5-5      2023-03-12 [1] CRAN (R 4.2.0)
#>  tibble         3.2.1      2023-03-20 [1] CRAN (R 4.2.0)
#>  tidyselect     1.2.0      2022-10-10 [1] CRAN (R 4.2.0)
#>  timechange     0.2.0      2023-01-11 [1] CRAN (R 4.2.0)
#>  timeDate       4022.108   2023-01-07 [1] CRAN (R 4.2.0)
#>  utf8           1.2.3      2023-01-31 [1] CRAN (R 4.2.0)
#>  vctrs          0.6.2      2023-04-19 [1] CRAN (R 4.2.0)
#>  withr          2.5.0      2022-03-03 [1] CRAN (R 4.2.0)
#>  xfun           0.39       2023-04-20 [1] CRAN (R 4.2.0)
#>  yaml           2.3.7      2023-01-23 [1] CRAN (R 4.2.0)
#> 
#>  [1] /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library
#> 
#> ──────────────────────────────────────────────────────────────────────────────