MachineShop: Machine Learning Models and Tools

Overview

MachineShop is a meta-package for statistical and machine learning with a common interface for model fitting, prediction, performance assessment, and presentation of results. Support is provided for predictive modeling of numerical, categorical, and censored time-to-event outcomes, including those listed in the table below, and for resample (bootstrap, cross-validation, and split training-test sets) estimation of model performance.

		Response Variable Types
Method	Constructor	Categorical¹	Continuous²	Survival³
Bagging with Classification Trees	AdaBagModel	f
Boosting with Classification Trees	AdaBoostModel	f
Gradient Boosting with Regression Trees	BlackBoostModel	b	n	S
C5.0 Classification	C50Model	f
Conditional Random Forests	CForestModel	f	n	S
Cox Regression	CoxModel			S
Cox Regression (Stepwise)	CoxStepAICModel			S
Multivariate Adaptive Regression Splines	EarthModel	f	n
Flexible Discriminant Analysis	FDAModel	f
Gradient Boosting with Additive Models	GAMBoostModel	b	n	S
Generalized Boosted Regression	GBMModel	f	n	S
Gradient Boosting with Linear Models	GLMBoostModel	b	n	S
Generalized Linear Models	GLMModel	b	n
Generalized Linear Models (Stepwise)	GLMStepAICModel	b	n
Lasso and Elastic-Net	GLMNetModel	f	m, n	S
K-Nearest Neighbors Model	KNNModel	f, o	n
Linear Discriminant Analysis	LDAModel	f
Linear Model	LMModel	f	m, n
Mixture Discriminant Analysis	MDAModel	f
Naive Bayes Classifier	NaiveBayesModel	f
Feed-Forward Neural Networks	NNetModel	f	n
Penalized Discriminant Analysis	PDAModel	f
Partial Least Squares	PLSModel	f	n
Ordered Logistic Regression	POLRModel	o
Quadratic Discriminant Analysis	QDAModel	f
Random Forests	RandomForestModel	f	n
Fast Random Forests	RangerModel	f	n	S
Recursive Partitioning and Regression Trees	RPartModel	f	n	S
Stacked Regression	StackedModel	f, o	m, n	S
Super Learner	SuperModel	f, o	m, n	S
Parametric Survival	SurvRegModel			S
Parametric Survival (Stepwise)	SurvRegStepAICModel			S
Support Vector Machines	SVMModel	f	n
Support Vector Machines (ANOVA)	SVMANOVAModel	f	n
Support Vector Machines (Bessel)	SVMBesselModel	f	n
Support Vector Machines (Laplace)	SVMLaplaceModel	f	n
Support Vector Machines (Linear)	SVMLinearModel	f	n
Support Vector Machines (Poly)	SVMPolyModel	f	n
Support Vector Machines (Radial)	SVMRadialModel	f	n
Support Vector Machines (Spline)	SVMSplineModel	f	n
Support Vector Machines (Tanh)	SVMTanhModel	f	n
Regression and Classification Trees	TreeModel	f	n
Extreme Gradient Boosting	XGBModel	f	n
Extreme Gradient Boosting (DART)	XGBDARTModel	f	n
Extreme Gradient Boosting (Linear)	XGBLinearModel	f	n
Extreme Gradient Boosting (Tree)	XGBTreeModel	f	n
¹ b = binary, f = factor, o = ordered
² m = matrix, n = numeric
³ S = Surv

Installation

# Current release from CRAN
install.packages("MachineShop")

# Development version from GitHub
# install.packages("devtools")
devtools::install_github("brian-j-smith/MachineShop", ref = "develop")

# Development version with vignettes
devtools::install_github("brian-j-smith/MachineShop", ref = "develop", build_vignettes = TRUE)

Documentation

Once the package is installed, general documentation on its usage can be viewed with the following console commands.

library(MachineShop)

# Package help summary
?MachineShop

# Vignette
RShowDoc("Introduction", package = "MachineShop")

Parallel Computing

Resampling algorithms will be executed in parallel automatically if a parallel backend for the foreach package, such as doParallel, is loaded.

library(doParallel)
registerDoParallel(cores = 4)

Example

The following is a brief example illustrating use of the package to predict the species of flowers in Edgar Anderson’s iris data set.

Training and Test Set Analysis

## Load the package
library(MachineShop)
library(magrittr)

## Iris flower species (3 level response) data set
head(iris)
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
#> 1          5.1         3.5          1.4         0.2  setosa
#> 2          4.9         3.0          1.4         0.2  setosa
#> 3          4.7         3.2          1.3         0.2  setosa
#> 4          4.6         3.1          1.5         0.2  setosa
#> 5          5.0         3.6          1.4         0.2  setosa
#> 6          5.4         3.9          1.7         0.4  setosa

## Training and test sets
set.seed(123)
trainindices <- sample(nrow(iris), nrow(iris) * 2 / 3)
train <- iris[trainindices, ]
test <- iris[-trainindices, ]

## Model formula
fo <- Species ~ .

## Models by response type
modelinfo(factor(0)) %>% names
#>  [1] "AdaBagModel"       "AdaBoostModel"     "C50Model"         
#>  [4] "CForestModel"      "EarthModel"        "FDAModel"         
#>  [7] "GBMModel"          "GLMNetModel"       "KNNModel"         
#> [10] "LDAModel"          "LMModel"           "MDAModel"         
#> [13] "NaiveBayesModel"   "NNetModel"         "PDAModel"         
#> [16] "PLSModel"          "QDAModel"          "RandomForestModel"
#> [19] "RangerModel"       "RPartModel"        "StackedModel"     
#> [22] "SuperModel"        "SVMModel"          "SVMANOVAModel"    
#> [25] "SVMBesselModel"    "SVMLaplaceModel"   "SVMLinearModel"   
#> [28] "SVMPolyModel"      "SVMRadialModel"    "SVMSplineModel"   
#> [31] "SVMTanhModel"      "TreeModel"         "XGBModel"         
#> [34] "XGBDARTModel"      "XGBLinearModel"    "XGBTreeModel"

## Model-specific information
modelinfo(GBMModel)
#> $GBMModel
#> $GBMModel$label
#> [1] "Generalized Boosted Regression"
#> 
#> $GBMModel$packages
#> [1] "gbm"
#> 
#> $GBMModel$types
#> [1] "factor"  "numeric" "Surv"   
#> 
#> $GBMModel$arguments
#> function (distribution = NULL, n.trees = 100, interaction.depth = 1, 
#>     n.minobsinnode = 10, shrinkage = 0.1, bag.fraction = 0.5) 
#> NULL
#> 
#> $GBMModel$varimp
#> [1] TRUE

## Generalized boosted model fit to training set
gbmfit <- fit(fo, data = train, model = GBMModel)

## Variable importance
(vi <- varimp(gbmfit))
#>                 Overall
#> Petal.Length 100.000000
#> Petal.Width   14.601856
#> Sepal.Width    1.438558
#> Sepal.Length   0.000000

plot(vi)

## Test set predicted probabilities
predict(gbmfit, newdata = test, type = "prob") %>% head
#>         setosa   versicolor    virginica
#> [1,] 0.9999737 2.627994e-05 4.493784e-08
#> [2,] 0.9999154 8.456383e-05 4.205211e-09
#> [3,] 0.9999154 8.456383e-05 4.205211e-09
#> [4,] 0.9999737 2.627994e-05 4.493784e-08
#> [5,] 0.9998807 1.192834e-04 2.987679e-09
#> [6,] 0.9999024 9.764241e-05 2.445639e-09

## Test set predicted classifications
predict(gbmfit, newdata = test) %>% head
#> [1] setosa setosa setosa setosa setosa setosa
#> Levels: setosa versicolor virginica

## Test set performance
obs <- response(fo, data = test)
pred <- predict(gbmfit, newdata = test, type = "prob")
modelmetrics(obs, pred)
#>  Accuracy     Kappa     Brier 
#> 0.9200000 0.8793727 0.1502442

Resampling

## Resample estimation of model performance
(res <- resample(fo, data = iris, model = GBMModel, control = CVControl))
#> An object of class "Resamples"
#> 
#> Models: GBMModel
#> 
#> Stratification variable: (strata) 
#> 
#> Resamples control object of class "CVMLControl"
#> 
#> Method: K-Fold Cross-Validation
#> 
#> Folds: 10
#> 
#> Repeats: 1
#> 
#> Survival times: 
#> 
#> Seed: 253452646

summary(res)
#>                Mean    Median         SD          Min       Max NA
#> Accuracy 0.95333333 0.9333333 0.03220306 9.333333e-01 1.0000000  0
#> Kappa    0.93000000 0.9000000 0.04830459 9.000000e-01 1.0000000  0
#> Brier    0.08476969 0.1133233 0.05621648 5.140496e-05 0.1372037  0

plot(res)

Model Metrics

## Default model metrics
modelmetrics(res) %>% summary
#>                Mean    Median         SD          Min       Max NA
#> Accuracy 0.95333333 0.9333333 0.03220306 9.333333e-01 1.0000000  0
#> Kappa    0.93000000 0.9000000 0.04830459 9.000000e-01 1.0000000  0
#> Brier    0.08476969 0.1133233 0.05621648 5.140496e-05 0.1372037  0

## All available metric functions
metricinfo() %>% names
#>  [1] "accuracy"        "brier"           "cindex"         
#>  [4] "cross_entropy"   "f_score"         "kappa2"         
#>  [7] "mae"             "mse"             "npv"            
#> [10] "ppv"             "pr_auc"          "precision"      
#> [13] "r2"              "recall"          "roc_auc"        
#> [16] "roc_index"       "sensitivity"     "specificity"    
#> [19] "weighted_kappa2"

## Metrics available for resample output
metricinfo(res) %>% names
#> [1] "accuracy"      "brier"         "cross_entropy" "kappa2"

## User-specified model metrics
modelmetrics(res, c("accuracy", "kappa2")) %>% summary
#>               Mean    Median         SD       Min Max NA
#> accuracy 0.9533333 0.9333333 0.03220306 0.9333333   1  0
#> kappa2   0.9300000 0.9000000 0.04830459 0.9000000   1  0

Model Tuning

## Tune over a grid of model parameters
gbmtune <- tune(fo, data = iris, model = GBMModel,
                grid = expand.grid(n.trees = c(25, 50, 100),
                                   interaction.depth = 1:3,
                                   n.minobsinnode = c(5, 10)))

plot(gbmtune, type = "line")

## Fit the selected model
gbmtunefit <- fit(fo, data = iris, model = gbmtune)
varimp(gbmtunefit)
#>                 Overall
#> Petal.Length 100.000000
#> Petal.Width   19.311911
#> Sepal.Width    2.500572
#> Sepal.Length   0.000000

Model Comparisons

## Model comparisons
control <- CVControl(folds = 10, repeats = 5)

gbmres <- resample(fo, data = iris, model = GBMModel(n.trees = 50), control = control)
rfres <- resample(fo, data = iris, model = RandomForestModel(ntree = 50), control = control)
nnetres <- resample(fo, data = iris, model = NNetModel(size = 5), control = control)

res <- Resamples(GBM = gbmres, RF = rfres, NNet = nnetres)
summary(res)
#> , , Accuracy
#> 
#>           Mean    Median         SD Min Max NA
#> GBM  0.9440000 0.9333333 0.05447752 0.8   1  0
#> NNet 0.9373333 0.9666667 0.08771217 0.6   1  0
#> RF   0.9520000 0.9333333 0.05720631 0.8   1  0
#> 
#> , , Kappa
#> 
#>       Mean Median         SD Min Max NA
#> GBM  0.916   0.90 0.08171628 0.7   1  0
#> NNet 0.892   0.95 0.15758380 0.4   1  0
#> RF   0.928   0.90 0.08580947 0.7   1  0
#> 
#> , , Brier
#> 
#>            Mean     Median         SD          Min       Max NA
#> GBM  0.08739305 0.06993792 0.08997327 5.611940e-05 0.3655810  0
#> NNet 0.09728523 0.06078362 0.11257530 5.069556e-26 0.3333334  0
#> RF   0.07043733 0.04421333 0.07650326 5.333333e-05 0.3091200  0

plot(res)

## Pairwise model differences and t-tests
perfdiff <- diff(res)
summary(perfdiff)
#> , , Accuracy
#> 
#>                    Mean Median         SD         Min        Max NA
#> GBM - NNet  0.022666667      0 0.12749817 -0.13333333 0.40000000  0
#> GBM - RF   -0.009333333      0 0.03301893 -0.06666667 0.06666667  0
#> NNet - RF  -0.032000000      0 0.11913521 -0.40000000 0.13333333  0
#> 
#> , , Kappa
#> 
#>              Mean Median         SD  Min Max NA
#> GBM - NNet  0.022      0 0.15816550 -0.2 0.5  0
#> GBM - RF   -0.012      0 0.04797959 -0.1 0.1  0
#> NNet - RF  -0.034      0 0.14653829 -0.5 0.2  0
#> 
#> , , Brier
#> 
#>                    Mean        Median         SD         Min        Max NA
#> GBM - NNet -0.009892183  0.0014628834 0.12230934 -0.33301668 0.21469886  0
#> GBM - RF    0.016955715  0.0009175387 0.02937316 -0.04403306 0.09662101  0
#> NNet - RF   0.026847898 -0.0037110334 0.11589380 -0.15451161 0.33221333  0

t.test(perfdiff)
#> An object of class "HTestResamples"
#> 
#> Upper diagonal: mean differences (row - column)
#> Lower diagonal: p-values
#> P-value adjustment method: holm
#> 
#> , , Accuracy
#> 
#>            GBM       NNet           RF
#> GBM         NA 0.02266667 -0.009333333
#> NNet 0.2146777         NA -0.032000000
#> RF   0.1535959 0.15359593           NA
#> 
#> , , Kappa
#> 
#>            GBM      NNet     RF
#> GBM         NA 0.0220000 -0.012
#> NNet 0.3301691        NA -0.034
#> RF   0.2495969 0.2495969     NA
#> 
#> , , Brier
#> 
#>              GBM         NNet         RF
#> GBM           NA -0.009892183 0.01695572
#> NNet 0.570006478           NA 0.02684790
#> RF   0.000493236  0.215623205         NA

plot(perfdiff)

Ensemble Models

## Stacked regression
stackedres <- resample(fo, data = iris, model = StackedModel(GBMModel, RandomForestModel, NNetModel))
summary(stackedres)
#>                Mean     Median         SD         Min       Max NA
#> Accuracy 0.96000000 0.96666667 0.04661373 0.866666667 1.0000000  0
#> Kappa    0.94000000 0.95000000 0.06992059 0.800000000 1.0000000  0
#> Brier    0.06515169 0.06151907 0.06178104 0.005580933 0.1841152  0

## Super learner
superres <- resample(fo, data = iris, model = SuperModel(GBMModel, RandomForestModel, NNetModel))
summary(superres)
#>                Mean    Median         SD          Min      Max NA
#> Accuracy 0.94666667 0.9333333 0.04216370 0.8666666667 1.000000  0
#> Kappa    0.92000000 0.9000000 0.06324555 0.8000000000 1.000000  0
#> Brier    0.09841853 0.1106877 0.08111538 0.0000110332 0.259555  0

Calibration Curves

cal <- calibration(res)
plot(cal, se = TRUE)

Confusion Matrices

(conf <- confusion(gbmres, cutoff = NULL))
#> GBMModel :
#>             Observed
#> Predicted          setosa   versicolor    virginica
#>   setosa     249.23098573   0.24569404   0.09285259
#>   versicolor   0.75679330 230.60018692  25.22138229
#>   virginica    0.01222097  19.15411904 224.68576512

summary(conf)
#> GBMModel :
#> Number of responses: 750
#> Accuracy (SE): 0.9393559 (0.008715226)
#> Majority class: 0.3333333
#> Kappa: 0.9090339
#> 
#>                setosa versicolor virginica
#> Observed    0.3333333  0.3333333 0.3333333
#> Predicted   0.3327594  0.3421045 0.3251361
#> Agreement   0.3323080  0.3074669 0.2995810
#> Sensitivity 0.9969239  0.9224007 0.8987431
#> Specificity 0.9993229  0.9480436 0.9616673
#> PPV         0.9986435  0.8987515 0.9214018
#> NPV         0.9984633  0.9606831 0.9499865

plot(conf)

Partial Dependence Plots

pd <- dependence(gbmfit, select = c(Petal.Length, Petal.Width))
plot(pd)

Lift Curves

## Requires a binary outcome
fo_versicolor <- factor(Species == "versicolor") ~ .
control <- CVControl()

gbmres_versicolor <- resample(fo_versicolor, data = iris,  model = GBMModel, control = control)
lf <- lift(gbmres_versicolor)
plot(lf)

rfres_versicolor <- resample(fo_versicolor, data = iris,  model = RandomForestModel, control = control)
nnetres_versicolor <- resample(fo_versicolor, data = iris,  model = NNetModel, control = control)

res_versicolor <- Resamples(gbmres_versicolor, rfres_versicolor, nnetres_versicolor)
lf <- lift(res_versicolor)
plot(lf, find = 75)

Preprocessing Recipes

library(recipes)

rec <- recipe(fo, data = iris) %>%
  add_role(Species, new_role = "case_strata") %>%
  step_center(all_predictors()) %>%
  step_scale(all_predictors()) %>%
  step_pca(all_predictors())

fit_rec <- fit(rec, model = GBMModel)
varimp(fit_rec)
#>        Overall
#> PC1 100.000000
#> PC3   5.734085
#> PC2   1.780191
#> PC4   0.000000

res_rec <- resample(rec, model = GBMModel, control = CVControl)
summary(res_rec)
#>               Mean     Median         SD         Min       Max NA
#> Accuracy 0.9400000 0.93333333 0.06629526 0.800000000 1.0000000  0
#> Kappa    0.9100000 0.90000000 0.09944289 0.700000000 1.0000000  0
#> Brier    0.0806202 0.05286238 0.08717373 0.001682667 0.2483078  0

zhaoxiaohe/MachineShop