HannaMeyer/CAST

Classification theshold

Closed this issue · 1 comments

Hello,

I've been reading through the documentation for the ffs function, and I haven't been able to figure out a way to change what the threshold is for classifying predicted values. Am I missing something, or is this a missing feature? Perhaps the issue stems more from the caret package, but it is with the ffs function that I run into issues.

Here is an example of problem I'm having:

library(caret)
library(CAST)

test_data <- structure(list(presence = c("no", "yes", "no", "no", "no", "no", 
"yes", "no", "no", "no", "no", "no", "no", "yes", "no", "no", 
"no", "no", "no", "no", "yes", "no", "no", "yes", "yes", "yes", 
"no", "no", "no", "no", "yes", "no", "no", "no", "no", "no", 
"no", "no", "no", "no", "no", "no", "no", "no", "no", "yes", 
"no", "no", "no", "no", "yes", "no", "no", "yes", "no", "no", 
"yes", "no", "no", "no", "no", "no", "no", "no", "no", "no", 
"no", "no", "no", "no", "no", "no", "no", "no", "no", "no", "no", 
"no", "no", "no", "yes", "no", "yes", "no", "no", "no", "no", 
"no", "yes", "no", "no", "no", "no", "no", "yes", "no", "no", 
"no", "no", "no"), annual_precip = c(153L, 200L, 235L, 281L, 
296L, 200L, 130L, 127L, 294L, 169L, 221L, 242L, 105L, 173L, 420L, 
212L, 116L, 252L, 153L, 167L, 243L, 186L, 412L, 179L, 237L, 107L, 
147L, 231L, 157L, 286L, 185L, 154L, 176L, 205L, 84L, 209L, 87L, 
247L, 380L, 146L, 218L, 119L, 420L, 420L, 200L, 195L, 199L, 411L, 
419L, 188L, 127L, 156L, 108L, 195L, 183L, 397L, 152L, 122L, 148L, 
152L, 219L, 159L, 152L, 107L, 367L, 393L, 115L, 252L, 241L, 169L, 
297L, 310L, 199L, 147L, 142L, 226L, 118L, 289L, 246L, 237L, 153L, 
113L, 203L, 220L, 76L, 101L, 346L, 133L, 154L, 305L, 156L, 233L, 
442L, 130L, 125L, 127L, 117L, 199L, 211L, 109L), precip_wettest_Q = c(74L, 
92L, 118L, 125L, 130L, 85L, 69L, 61L, 142L, 84L, 104L, 104L, 
54L, 84L, 183L, 101L, 56L, 125L, 74L, 75L, 112L, 91L, 175L, 94L, 
102L, 58L, 75L, 102L, 87L, 125L, 91L, 76L, 86L, 115L, 46L, 100L, 
45L, 123L, 156L, 69L, 103L, 66L, 172L, 183L, 85L, 100L, 85L, 
169L, 177L, 92L, 70L, 75L, 53L, 95L, 88L, 167L, 74L, 66L, 71L, 
75L, 104L, 78L, 75L, 62L, 153L, 162L, 70L, 124L, 121L, 82L, 140L, 
143L, 84L, 75L, 70L, 113L, 65L, 127L, 122L, 108L, 75L, 61L, 94L, 
116L, 39L, 52L, 136L, 66L, 75L, 129L, 76L, 117L, 183L, 69L, 64L, 
63L, 65L, 92L, 94L, 55L), mean_diurnal_range = c(7L, 7L, 7L, 
9L, 5L, 7L, 7L, 6L, 7L, 7L, 7L, 6L, 6L, 7L, 6L, 8L, 6L, 8L, 7L, 
7L, 6L, 7L, 5L, 7L, 6L, 6L, 7L, 7L, 6L, 9L, 7L, 6L, 8L, 6L, 5L, 
7L, 6L, 7L, 4L, 6L, 7L, 7L, 4L, 6L, 7L, 7L, 8L, 4L, 5L, 7L, 7L, 
7L, 6L, 7L, 7L, 5L, 6L, 6L, 7L, 6L, 6L, 7L, 6L, 5L, 4L, 4L, 6L, 
7L, 8L, 7L, 7L, 7L, 8L, 7L, 7L, 6L, 7L, 8L, 8L, 7L, 7L, 6L, 7L, 
6L, 5L, 6L, 4L, 6L, 7L, 6L, 6L, 7L, 4L, 7L, 6L, 7L, 6L, 7L, 7L, 
6L), isothermality = c(14L, 13L, 14L, 17L, 11L, 16L, 13L, 14L, 
15L, 14L, 14L, 14L, 14L, 14L, 13L, 16L, 14L, 15L, 14L, 16L, 14L, 
14L, 12L, 14L, 13L, 14L, 15L, 14L, 14L, 17L, 14L, 13L, 15L, 15L, 
13L, 16L, 13L, 14L, 10L, 13L, 14L, 15L, 11L, 13L, 16L, 15L, 16L, 
10L, 11L, 14L, 14L, 14L, 14L, 14L, 14L, 11L, 13L, 14L, 14L, 13L, 
14L, 14L, 13L, 12L, 10L, 11L, 12L, 14L, 15L, 14L, 14L, 14L, 16L, 
15L, 14L, 13L, 14L, 16L, 15L, 14L, 14L, 14L, 13L, 14L, 13L, 13L, 
11L, 14L, 14L, 14L, 13L, 14L, 11L, 15L, 13L, 14L, 13L, 13L, 14L, 
14L)), row.names = c(NA, -100L), class = c("tbl_df", "tbl", "data.frame"
))

If I create a model using all 4 predictors variables, the Kappa statistic calculated by the train function is 0.08

set.seed(2354)

model <- train(presence ~ .,
            trControl = trControlCon, 
            method = 'glm', 
            family = 'binomial', 
            metric = 'Kappa',  
            data = test_data
)

Generalized Linear Model

100 samples
4 predictor
2 classes: 'no', 'yes'

No pre-processing
Resampling: Cross-Validated (3 fold)
Summary of sample sizes: 66, 67, 67
Resampling results:

Accuracy Kappa
0.8404635 0.08159167

However, this is based on a default threshold of 0.5. According the calculations below however, the threshold that would maximize Kappa is 0.2.

dt <- model$pred[,c("rowIndex", "obs", "yes")] %>%
  arrange(rowIndex) %>%
  mutate(obs = ifelse(obs == "yes", TRUE, FALSE),
         rowIndex = as.character(rowIndex))

ths <-  optimal.thresholds(dt, opt.methods	= "MaxKappa")

Method yes
MaxKappa 0.2

If I calculate Kappa using this threshold, I estimate a much higher Kappa statistic of 0.3.

cmx_test <- cmx(dt, ths$yes[1])

Kappa(cmx_test)

Kappa Kappa.sd
0.2412141 0.1228811

This becomes an issues when I try to use the ffs function, because I am running into many instances where when I'm using Kappa as the metric for variable selection, all of the 2 and 3 variable combinations have a Kappa statistic of 0 or less, so the algorithm stops. When this happens, all of the predicted values are returned as "no", because the probability of "yes" is less than 0.5 for all of the observations. However, if the threshold had been 0.2 instead of 0.5, I suspect the Kappa value would have varied more between predictor combinations, and likely more variables would be selected.

FF <- ffs(predictors = test_data[,2:5], 
             response = test_data$presence,
             trControl = trainControl(method = 'cv', number = 3, classProbs = TRUE,  
                                      savePredictions = TRUE),
             minVar = 2,
             method = 'glm', 
             family = 'binomial',
             metric = "Kappa"
          
  )

FF$perf_all
var1 var2 var3 Kappa SE nvar
1 annual_precip precip_wettest_Q 0.00000000 0.00000000 2
2 annual_precip mean_diurnal_range 0.00000000 0.00000000 2
3 annual_precip isothermality 0.00000000 0.00000000 2
4 precip_wettest_Q mean_diurnal_range -0.01587302 0.01587302 2
5 precip_wettest_Q isothermality 0.00000000 0.00000000 2
6 mean_diurnal_range isothermality 0.00000000 0.00000000 2
7 annual_precip precip_wettest_Q mean_diurnal_range -0.01587302 0.01587302 3
8 annual_precip precip_wettest_Q isothermality 0.00000000 0.00000000 3

Do you have any suggestions as to how I could customize the threshold used to calculate the classification metrics for ffs to avoid this issue?

After discovering this thread (topepo/caret#224), it seems that a custom function is the only answer. Here is an example of a custom function that can address this issue:

library(PresenceAbsence)

kappa_custom <- function(data, lev = NULL, model = NULL) {
  
  dt <- data[,c("rowIndex", "obs", "yes")] %>%
    arrange(rowIndex) %>%
    mutate(obs = ifelse(obs == "yes", TRUE, FALSE),
           rowIndex = as.character(rowIndex))
  
  
  ths <-  optimal.thresholds(dt, opt.methods	= "MaxKappa")
  
  
  cmx_test <- cmx(dt, ths$yes[1])
  
  
  k <- Kappa(cmx_test)
  
  c(Kappa_Custom = k[,1])
  
}




FF2 <- ffs(predictors = test_data[,2:5], 
           response = test_data$presence,
           trControl = trainControl(method = 'cv', number = 3, classProbs = TRUE,  
                                    savePredictions = TRUE, summaryFunction = kappa_custom),
           minVar = 2,
           method = 'glm', 
           family = 'binomial',
           metric = "Kappa_Custom"
           
)