sahirbhatnagar/casebase

Formatting casebase output to be consistent

Closed this issue · 5 comments

Here's a reproducible (as minimal as possible) example. The problem here is:

  • using logistic regression, i can pass new data using all columns.
  • using regularized logistic regression, I need to only pass the co-variate columns.

Going one at a time, Score should not cause any issues. Going all at once, however, we run into the issue of dimensions.

a formula interface may be beneficial here. That way, both regularized logistic regression and logistic regression can be compared.

Or, we have a predictRisk.penalizedSingleRisk function that processes the full test dataset? I think this would be the easiest solution since its all downstream, and only changes an additional function.

# Simulate censored survival data for two outcome types from exponential distributions
library(data.table)
#> Warning: package 'data.table' was built under R version 3.6.2
library(casebase)
#> See example usage at http://sahirbhatnagar.com/casebase/
library(survival)
#> Warning: package 'survival' was built under R version 3.6.2
library(riskRegression)
#> riskRegression version 2020.02.05
library(prodlim)
#> Warning: package 'prodlim' was built under R version 3.6.3
# event type 0-censored, 1-event of interest, 2-competing event
# t observed time/endpoint
# X1:X5 are binary covariates
# X6:X10 are continuous covariates
nobs <- 200
set.seed(12345)

### Using the same data, make it single event
### This way, we can debug using the faster prediction algorithm!

## DT <- sampleData(nobs, outcome = "competing.risk")
DT <- sampleData(nobs, outcome = "survival")
tlim <- 12

#Censoring
DT[time >= tlim, `:=`("event" = 0, "time" = tlim)]





### the call for prediction between single and competing risk is the same.
### for the sake of simplicity I'll be using the single event (its also faster for now)
predictRisk.singleRisk <- function(object, newdata, times, cause, ...) {
    

  # if (missing(cause)) stop("Argument cause should be the event type for which we predict the absolute risk.")
  # the output of absoluteRisk is an array with dimension dependening on the length of the requested times:
  # case 1: the number of time points is 1
  #         dim(array) =  (length(time), NROW(newdata), number of causes in the data)
  if (length(times) == 1) {
    a <- absoluteRisk(object, newdata = newdata, time = times)
    p <- matrix(a, ncol = 1)
  } else {
    # case 2 a) zero is included in the number of time points
    if (0 %in% times) {
      # dim(array) =  (length(time)+1, NROW(newdata)+1, number of causes in the data)
      a <- absoluteRisk(object, newdata = newdata, time = times)
      ### we need to invert the plot because, by default, we get cumulative incidence
      a[, -c(1)] <- 1 - a[, -c(1)]
      ### we remove time 0 predictions for everyone, and remove the time column
      a <- a[-c(1), -c(1)]
      # now we transpose the matrix because in riskRegression we work with number of
      # observations in rows and time points in columns
      p <- t(a)
    } else {
      # case 2 b) zero is not included in the number of time points (but the absoluteRisk function adds it)
      a <- absoluteRisk(object, newdata = newdata, time = times)
      ### we need to invert the plot because, by default, we get cumulative incidence
      a[, -c(1)] <- 1 - a[, -c(1)]
      ### we remove time 0 for everyone, and remove the time column
      a <- a[-c(1), -c(1)] ### a[-c(1), ] to keep times column, but remove time 0 probabilities
      # now we transpose the matrix because in riskRegression we work with number of
      # observations in rows and time points in columns
      p <- t(a)
    }
  }
  if (NROW(p) != NROW(newdata) || NCOL(p) != length(times)) {
    stop(paste("\nPrediction matrix has wrong dimensions:\nRequested newdata x times: ", NROW(newdata), " x ", length(times), "\nProvided prediction matrix: ", NROW(p), " x ", NCOL(p), "\n\n", sep = ""))
  }
  p
}


times <- c(2, 5, 8)

#logistic regression
simple_linear <- fitSmoothHazard(event ~ time + X1+X2+X3, DT)
#> 'time' will be used as the time variable
class(simple_linear) <- c("singleRisk",class(simple_linear))
pred_simple_linear <- predictRisk.singleRisk(object = simple_linear, times = times, newdata = DT, cause = 1)



#penalized logistic regression
# x and y will be used to fit the casebase model under glmnet.
y <- data.matrix(DT[, c(14, 13)])
x <- data.matrix(DT[, c(1,2,3)])
newData <- data.matrix(DT[, c(1,2,3)])
regularizedCB <- casebase::fitSmoothHazard.fit(x, y, family = "glmnet", time = "time", event = "event", alpha = 1, ratio = 10, standardize = TRUE)
class(regularizedCB) <- c("singleRisk",class(regularizedCB))
pred_Reg_CB<-predictRisk.singleRisk(object = regularizedCB, times = times, newdata =newData, cause = 1)

# compare with cause-specific Cox regression
# Fit a cox model
coxfit <- coxph(Surv(time, event != 0) ~ X1+X6+X7, data = DT, x = TRUE)
# Get Cox survivals
coxPred <- summary(survfit(coxfit, newdata = DT), times = times)$surv





# compare prediction performance on a test set
set.seed(8)
testDT <- sampleData(130, outcome = "survival")

results <- Score(list("logistic" = simple_linear, "PenalizedLogistic" = regularizedCB, "Cox" = coxfit),
  data = testDT, # data for evaluation
  formula = Hist(time, event) ~ 1, # used to communicate the outcome variables
  # and which variables affect censoring
  cause = 1, # cause of interest
  times = 5
) # evaluation time point
#> Error in cbind2(1, newx) %*% nbeta: Cholmod error 'X and/or Y have wrong dimensions' at file ../MatrixOps/cholmod_sdmult.c, line 90

results
#> Error in eval(expr, envir, enclos): object 'results' not found

sessionInfo()
#> R version 3.6.1 (2019-07-05)
#> Platform: x86_64-w64-mingw32/x64 (64-bit)
#> Running under: Windows 10 x64 (build 18363)
#> 
#> Matrix products: default
#> 
#> locale:
#> [1] LC_COLLATE=English_United States.1252 
#> [2] LC_CTYPE=English_United States.1252   
#> [3] LC_MONETARY=English_United States.1252
#> [4] LC_NUMERIC=C                          
#> [5] LC_TIME=English_United States.1252    
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#> [1] prodlim_2019.11.13        riskRegression_2020.02.05
#> [3] survival_3.1-8            casebase_0.2.1.9001      
#> [5] data.table_1.12.8        
#> 
#> loaded via a namespace (and not attached):
#>  [1] Rcpp_1.0.3          mvtnorm_1.1-0       lattice_0.20-38    
#>  [4] zoo_1.8-7           png_0.1-7           glmnet_3.0-2       
#>  [7] assertthat_0.2.1    digest_0.6.25       foreach_1.4.8      
#> [10] R6_2.4.1            backports_1.1.5     acepack_1.4.1      
#> [13] MatrixModels_0.4-1  stats4_3.6.1        evaluate_0.14      
#> [16] ggplot2_3.3.0       highr_0.8           pillar_1.4.3       
#> [19] rlang_0.4.5         multcomp_1.4-12     rstudioapi_0.11    
#> [22] SparseM_1.78        rpart_4.1-15        Matrix_1.2-17      
#> [25] checkmate_2.0.0     rmarkdown_2.1       splines_3.6.1      
#> [28] stringr_1.4.0       foreign_0.8-71      htmlwidgets_1.5.1  
#> [31] munsell_0.5.0       numDeriv_2016.8-1.1 compiler_3.6.1     
#> [34] xfun_0.12           pkgconfig_2.0.3     base64enc_0.1-3    
#> [37] shape_1.4.4         mgcv_1.8-28         htmltools_0.4.0    
#> [40] nnet_7.3-12         tidyselect_1.0.0    tibble_2.1.3       
#> [43] gridExtra_2.3       htmlTable_1.13.3    Hmisc_4.3-1        
#> [46] rms_5.1-4           codetools_0.2-16    crayon_1.3.4       
#> [49] dplyr_0.8.3         timereg_1.9.4       MASS_7.3-51.4      
#> [52] grid_3.6.1          polspline_1.1.17    nlme_3.1-140       
#> [55] gtable_0.3.0        lifecycle_0.2.0     magrittr_1.5       
#> [58] scales_1.1.0        stringi_1.4.6       latticeExtra_0.6-29
#> [61] sandwich_2.5-1      TH.data_1.0-10      Formula_1.2-3      
#> [64] lava_1.6.7          RColorBrewer_1.1-2  iterators_1.0.12   
#> [67] tools_3.6.1         cmprsk_2.2-9        glue_1.3.1         
#> [70] purrr_0.3.3         jpeg_0.1-8.1        yaml_2.2.1         
#> [73] colorspace_1.4-1    cluster_2.1.0       VGAM_1.1-2         
#> [76] knitr_1.28          quantreg_5.54

Created on 2020-03-27 by the reprex package (v0.3.0)

Here's what I understand we need to do in casebase:

  • Add a class (e.g. CBsingleEvent or singleRisk) to the output of fitSmoothHazard and fitSmoothHazard.fit.

  • Add two new parameters to absoluteRisk: one that controls whether we add zero and order the times (which should be TRUE by default) and one that controls whether the output is on the CI or the survival scale.

I think the issue with passing one vs. all columns in penalized regression should be handled by the function predictRisk.singleRisk, which will be part of the riskRegression package. We do some of this gymnastic in the casebase package, but it's intrinsically a consequence of how glmnet works.

And to show that it's at its heart an issue with glmnet:

library(glmnet)
#> Loading required package: Matrix
#> Loaded glmnet 3.0-2
x=matrix(rnorm(100*20),100,20)
y=rnorm(100)
fit1=glmnet(x,y)
# Repeat first column of x
predict(fit1,newx=x[1:5,c(1,1:20)],s=c(0.01,0.005))
#> Error in cbind2(1, newx) %*% nbeta: Cholmod error 'X and/or Y have wrong dimensions' at file ../MatrixOps/cholmod_sdmult.c, line 90

Created on 2020-03-27 by the reprex package (v0.3.0)

How should we let predictRisk.singleRisk know whether we used glm or glmnet? Or just make two predictRisk functions?

absoluteRisk knows how to handle glm and glmnet differently, and it gives an output in the same format regardless. Which is what predictRisk.singleRisk expects.

The function fitSmoothHazard.fit was created to allow a matrix interface. The trade-off is that absoluteRisk also needs to be given a matrix when using it with the output of fitSmoothHazard.fit. This is consistent with how glmnet works.

I guess there are a few different ways to address this issue, but the main point is we need a way to transform the data.frame that Score or predictRisk receives and turn it into a matrix that absoluteRisk uses. Then everything else should work smoothly

I made the distinction in predictRisk, and it seems to be working! I can pass both predict.glmnet() formatted data and unprocessed data to the predictRisk.singleRisk function.

These are still remaining to be done:

Here's what I understand we need to do in casebase:

* [ ]  Add a class (e.g. `CBsingleEvent` or `singleRisk`) to the output of `fitSmoothHazard` and `fitSmoothHazard.fit`.

* [ ]  Add two new parameters to `absoluteRisk`: one that controls whether we add zero and order the times (which should be `TRUE` by default) and one that controls whether the output is on the CI or the survival scale.

I think the issue with passing one vs. all columns in penalized regression should be handled by the function predictRisk.singleRisk, which will be part of the riskRegression package. We do some of this gymnastic in the casebase package, but it's intrinsically a consequence of how glmnet works.

# Simulate censored survival data for two outcome types from exponential distributions
library(data.table)
#> Warning: package 'data.table' was built under R version 3.6.2
library(casebase)
#> See example usage at http://sahirbhatnagar.com/casebase/
library(survival)
#> Warning: package 'survival' was built under R version 3.6.2
library(riskRegression)
#> riskRegression version 2020.02.05
library(prodlim)
#> Warning: package 'prodlim' was built under R version 3.6.3
# event type 0-censored, 1-event of interest, 2-competing event
# t observed time/endpoint
# X1:X5 are binary covariates
# X6:X10 are continuous covariates
nobs <- 200
set.seed(12345)

### Using the same data, make it single event
### This way, we can debug using the faster prediction algorithm!

## DT <- sampleData(nobs, outcome = "competing.risk")
DT <- sampleData(nobs, outcome = "survival")
tlim <- 12

#Censoring
DT[time >= tlim, `:=`("event" = 0, "time" = tlim)]





### the call for prediction between single and competing risk is the same.
### for the sake of simplicity I'll be using the single event (its also faster for now)
predictRisk.singleRisk <- function(object, newdata, times, cause, ...) {
  if (object$call[[1]]!="glm"){
    #get all covariates excluding intercept and time
    coVars=colnames(object$originalData$x)
    #coVars is used in lines 44 and 50
    newdata=data.matrix(drop(subset(newdata, select=coVars)))
  }

  # if (missing(cause)) stop("Argument cause should be the event type for which we predict the absolute risk.")
  # the output of absoluteRisk is an array with dimension dependening on the length of the requested times:
  # case 1: the number of time points is 1
  #         dim(array) =  (length(time), NROW(newdata), number of causes in the data)
  if (length(times) == 1) {
    a <- absoluteRisk(object, newdata = newdata, time = times)
    p <- matrix(a, ncol = 1)
  } else {
    # case 2 a) zero is included in the number of time points
    if (0 %in% times) {
      # dim(array) =  (length(time)+1, NROW(newdata)+1, number of causes in the data)
      a <- absoluteRisk(object, newdata = newdata, time = times)
      p <- t(a)
    } else {
      # case 2 b) zero is not included in the number of time points (but the absoluteRisk function adds it)
      a <- absoluteRisk(object, newdata = newdata, time = times)
      ### we need to invert the plot because, by default, we get cumulative incidence
      a[, -c(1)] <- 1 - a[, -c(1)]
      ### we remove time 0 for everyone, and remove the time column
      a <- a[-c(1), -c(1)] ### a[-c(1), ] to keep times column, but remove time 0 probabilities
      # now we transpose the matrix because in riskRegression we work with number of
      # observations in rows and time points in columns
      p <- t(a)
    }
  }
  if (NROW(p) != NROW(newdata) || NCOL(p) != length(times)) {
    stop(paste("\nPrediction matrix has wrong dimensions:\nRequested newdata x times: ", NROW(newdata), " x ", length(times), "\nProvided prediction matrix: ", NROW(p), " x ", NCOL(p), "\n\n", sep = ""))
  }
  p
}


times <- c(2, 5, 8)

#logistic regression
simple_linear <- fitSmoothHazard(event ~ time + X1+X2+X3, DT)
#> 'time' will be used as the time variable
class(simple_linear) <- c("singleRisk",class(simple_linear))
pred_simple_linear <- predictRisk.singleRisk(object = simple_linear, times = times, newdata = DT, cause = 1)



#penalized logistic regression
# x and y will be used to fit the casebase model under glmnet.
y <- data.matrix(DT[, c(14, 13)])
x <- data.matrix(DT[, c(1,2,3)])
regularizedCB <- casebase::fitSmoothHazard.fit(x, y, family = "glmnet", time = "time", event = "event", alpha = 1, ratio = 10, standardize = TRUE)
class(regularizedCB) <- c("singleRisk",class(regularizedCB))
pred_Reg_CB<-predictRisk.singleRisk(object = regularizedCB, times = times, newdata =DT, cause = 1)

pred_Reg_CB<-predictRisk.singleRisk(object = regularizedCB, times = times, newdata =x, cause = 1)


# compare with cause-specific Cox regression
# Fit a cox model
coxfit <- coxph(Surv(time, event != 0) ~ X1+X6+X7, data = DT, x = TRUE)
# Get Cox survivals
coxPred <- summary(survfit(coxfit, newdata = DT), times = times)$surv



# compare prediction performance on a test set
set.seed(8)
testDT <- sampleData(130, outcome = "survival")

results <- Score(list("Logist" = simple_linear, "Penal.Logist" = regularizedCB, "Cox" = coxfit),
  data = testDT, # data for evaluation
  formula = Hist(time, event) ~ 1, # used to communicate the outcome variables
  # and which variables affect censoring
  cause = 1, # cause of interest
  times = 5
) # evaluation time point

results
#> 
#> Metric AUC:
#> 
#> Results by model:
#> 
#>           model times  AUC lower upper
#> 1:       Logist     5 62.6  52.8  72.5
#> 2: Penal.Logist     5 79.2  70.7  87.7
#> 3:          Cox     5 86.7  79.8  93.6
#> 
#> Results of model comparisons:
#> 
#>    times        model    reference delta.AUC lower upper     p
#> 1:     5 Penal.Logist       Logist      16.5   3.4  29.7 1e-02
#> 2:     5          Cox       Logist      24.1  13.1  35.1 2e-05
#> 3:     5          Cox Penal.Logist       7.5   2.0  13.1 8e-03
#> 
#> NOTE: Values are multiplied by 100 and given in % (use print(...,percent=FALSE) to avoid this.
#> 
#> NOTE: The higher AUC the better.
#> 
#> Metric Brier:
#> 
#> Results by model:
#> 
#>           model times Brier lower upper
#> 1:   Null model     5  24.2  22.6  25.8
#> 2:       Logist     5  23.1  20.3  25.9
#> 3: Penal.Logist     5  19.7  17.7  21.8
#> 4:          Cox     5  13.6  10.5  16.8
#> 
#> Results of model comparisons:
#> 
#>    times        model    reference delta.Brier lower upper            p
#> 1:     5       Logist   Null model        -1.1  -3.2   1.0 3.087592e-01
#> 2:     5 Penal.Logist   Null model        -4.4  -6.0  -2.8 5.555318e-08
#> 3:     5          Cox   Null model       -10.5 -13.7  -7.4 1.003926e-10
#> 4:     5 Penal.Logist       Logist        -3.3  -6.1  -0.6 1.695679e-02
#> 5:     5          Cox       Logist        -9.5 -12.9  -6.0 1.058649e-07
#> 6:     5          Cox Penal.Logist        -6.1  -8.4  -3.9 9.330403e-08
#> 
#> NOTE: Values are multiplied by 100 and given in % (use print(...,percent=FALSE) to avoid this.
#> 
#> NOTE: The lower Brier the better.

Created on 2020-03-27 by the reprex package (v0.3.0)