chjackson/flexsurv

`sim.fmsm()` different results with `tidy=TRUE` vs `tidy=FALSE`

Opened this issue · 2 comments

mafed commented

The MRE below shows that only uncensored paths are returned when tidy=TRUE.
Is this on purpose @chjackson? I couldn't find reference to this behaviour in the help page.
I think this is created by simfs_bytrans which is called when tidy=TRUE.

library(mstate)
library(flexsurv)

## take data from the 'mstate' tutorial
data("ebmt3")
tmat <- trans.illdeath(names = c("Tx", "PR", "RelDeath"))
covs <- c("dissub", "age", "drmatch", "tcd", "prtime")
msbmt <- msprep(time = c(NA, "prtime", "rfstime"), 
                status = c(NA, "prstat", "rfsstat"), 
                data = ebmt3, 
                trans = tmat, 
                keep = covs)
msbmt <- expand.covs(msbmt, covs, append = TRUE, longnames = FALSE)

## add time-varying covariate for PH transition, ref page 8 of tutorial
msbmt$pr <- msbmt$trans == 3
msbmt$to <- as.factor(msbmt$to)
msbmt$trans <- as.factor(msbmt$trans)
table(msbmt$trans)
table(msbmt$to)

## fit weibull model with 'same' formula
flex_clock_reset <- flexsurvreg(
  Surv(time, status) ~ 
    dissub1.1 + dissub2.1 +
    age1.1 +    age2.1 + drmatch.1 + tcd.1 + dissub1.2 + dissub2.2 +
    age1.2 +    age2.2 + drmatch.2 + tcd.2 + dissub1.3 + dissub2.3 +
    age1.3 +    age2.3 + drmatch.3 + tcd.3 + 
    trans + shape(to), 
  data = msbmt, dist = "weibull")
flex_clock_reset

## preparing newdata
newd <- data.frame(dissub = rep(0, 3), 
                   age = rep(0, 3), 
                   drmatch = rep(0, 3), 
                   tcd = rep(0, 3), 
                   trans = as.factor(1:3))
newd$dissub <- factor(newd$dissub, 
                      levels = 0:2, 
                      labels = levels(ebmt3$dissub))
newd$age <- factor(newd$age, 
                   levels = 0:2, 
                   labels = levels(ebmt3$age))
newd$drmatch <- factor(newd$drmatch, 
                       levels = 0:1, 
                       labels = levels(ebmt3$drmatch))
newd$tcd <- factor(newd$tcd, 
                   levels = 0:1, 
                   labels = levels(ebmt3$tcd))
attr(newd, "trans") <- tmat
class(newd) <- c("msdata", "data.frame")
newd <- expand.covs(newd, covs[1:4], longnames = FALSE)

newd$to     <- as.factor(c(2, 3, 3))    # same shape for trans 2 and 3

set.seed(202306)
invisible(rnorm(1e3))
sim_paths_messy <- 
  sim.fmsm(flex_clock_reset,
           t = 3, 
           newdata = newd,
           M = 10,
           tvar = "trans",
           trans = tmat, 
           tidy = FALSE)
nrow(sim_paths_messy$t)

set.seed(202306)
invisible(rnorm(1e3))
sim_paths_tidy <- 
  sim.fmsm(flex_clock_reset,
           t = 3, 
           newdata = newd,
           M = 10,
           tvar = "trans",
           trans = tmat, 
           tidy = TRUE)
nrow(sim_paths_tidy)

Thanks for this report - yes, one would expect there to be equivalent information in the messy and tidy results. For the moment I will at least fix the documentation to explain that the tidy results exclude the censoring times, but I will leave the issue open, because ideally I think they should be included somehow.

Though to implement that, I'm not sure whether end should equal start on these occasions, or end=NA, or to have multiple rows for each censoring time, corresponding to censored times to each potential competing event, with a corresponding status indicator that is 1 for an event time and 0 for a censoring time. The latter would be useful if you wanted to fit a multi-state model to the simulated data.

mafed commented

This in the meantime works for my specific case, but it just set the end = NA and I didn't extensively debugged it.
Though I like your last proposal better and will work on it.

tidy_sim.fmsm <- function(simfs) {
  trans <- attr(simfs, "trans")
  start <- which(apply(trans, 1, function(x) any(!is.na(x))))
  res <- NULL
  id_vector <- seq_len(nrow(simfs$st))
  
  ## for each starting state
  for (i in seq_along(start)) {
    subject_i <- NULL
    end_i <- which(!is.na(trans[start[i], ]))
    
    ## for each next state
    for (j in seq_len(ncol(simfs$st) - 1)) {
      ## break down the following conditions
      match_start <- simfs$st[, j] == start[i]
      match_end <- simfs$st[, j + 1] %in% end_i
      same_end <- simfs$st[, j + 1] == start[i]
      matching_subjects <- match_start & match_end
      censored_subjects <- match_start & same_end
      
      if (any(matching_subjects)) {
        subject_j <- 
          data.frame(id = id_vector[which(matching_subjects)],
                     end = simfs$st[matching_subjects, j + 1],
                     time = simfs$t[matching_subjects, j + 1],
                     delay = simfs$t[matching_subjects, j + 1] - 
                       simfs$t[matching_subjects, j],
                     status = 1L)
        subject_i <- rbind(subject_i, subject_j)
      }# END - if: matching_subjects
      
      if (any(censored_subjects)) {
        cens_j <- 
          data.frame(id = id_vector[which(censored_subjects)],
                     end  = simfs$st[censored_subjects, j + 1],
                     time  = simfs$t[censored_subjects, j + 1],
                     delay = simfs$t[censored_subjects, j + 1] - 
                       simfs$t[censored_subjects, j],
                     status = 0L)
        subject_i <- rbind(subject_i, cens_j)
      }# END - if: censored_subjects
      
    }# END - for: j
    
    if (nrow(subject_i) > 0) {
      subject_i$start <- start[i]
      res <- rbind(res, subject_i)
      simfs$st <- simfs$st[!censored_subjects, ]
      simfs$t <- simfs$t[!censored_subjects, ]
      id_vector <- id_vector[!censored_subjects]
    }# END - if
  }# END - for: i
  
  ## remove subjects already censored one step before
  res <- res[res$delay > 0, ]
  
  res$start <- as.character(res$start)
  res$end <- as.character(res$end)
  which_censored <- which(res$start == res$end)
  res$end[which_censored] <- NA_character_
  res$trans <- paste(res$start, res$end, sep = " - ")
  res <- res[, c("id", "start", "end", "trans", 
                 "time", "delay", "status")]
  attr(res, "trans") <- trans
  attr(res, "statenames") <- colnames(trans)
  
  res <- res[order(res$id, res$start), ]
  rownames(res) <- seq_len(nrow(res))
  res
}# END - tidy_sim.fmsm