`sim.fmsm()` different results with `tidy=TRUE` vs `tidy=FALSE`
Opened this issue · 2 comments
The MRE below shows that only uncensored paths are returned when tidy=TRUE
.
Is this on purpose @chjackson? I couldn't find reference to this behaviour in the help page.
I think this is created by simfs_bytrans
which is called when tidy=TRUE
.
library(mstate)
library(flexsurv)
## take data from the 'mstate' tutorial
data("ebmt3")
tmat <- trans.illdeath(names = c("Tx", "PR", "RelDeath"))
covs <- c("dissub", "age", "drmatch", "tcd", "prtime")
msbmt <- msprep(time = c(NA, "prtime", "rfstime"),
status = c(NA, "prstat", "rfsstat"),
data = ebmt3,
trans = tmat,
keep = covs)
msbmt <- expand.covs(msbmt, covs, append = TRUE, longnames = FALSE)
## add time-varying covariate for PH transition, ref page 8 of tutorial
msbmt$pr <- msbmt$trans == 3
msbmt$to <- as.factor(msbmt$to)
msbmt$trans <- as.factor(msbmt$trans)
table(msbmt$trans)
table(msbmt$to)
## fit weibull model with 'same' formula
flex_clock_reset <- flexsurvreg(
Surv(time, status) ~
dissub1.1 + dissub2.1 +
age1.1 + age2.1 + drmatch.1 + tcd.1 + dissub1.2 + dissub2.2 +
age1.2 + age2.2 + drmatch.2 + tcd.2 + dissub1.3 + dissub2.3 +
age1.3 + age2.3 + drmatch.3 + tcd.3 +
trans + shape(to),
data = msbmt, dist = "weibull")
flex_clock_reset
## preparing newdata
newd <- data.frame(dissub = rep(0, 3),
age = rep(0, 3),
drmatch = rep(0, 3),
tcd = rep(0, 3),
trans = as.factor(1:3))
newd$dissub <- factor(newd$dissub,
levels = 0:2,
labels = levels(ebmt3$dissub))
newd$age <- factor(newd$age,
levels = 0:2,
labels = levels(ebmt3$age))
newd$drmatch <- factor(newd$drmatch,
levels = 0:1,
labels = levels(ebmt3$drmatch))
newd$tcd <- factor(newd$tcd,
levels = 0:1,
labels = levels(ebmt3$tcd))
attr(newd, "trans") <- tmat
class(newd) <- c("msdata", "data.frame")
newd <- expand.covs(newd, covs[1:4], longnames = FALSE)
newd$to <- as.factor(c(2, 3, 3)) # same shape for trans 2 and 3
set.seed(202306)
invisible(rnorm(1e3))
sim_paths_messy <-
sim.fmsm(flex_clock_reset,
t = 3,
newdata = newd,
M = 10,
tvar = "trans",
trans = tmat,
tidy = FALSE)
nrow(sim_paths_messy$t)
set.seed(202306)
invisible(rnorm(1e3))
sim_paths_tidy <-
sim.fmsm(flex_clock_reset,
t = 3,
newdata = newd,
M = 10,
tvar = "trans",
trans = tmat,
tidy = TRUE)
nrow(sim_paths_tidy)
Thanks for this report - yes, one would expect there to be equivalent information in the messy and tidy results. For the moment I will at least fix the documentation to explain that the tidy results exclude the censoring times, but I will leave the issue open, because ideally I think they should be included somehow.
Though to implement that, I'm not sure whether end
should equal start
on these occasions, or end=NA
, or to have multiple rows for each censoring time, corresponding to censored times to each potential competing event, with a corresponding status
indicator that is 1 for an event time and 0 for a censoring time. The latter would be useful if you wanted to fit a multi-state model to the simulated data.
This in the meantime works for my specific case, but it just set the end = NA
and I didn't extensively debugged it.
Though I like your last proposal better and will work on it.
tidy_sim.fmsm <- function(simfs) {
trans <- attr(simfs, "trans")
start <- which(apply(trans, 1, function(x) any(!is.na(x))))
res <- NULL
id_vector <- seq_len(nrow(simfs$st))
## for each starting state
for (i in seq_along(start)) {
subject_i <- NULL
end_i <- which(!is.na(trans[start[i], ]))
## for each next state
for (j in seq_len(ncol(simfs$st) - 1)) {
## break down the following conditions
match_start <- simfs$st[, j] == start[i]
match_end <- simfs$st[, j + 1] %in% end_i
same_end <- simfs$st[, j + 1] == start[i]
matching_subjects <- match_start & match_end
censored_subjects <- match_start & same_end
if (any(matching_subjects)) {
subject_j <-
data.frame(id = id_vector[which(matching_subjects)],
end = simfs$st[matching_subjects, j + 1],
time = simfs$t[matching_subjects, j + 1],
delay = simfs$t[matching_subjects, j + 1] -
simfs$t[matching_subjects, j],
status = 1L)
subject_i <- rbind(subject_i, subject_j)
}# END - if: matching_subjects
if (any(censored_subjects)) {
cens_j <-
data.frame(id = id_vector[which(censored_subjects)],
end = simfs$st[censored_subjects, j + 1],
time = simfs$t[censored_subjects, j + 1],
delay = simfs$t[censored_subjects, j + 1] -
simfs$t[censored_subjects, j],
status = 0L)
subject_i <- rbind(subject_i, cens_j)
}# END - if: censored_subjects
}# END - for: j
if (nrow(subject_i) > 0) {
subject_i$start <- start[i]
res <- rbind(res, subject_i)
simfs$st <- simfs$st[!censored_subjects, ]
simfs$t <- simfs$t[!censored_subjects, ]
id_vector <- id_vector[!censored_subjects]
}# END - if
}# END - for: i
## remove subjects already censored one step before
res <- res[res$delay > 0, ]
res$start <- as.character(res$start)
res$end <- as.character(res$end)
which_censored <- which(res$start == res$end)
res$end[which_censored] <- NA_character_
res$trans <- paste(res$start, res$end, sep = " - ")
res <- res[, c("id", "start", "end", "trans",
"time", "delay", "status")]
attr(res, "trans") <- trans
attr(res, "statenames") <- colnames(trans)
res <- res[order(res$id, res$start), ]
rownames(res) <- seq_len(nrow(res))
res
}# END - tidy_sim.fmsm