Nothing
### R code from vignette source 'simstudy_survival.Rnw'
### Encoding: UTF-8
###################################################
### code chunk number 1: simstudy_survival.Rnw:13-43
###################################################
# Chunk 1
library(CALIBERrfimpute)
library(missForest)
library(survival)
library(xtable)
library(rpart)
library(mice)
library(ranger)
kPmiss <- 0.2 # probability of missingness
kLogHR <- 0.5 # true log hazard ratio
# To analyse samples of more than 200 patients, (recommend about 2000,
# but this will slow down the program), set NPATS before running
# this vignette.
if (!exists('NPATS')){
kSampleSize <- 200 # number of patients in simulated datasets
} else {
kSampleSize <- NPATS
}
# e.g.
# NPATS <- 2000
# To analyse more than 3 samples, set N to a number greater than 3
# e.g.
# N <- 1000
# To use more than 4 imputations, set NIMPS to a number greater than 4
# e.g.
# NIMPS <- 10
###################################################
### code chunk number 2: simstudy_survival.Rnw:86-280
###################################################
# Chunk 2
#### DATA GENERATING FUNCTIONS ####
makeSurv <- function(n = 2000, loghr = kLogHR){
# Creates a survival cohort of n patients. Assumes that censoring is
# independent of all other variables
# x1 and x2 are random normal variables
data <- data.frame(x1 = rnorm(n), x2 = rnorm(n))
# Create the x3 variable
data$x3 <- 0.5 * (data$x1 + data$x2 - data$x1 * data$x2) + rnorm(n)
# Underlying log hazard ratio for all variables is the same
data$y <- with(data, loghr * (x1 + x2 + x3))
data$survtime <- rexp(n, exp(data$y))
# Censoring - assume uniform distribution of observation times
# up to a maximum
obstime <- runif(nrow(data), min = 0,
max = quantile(data$survtime, 0.5))
data$event <- as.integer(data$survtime <= obstime)
# Generate integer survival times
data$time <- ceiling(100 * pmin(data$survtime, obstime))
# Observed marginal cumulative hazard for imputation models
data$cumhaz <- nelsonaalen(data, time, event)
# True log hazard and survival time are not seen in the data
# so remove them
data$y <- NULL
data$survtime <- NULL
return(data)
}
makeMarSurv <- function(data, pmissing = kPmiss){
# Introduces missing data dependent on event indicator
# and cumulative hazard and x1 and x2
logistic <- function(x){
exp(x) / (1 + exp(x))
}
predictions <- function(lp, n){
# uses the vector of linear predictions (lp) from a logistic model
# and the expected number of positive responses (n) to generate
# a set of predictions by modifying the baseline
trialn <- function(lptrial){
sum(logistic(lptrial))
}
stepsize <- 32
lptrial <- lp
# To avoid errors due to missing linear predictors (ideally
# there should not be any missing), replace with the mean
if (any(is.na(lptrial))){
lp[is.na(lptrial)] <- mean(lptrial, na.rm = TRUE)
}
while(abs(trialn(lptrial) - n) > 1){
if (trialn(lptrial) > n){
# trialn bigger than required
lptrial <- lptrial - stepsize
} else {
lptrial <- lptrial + stepsize
}
stepsize <- stepsize / 2
}
# Generate predictions from binomial distribution
as.logical(rbinom(logical(length(lp)), 1, logistic(lptrial)))
}
data$x3[predictions(0.1 * data$x1 + 0.1 * data$x2 +
0.1 * data$cumhaz + 0.1 * data$event, nrow(data) * pmissing)] <- NA
return(data)
}
#### IMPUTATION FUNCTIONS FROM DOOVE AND VAN BUUREN ####
mice.impute.rfdoove10 <- function(y, ry, x, ...){
mice::mice.impute.rf(y = y, ry = ry, x = x, ntrees = 10)
}
mice.impute.rfdoove100 <- function(y, ry, x, ...){
mice::mice.impute.rf(y = y, ry = ry, x = x, ntrees = 100)
}
#### OUR MICE RANDOM FOREST FUNCTIONS ####
mice.impute.rfcont5 <- function(y, ry, x, ...){
CALIBERrfimpute::mice.impute.rfcont(
y = y, ry = ry, x = x, ntree_cont = 5)
}
mice.impute.rfcont10 <- function(y, ry, x, ...){
CALIBERrfimpute::mice.impute.rfcont(
y = y, ry = ry, x = x, ntree_cont = 10)
}
mice.impute.rfcont20 <- function(y, ry, x, ...){
CALIBERrfimpute::mice.impute.rfcont(
y = y, ry = ry, x = x, ntree_cont = 20)
}
mice.impute.rfcont50 <- function(y, ry, x, ...){
CALIBERrfimpute::mice.impute.rfcont(
y = y, ry = ry, x = x, ntree_cont = 50)
}
mice.impute.rfcont100 <- function(y, ry, x, ...){
CALIBERrfimpute::mice.impute.rfcont(
y = y, ry = ry, x = x, ntree_cont = 100)
}
#### FUNCTIONS TO DO THE ANALYSIS ####
coxfull <- function(data){
# Full data analysis
coefs <- as.data.frame(summary(coxph(myformula, data = data))$coef)
# return a data.frame of coefficients (est), upper and lower 95% limits
out <- data.frame(est = coefs[, 'coef'],
lo95 = coefs[, 'coef'] + qnorm(0.025) * coefs[, 'se(coef)'],
hi95 = coefs[, 'coef'] + qnorm(0.975) * coefs[, 'se(coef)'],
row.names = row.names(coefs))
out$cover <- kLogHR >= out$lo95 & kLogHR <= out$hi95
out
}
coximpute <- function(imputed_datasets){
# Analyses a list of imputed datasets
docoxmodel <- function(data){
coxph(myformula, data = data)
}
mirafits <- as.mira(lapply(imputed_datasets, docoxmodel))
coefs <- as.data.frame(summary(pool(mirafits)))
if ('term' %in% colnames(coefs)){
row.names(coefs) <- as.character(coefs$term)
}
if (!('lo 95' %in% colnames(coefs))){
# newer version of mice
# use normal approximation for now, as assume large sample
# and large degrees of freedom for t distribution
out <- data.frame(est = coefs$estimate,
lo95 = coefs$estimate + qnorm(0.025) * coefs$std.error,
hi95 = coefs$estimate + qnorm(0.975) * coefs$std.error,
row.names = row.names(coefs))
} else if ('lo 95' %in% colnames(coefs)){
# older version of mice
out <- data.frame(est = coefs$est,
lo95 = coefs[, 'lo 95'], hi95 = coefs[, 'hi 95'],
row.names = row.names(coefs))
} else {
stop('Unable to handle format of summary.mipo object')
}
# Whether this confidence interval contains the true hazard ratio
out$cover <- kLogHR >= out$lo95 & kLogHR <= out$hi95
out
}
domissf <- function(missdata, reps = NIMPS){
# Imputation by missForest
out <- list()
for (i in 1:reps){
invisible(capture.output(
out[[i]] <- missForest(missdata)$ximp))
}
out
}
domice <- function(missdata, functions, reps = NIMPS){
mids <- mice(missdata, defaultMethod = functions,
m = reps, visitSequence = 'monotone',
printFlag = FALSE, maxit = 10)
lapply(1:reps, function(x) complete(mids, x))
}
doanalysis <- function(x){
# Creates a dataset, analyses it using different methods, and outputs
# the result as a matrix of coefficients / SE and coverage
data <- makeSurv(kSampleSize)
missdata <- makeMarSurv(data)
out <- list()
out$full <- coxfull(data)
out$missf <- coximpute(domissf(missdata))
out$rf5 <- coximpute(domice(missdata, 'rfcont5'))
out$rf10 <- coximpute(domice(missdata, 'rfcont10'))
out$rf20 <- coximpute(domice(missdata, 'rfcont20'))
out$rf100 <- coximpute(domice(missdata, 'rfcont100'))
out$rfdoove10 <- coximpute(domice(missdata, 'rfdoove10'))
out$rfdoove100 <- coximpute(domice(missdata, 'rfdoove100'))
out$cart <- coximpute(domice(missdata, 'cart'))
out$mice <- coximpute(domice(missdata, 'norm'))
out
}
###################################################
### code chunk number 3: simstudy_survival.Rnw:284-290
###################################################
# Chunk 3
mydata <- makeSurv(200)
plot(mydata[, c('x1', 'x2', 'x3')],
main = "Associations between predictor variables in a sample dataset")
mydata <- makeSurv(20000)
###################################################
### code chunk number 4: simstudy_survival.Rnw:295-298
###################################################
# Chunk 4
summary(lm(x3 ~ x1*x2, data = mydata))
###################################################
### code chunk number 5: simstudy_survival.Rnw:301-314
###################################################
# Chunk 5
mydata <- makeSurv(2000)
mydata2 <- makeMarSurv(mydata)
# Plot non-missing data
plot(mydata$x1[!is.na(mydata2$x3)], mydata$x3[!is.na(mydata2$x3)],
pch = 19, xlab = 'x1', ylab = 'x3')
# Plot missing data
points(mydata$x1[is.na(mydata2$x3)], mydata$x3[is.na(mydata2$x3)],
col = 'red', pch = 19)
legend('bottomright', legend = c('x3 observed', 'x3 missing'),
col = c('black', 'red'), pch = 19)
title('Association of predictor variables x1 and x3')
###################################################
### code chunk number 6: simstudy_survival.Rnw:319-345
###################################################
# Chunk 6
# Cox proportional hazards analysis
myformula <- as.formula(Surv(time, event) ~ x1 + x2 + x3)
# Analysis with 10,000 simulated patients (or more
# if the variable REFERENCE_SAMPLESIZE exists)
if (!exists('REFERENCE_SAMPLESIZE')){
REFERENCE_SAMPLESIZE <- 10000
}
# Use parallel processing, if available, to create
# datasets more quickly.
if ('parallel' %in% loadedNamespaces() &&
!is.null(getOption('mc.cores')) &&
.Platform$OS.type == 'unix'){
REFERENCE_SAMPLESIZE <- REFERENCE_SAMPLESIZE %/%
getOption('mc.cores')
simdata <- parallel::mclapply(1:getOption('mc.cores'),
function(x) makeSurv(REFERENCE_SAMPLESIZE))
simdata <- do.call('rbind', simdata)
} else {
simdata <- makeSurv(REFERENCE_SAMPLESIZE)
}
summary(coxph(myformula, data = simdata))
###################################################
### code chunk number 7: simstudy_survival.Rnw:367-387
###################################################
# Chunk 7
# Setting analysis parameters: To analyse more than 3 samples,
# set N to the desired number before running this program
if (!exists('N')){
N <- 3
}
# Number of imputations (set to at least 10 when
# running an actual simulation)
if (!exists('NIMPS')){
NIMPS <- 4
}
# Use parallel processing if the 'parallel' package is loaded
if ('parallel' %in% loadedNamespaces() &&
.Platform$OS.type == 'unix'){
cat('Using parallel processing\n')
results <- parallel::mclapply(1:N, doanalysis)
} else {
results <- lapply(1:N, doanalysis)
}
###################################################
### code chunk number 8: simstudy_survival.Rnw:416-455
###################################################
# Chunk 8
getParams <- function(coef, method){
estimates <- sapply(results, function(x){
x[[method]][coef, 'est']
})
bias <- mean(estimates) - kLogHR
se_bias <- sd(estimates) / sqrt(length(estimates))
mse <- mean((estimates - kLogHR) ^ 2)
ci_len <- mean(sapply(results, function(x){
x[[method]][coef, 'hi95'] - x[[method]][coef, 'lo95']
}))
ci_cov <- mean(sapply(results, function(x){
x[[method]][coef, 'cover']
}))
out <- c(bias, se_bias, mse, sd(estimates), ci_len, ci_cov)
names(out) <- c('bias', 'se_bias', 'mse', 'sd', 'ci_len', 'ci_cov')
out
}
showTable <- function(coef){
methods <- c('full', 'missf', 'cart', 'rfdoove10',
'rfdoove100', 'rf5', 'rf10', 'rf20', 'rf100', 'mice')
methodnames <- c('Full data', 'missForest', 'CART MICE',
'RF Doove MICE 10', 'RF Doove MICE 100',
paste('RFcont MICE', c(5, 10, 20, 100)),
'Parametric MICE')
out <- t(sapply(methods, function(x){
getParams(coef, x)
}))
out <- formatC(out, digits = 3, format = 'fg')
out <- rbind(c('', 'Standard', 'Mean', 'SD of', 'Mean 95%',
'95% CI'), c('Bias', 'error of bias', 'square error', 'estimate',
'CI length', 'coverage'), out)
out <- cbind(c('', '', methodnames), out)
rownames(out) <- NULL
print(xtable(out), floating = FALSE, include.rownames = FALSE,
include.colnames = FALSE, hline.after = c(0, 2, nrow(out)))
}
###################################################
### code chunk number 9: simstudy_survival.Rnw:468-471
###################################################
# Chunk 9
showTable('x1')
###################################################
### code chunk number 10: simstudy_survival.Rnw:480-483
###################################################
# Chunk 10
showTable('x2')
###################################################
### code chunk number 11: simstudy_survival.Rnw:493-496
###################################################
# Chunk 11
showTable('x3')
###################################################
### code chunk number 12: simstudy_survival.Rnw:506-530
###################################################
# Chunk 12
numtrees <- c(5, 10, 20, 100)
bias <- sapply(numtrees, function(x){
getParams('x3', paste('rf', x, sep=''))['bias']
})
se_bias <- sapply(numtrees, function(x){
getParams('x3', paste('rf', x, sep=''))['se_bias']
})
lower_bias <- bias - 1.96*se_bias
upper_bias <- bias + 1.96*se_bias
# Blank plot
plot(-100, 0, type = 'p', pch = 15, cex = 1.3, ylab = 'Bias',
xlab = 'Number of trees', xlim = c(0,100),
ylim = c(min(lower_bias), max(upper_bias)))
# Zero bias line
lines(c(0,100), c(0,0), lty = 2, col = 'gray')
# Confidence interval lines
for (i in 1:5){lines(rep(numtrees[i], 2),
c(lower_bias[i], upper_bias[i]))}
# Points
points(numtrees, bias, pch = 15, cex = 1.3)
title('Bias in estimate of x3 coefficient after\nmultiple imputation using RFcont MICE')
###################################################
### code chunk number 13: simstudy_survival.Rnw:535-690
###################################################
# Chunk 13
# Comparing confidence interval coverage and bias between:
# RF MICE 100 trees
# RF MICE 10 trees
# Parametric MICE
# Names of the variables in the comparison
variables <- c('x1', 'x2', 'x3')
pstar <- function(x){
if (!is.na(x)){
if (x < 0.001){
'***'
} else if (x < 0.01){
'**'
} else if (x < 0.05){
'*'
} else {
''
}
} else {
''
}
}
compareBias <- function(method1, method2){
# Generates a table comparing bias
# Comparison statistic is the difference in absolute bias
# (negative means first method is better)
compareBiasVar <- function(varname){
# All coefficients should be kLogHR
bias1 <- sapply(results, function(x){
x[[method1]][varname, 'est']
}) - kLogHR
bias2 <- sapply(results, function(x){
x[[method2]][varname, 'est']
}) - kLogHR
if (sign(mean(bias1)) == -1){
bias1 <- -bias1
}
if (sign(mean(bias2)) == -1){
bias2 <- -bias2
}
paste(formatC(mean(bias1) - mean(bias2), format = 'fg', digits = 3),
pstar(t.test(bias1 - bias2)$p.value))
}
sapply(variables, compareBiasVar)
}
compareVariance <- function(method1, method2){
# Generates a table comparing precision between two methods
# Comparison statistic is ratio of variance
# (smaller means first method is better)
compareVarianceVar <- function(varname){
e1 <- sapply(results, function(x){
x[[method1]][varname, 'est']
})
e2 <- sapply(results, function(x){
x[[method2]][varname, 'est']
})
paste(formatC(var(e1) / var(e2), format = 'fg', digits = 3),
pstar(var.test(e1, e2)$p.value))
}
sapply(variables, compareVarianceVar)
}
compareCIlength <- function(method1, method2){
# Generates a table comparing coverage percentage between two methods
# Comparison statistic is the ratio of confidence interval lengths
# (less than 1 = first better)
compareCIlengthVar <- function(varname){
# Paired t test for bias (difference in estimate)
len1 <- sapply(results, function(x){
x[[method1]][varname, 'hi95'] -
x[[method1]][varname, 'lo95']
})
len2 <- sapply(results, function(x){
x[[method2]][varname, 'hi95'] -
x[[method2]][varname, 'lo95']
})
paste(formatC(mean(len1) / mean(len2),
format = 'fg', digits = 4),
pstar(t.test(len1 - len2)$p.value))
}
sapply(variables, compareCIlengthVar)
}
compareCoverage <- function(method1, method2){
# Generates a table comparing coverage percentage between two methods
# Comparison statistic is the difference in coverage
# (positive = first better)
compareCoverageVar <- function(varname){
# Paired t test for bias (difference in estimate)
cov1 <- sapply(results, function(x){
x[[method1]][varname, 'cover']
})
cov2 <- sapply(results, function(x){
x[[method2]][varname, 'cover']
})
paste(formatC(100 * (mean(cov1) - mean(cov2)), format = 'f',
digits = 1),
pstar(binom.test(c(sum(cov1 == TRUE & cov2 == FALSE),
sum(cov1 == FALSE & cov2 == TRUE)))$p.value))
}
sapply(variables, compareCoverageVar)
}
maketable <- function(comparison){
# comparison is a function such as compareCoverage, compareBias
compare <- cbind(comparison('rf10', 'mice'),
comparison('rf100', 'mice'),
comparison('rf10', 'rf100'))
compare <- cbind(rownames(compare), compare)
compare <- rbind(
c('', 'RFcont MICE 10 vs', 'RFcont MICE 100 vs',
'RFcont MICE 10 vs'),
c('Coefficient', 'parametric MICE',
'parametric MICE', 'RFcont MICE 100'),
compare)
rownames(compare) <- NULL
print(xtable(compare), include.rownames = FALSE,
include.colnames = FALSE, floating = FALSE,
hline.after = c(0, 2, nrow(compare)))
cat('\n\\vspace{1em}\n')
compare <- cbind(comparison('rfdoove10', 'rf10'),
comparison('rfdoove10', 'cart'),
comparison('rfdoove10', 'rfdoove100'))
compare <- cbind(rownames(compare), compare)
compare <- rbind(
c('', 'RF Doove MICE 10 vs', 'RF Doove MICE 10 vs',
'RF Doove MICE 10 vs'),
c('Coefficient', 'RFcont MICE 10',
'CART MICE', 'RF Doove MICE 100'),
compare)
rownames(compare) <- NULL
print(xtable(compare), include.rownames = FALSE,
include.colnames = FALSE, floating = FALSE,
hline.after = c(0, 2, nrow(compare)))
}
###################################################
### code chunk number 14: simstudy_survival.Rnw:699-702
###################################################
# Chunk 14
maketable(compareBias)
###################################################
### code chunk number 15: simstudy_survival.Rnw:711-714
###################################################
# Chunk 15
maketable(compareVariance)
###################################################
### code chunk number 16: simstudy_survival.Rnw:724-727
###################################################
# Chunk 16
maketable(compareCIlength)
###################################################
### code chunk number 17: simstudy_survival.Rnw:736-739
###################################################
# Chunk 17
maketable(compareCoverage)
###################################################
### code chunk number 18: simstudy_survival.Rnw:773-784
###################################################
# Chunk 18
showfunction <- function(functionname){
cat(paste(functionname, '<-',
paste(capture.output(print(get(functionname))),
collapse = '\n')))
cat('\n')
invisible(NULL)
}
showfunction('makeSurv')
showfunction('makeMarSurv')
###################################################
### code chunk number 19: simstudy_survival.Rnw:789-803
###################################################
# Chunk 19
showfunction('coxfull')
showfunction('coximpute')
showfunction('domissf')
showfunction('mice.impute.cart')
showfunction('mice.impute.rfdoove10')
showfunction('mice.impute.rfdoove100')
showfunction('mice.impute.rfcont5')
showfunction('mice.impute.rfcont10')
showfunction('mice.impute.rfcont20')
showfunction('mice.impute.rfcont100')
showfunction('domice')
showfunction('doanalysis')
###################################################
### code chunk number 20: simstudy_survival.Rnw:808-815
###################################################
# Chunk 20
showfunction('pstar')
showfunction('compareBias')
showfunction('compareVariance')
showfunction('compareCIlength')
showfunction('compareCoverage')
###################################################
### code chunk number 21: simstudy_survival.Rnw:820-825
###################################################
# Chunk 21
showfunction('getParams')
showfunction('showTable')
showfunction('maketable')
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.