simstudy_survival.R
In CALIBERrfimpute: Multiple Imputation Using MICE and Random Forest

### R code from vignette source 'simstudy_survival.Rnw'
### Encoding: UTF-8

###################################################
### code chunk number 1: simstudy_survival.Rnw:13-43
###################################################
# Chunk 1
library(CALIBERrfimpute)
library(missForest)
library(survival)
library(xtable)
library(rpart)
library(mice)
library(ranger)

kPmiss <- 0.2 # probability of missingness
kLogHR <- 0.5 # true log hazard ratio

# To analyse samples of more than 200 patients, (recommend about 2000,
# but this will slow down the program), set NPATS before running
# this vignette.
if (!exists('NPATS')){
	kSampleSize <- 200 # number of patients in simulated datasets
} else {
	kSampleSize <- NPATS
}
# e.g.
# NPATS <- 2000

# To analyse more than 3 samples, set N to a number greater than 3
# e.g.
# N <- 1000

# To use more than 4 imputations, set NIMPS to a number greater than 4
# e.g.
# NIMPS <- 10


###################################################
### code chunk number 2: simstudy_survival.Rnw:86-280
###################################################
# Chunk 2

#### DATA GENERATING FUNCTIONS ####

makeSurv <- function(n = 2000, loghr = kLogHR){
	# Creates a survival cohort of n patients. Assumes that censoring is
	# independent of all other variables
	
	# x1 and x2 are random normal variables
	data <- data.frame(x1 = rnorm(n), x2 = rnorm(n))
	
	# Create the x3 variable
	data$x3 <- 0.5 * (data$x1 + data$x2 - data$x1 * data$x2) + rnorm(n)
	
	# Underlying log hazard ratio for all variables is the same
	data$y <- with(data, loghr * (x1 + x2 + x3))
	data$survtime <- rexp(n, exp(data$y))

	# Censoring - assume uniform distribution of observation times
	# up to a maximum
	obstime <- runif(nrow(data), min = 0,
		max = quantile(data$survtime, 0.5))
	data$event <- as.integer(data$survtime <= obstime)
	# Generate integer survival times
	data$time <- ceiling(100 * pmin(data$survtime, obstime))
	
	# Observed marginal cumulative hazard for imputation models
	data$cumhaz <- nelsonaalen(data, time, event) 
	
	# True log hazard and survival time are not seen in the data
	# so remove them
	data$y <- NULL
	data$survtime <- NULL
	
	return(data)
}

makeMarSurv <- function(data, pmissing = kPmiss){
	# Introduces missing data dependent on event indicator
	# and cumulative hazard and x1 and x2
	
	logistic <- function(x){
		exp(x) / (1 + exp(x))
	}

	predictions <- function(lp, n){
		# uses the vector of linear predictions (lp) from a logistic model
		# and the expected number of positive responses (n) to generate
		# a set of predictions by modifying the baseline
	
		trialn <- function(lptrial){
			sum(logistic(lptrial))
		}
		stepsize <- 32
		lptrial <- lp
		# To avoid errors due to missing linear predictors (ideally
		# there should not be any missing), replace with the mean
		if (any(is.na(lptrial))){
			lp[is.na(lptrial)] <- mean(lptrial, na.rm = TRUE)
		}
		while(abs(trialn(lptrial) - n) > 1){
			if (trialn(lptrial) > n){
				# trialn bigger than required 
				lptrial <- lptrial - stepsize
			} else {
				lptrial <- lptrial + stepsize
			}
			stepsize <- stepsize / 2
		}
		# Generate predictions from binomial distribution
		as.logical(rbinom(logical(length(lp)), 1, logistic(lptrial)))
	}
	data$x3[predictions(0.1 * data$x1 + 0.1 * data$x2 +
		0.1 * data$cumhaz + 0.1 * data$event, nrow(data) * pmissing)] <- NA
	return(data)
}

#### IMPUTATION FUNCTIONS FROM DOOVE AND VAN BUUREN ####

mice.impute.rfdoove10 <- function(y, ry, x, ...){
	mice::mice.impute.rf(y = y, ry = ry, x = x, ntrees = 10)
}

mice.impute.rfdoove100 <- function(y, ry, x, ...){
	mice::mice.impute.rf(y = y, ry = ry, x = x, ntrees = 100)
}

#### OUR MICE RANDOM FOREST FUNCTIONS ####

mice.impute.rfcont5 <- function(y, ry, x, ...){
	CALIBERrfimpute::mice.impute.rfcont(
		y = y, ry = ry, x = x, ntree_cont = 5)
}

mice.impute.rfcont10 <- function(y, ry, x, ...){
	CALIBERrfimpute::mice.impute.rfcont(
		y = y, ry = ry, x = x, ntree_cont = 10)
}

mice.impute.rfcont20 <- function(y, ry, x, ...){
	CALIBERrfimpute::mice.impute.rfcont(
		y = y, ry = ry, x = x, ntree_cont = 20)
}

mice.impute.rfcont50 <- function(y, ry, x, ...){
	CALIBERrfimpute::mice.impute.rfcont(
		y = y, ry = ry, x = x, ntree_cont = 50)
}

mice.impute.rfcont100 <- function(y, ry, x, ...){
	CALIBERrfimpute::mice.impute.rfcont(
		y = y, ry = ry, x = x, ntree_cont = 100)
}

#### FUNCTIONS TO DO THE ANALYSIS ####

coxfull <- function(data){
	# Full data analysis
	coefs <- as.data.frame(summary(coxph(myformula, data = data))$coef)
	# return a data.frame of coefficients (est), upper and lower 95% limits
	out <- data.frame(est = coefs[, 'coef'], 
		lo95 = coefs[, 'coef'] + qnorm(0.025) * coefs[, 'se(coef)'],
		hi95 = coefs[, 'coef'] + qnorm(0.975) * coefs[, 'se(coef)'],
		row.names = row.names(coefs))
	out$cover <- kLogHR >= out$lo95 & kLogHR <= out$hi95
	out
}

coximpute <- function(imputed_datasets){
	# Analyses a list of imputed datasets
	docoxmodel <- function(data){
		coxph(myformula, data = data)
	}
	mirafits <- as.mira(lapply(imputed_datasets, docoxmodel))
	coefs <- as.data.frame(summary(pool(mirafits)))
	if ('term' %in% colnames(coefs)){
		row.names(coefs) <- as.character(coefs$term)
	}
	if (!('lo 95' %in% colnames(coefs))){
		# newer version of mice
		# use normal approximation for now, as assume large sample
		# and large degrees of freedom for t distribution
		out <- data.frame(est = coefs$estimate,
			lo95 = coefs$estimate + qnorm(0.025) * coefs$std.error,
			hi95 = coefs$estimate + qnorm(0.975) * coefs$std.error,
			row.names = row.names(coefs))
	} else if ('lo 95' %in% colnames(coefs)){
		# older version of mice
		out <- data.frame(est = coefs$est,
			lo95 = coefs[, 'lo 95'], hi95 = coefs[, 'hi 95'],
			row.names = row.names(coefs))
	} else {
		stop('Unable to handle format of summary.mipo object')
	}
	# Whether this confidence interval contains the true hazard ratio
	out$cover <- kLogHR >= out$lo95 & kLogHR <= out$hi95
	out
}

domissf <- function(missdata, reps = NIMPS){
	# Imputation by missForest
	out <- list()
	for (i in 1:reps){
		invisible(capture.output(
			out[[i]] <- missForest(missdata)$ximp))
	}
	out
}

domice <- function(missdata, functions, reps = NIMPS){
	mids <- mice(missdata, defaultMethod = functions,
		m = reps, visitSequence = 'monotone',
		printFlag = FALSE, maxit = 10)
	lapply(1:reps, function(x) complete(mids, x))
}

doanalysis <- function(x){
	# Creates a dataset, analyses it using different methods, and outputs
	# the result as a matrix of coefficients / SE and coverage 
	data <- makeSurv(kSampleSize)
	missdata <- makeMarSurv(data)
	out <- list()
	out$full <- coxfull(data)
	out$missf <- coximpute(domissf(missdata))
	out$rf5 <- coximpute(domice(missdata, 'rfcont5'))
	out$rf10 <- coximpute(domice(missdata, 'rfcont10'))
	out$rf20 <- coximpute(domice(missdata, 'rfcont20'))
	out$rf100 <- coximpute(domice(missdata, 'rfcont100'))
	out$rfdoove10 <- coximpute(domice(missdata, 'rfdoove10'))
	out$rfdoove100 <- coximpute(domice(missdata, 'rfdoove100'))
	out$cart <- coximpute(domice(missdata, 'cart'))
	out$mice <- coximpute(domice(missdata, 'norm'))
	out
}


###################################################
### code chunk number 3: simstudy_survival.Rnw:284-290
###################################################
# Chunk 3

mydata <- makeSurv(200)
plot(mydata[, c('x1', 'x2', 'x3')],
	main = "Associations between predictor variables in a sample dataset")
mydata <- makeSurv(20000)


###################################################
### code chunk number 4: simstudy_survival.Rnw:295-298
###################################################
# Chunk 4

summary(lm(x3 ~ x1*x2, data = mydata))


###################################################
### code chunk number 5: simstudy_survival.Rnw:301-314
###################################################
# Chunk 5

mydata <- makeSurv(2000)
mydata2 <- makeMarSurv(mydata)
# Plot non-missing data
plot(mydata$x1[!is.na(mydata2$x3)], mydata$x3[!is.na(mydata2$x3)],
	pch = 19, xlab = 'x1', ylab = 'x3')
# Plot missing data
points(mydata$x1[is.na(mydata2$x3)], mydata$x3[is.na(mydata2$x3)],
	col = 'red', pch = 19)
legend('bottomright', legend = c('x3 observed', 'x3 missing'),
	col = c('black', 'red'), pch = 19)
title('Association of predictor variables x1 and x3')


###################################################
### code chunk number 6: simstudy_survival.Rnw:319-345
###################################################
# Chunk 6

# Cox proportional hazards analysis
myformula <- as.formula(Surv(time, event) ~ x1 + x2 + x3)

# Analysis with 10,000 simulated patients (or more
# if the variable REFERENCE_SAMPLESIZE exists)
if (!exists('REFERENCE_SAMPLESIZE')){
	REFERENCE_SAMPLESIZE <- 10000
}

# Use parallel processing, if available, to create
# datasets more quickly.
if ('parallel' %in% loadedNamespaces() &&
	!is.null(getOption('mc.cores')) &&
	.Platform$OS.type == 'unix'){
	REFERENCE_SAMPLESIZE <- REFERENCE_SAMPLESIZE %/%
		getOption('mc.cores')
	simdata <- parallel::mclapply(1:getOption('mc.cores'),
		function(x) makeSurv(REFERENCE_SAMPLESIZE))
	simdata <- do.call('rbind', simdata)
} else {
	simdata <- makeSurv(REFERENCE_SAMPLESIZE)
}

summary(coxph(myformula, data = simdata))


###################################################
### code chunk number 7: simstudy_survival.Rnw:367-387
###################################################
# Chunk 7

# Setting analysis parameters: To analyse more than 3 samples,
# set N to the desired number before running this program
if (!exists('N')){
	N <- 3
}
# Number of imputations (set to at least 10 when
# running an actual simulation)
if (!exists('NIMPS')){
	NIMPS <- 4
}
# Use parallel processing if the 'parallel' package is loaded
if ('parallel' %in% loadedNamespaces() &&
	.Platform$OS.type == 'unix'){
	cat('Using parallel processing\n')
	results <- parallel::mclapply(1:N, doanalysis)
} else {
	results <- lapply(1:N, doanalysis)
}


###################################################
### code chunk number 8: simstudy_survival.Rnw:416-455
###################################################
# Chunk 8

getParams <- function(coef, method){
	estimates <- sapply(results, function(x){
		x[[method]][coef, 'est']
	})
	bias <- mean(estimates) - kLogHR
	se_bias <- sd(estimates) / sqrt(length(estimates))
	mse <- mean((estimates - kLogHR) ^ 2)
	ci_len <- mean(sapply(results, function(x){
		x[[method]][coef, 'hi95'] - x[[method]][coef, 'lo95']
	}))
	ci_cov <- mean(sapply(results, function(x){
		x[[method]][coef, 'cover']
	}))
	out <- c(bias, se_bias, mse, sd(estimates), ci_len, ci_cov)
	names(out) <- c('bias', 'se_bias', 'mse', 'sd', 'ci_len', 'ci_cov')
	out
}

showTable <- function(coef){
	methods <- c('full', 'missf', 'cart', 'rfdoove10',
		'rfdoove100', 'rf5', 'rf10', 'rf20', 'rf100', 'mice')
	methodnames <- c('Full data', 'missForest', 'CART MICE',
		'RF Doove MICE 10', 'RF Doove MICE 100',
		paste('RFcont MICE', c(5, 10, 20, 100)),
		'Parametric MICE')
	out <- t(sapply(methods, function(x){
		getParams(coef, x)
	}))
	out <- formatC(out, digits = 3, format = 'fg')
	out <- rbind(c('', 'Standard', 'Mean', 'SD of', 'Mean 95%',
		'95% CI'), c('Bias', 'error of bias', 'square error', 'estimate',
		'CI length', 'coverage'), out)
	out <- cbind(c('', '', methodnames), out)
	rownames(out) <- NULL
	print(xtable(out), floating = FALSE, include.rownames = FALSE,
		include.colnames = FALSE, hline.after = c(0, 2, nrow(out)))
}


###################################################
### code chunk number 9: simstudy_survival.Rnw:468-471
###################################################
# Chunk 9

showTable('x1')


###################################################
### code chunk number 10: simstudy_survival.Rnw:480-483
###################################################
# Chunk 10

showTable('x2')


###################################################
### code chunk number 11: simstudy_survival.Rnw:493-496
###################################################
# Chunk 11

showTable('x3')


###################################################
### code chunk number 12: simstudy_survival.Rnw:506-530
###################################################
# Chunk 12

numtrees <- c(5, 10, 20, 100)
bias <- sapply(numtrees, function(x){
	getParams('x3', paste('rf', x, sep=''))['bias']
})
se_bias <- sapply(numtrees, function(x){
	getParams('x3', paste('rf', x, sep=''))['se_bias']
})
lower_bias <- bias - 1.96*se_bias
upper_bias <- bias + 1.96*se_bias

# Blank plot
plot(-100, 0, type = 'p', pch = 15, cex = 1.3, ylab = 'Bias', 
	xlab = 'Number of trees', xlim = c(0,100),
	ylim = c(min(lower_bias), max(upper_bias)))
# Zero bias line
lines(c(0,100), c(0,0), lty = 2, col = 'gray')
# Confidence interval lines
for (i in 1:5){lines(rep(numtrees[i], 2),
	c(lower_bias[i], upper_bias[i]))}
# Points
points(numtrees, bias, pch = 15, cex = 1.3)
title('Bias in estimate of x3 coefficient after\nmultiple imputation using RFcont MICE')


###################################################
### code chunk number 13: simstudy_survival.Rnw:535-690
###################################################
# Chunk 13

# Comparing confidence interval coverage and bias between:
#    RF MICE 100 trees
#    RF MICE 10 trees
#    Parametric MICE

# Names of the variables in the comparison
variables <- c('x1', 'x2', 'x3')

pstar <- function(x){
	if (!is.na(x)){
		if (x < 0.001){
			'***'
		} else if (x < 0.01){
			'**'
		} else if (x < 0.05){
			'*'
		} else {
			''
		}
	} else {
		''
	}
}

compareBias <- function(method1, method2){
	# Generates a table comparing bias
	# Comparison statistic is the difference in absolute bias
	# (negative means first method is better)
	
	compareBiasVar <- function(varname){
		# All coefficients should be kLogHR
		bias1 <- sapply(results, function(x){
			x[[method1]][varname, 'est']
		}) - kLogHR
		bias2 <- sapply(results, function(x){
			x[[method2]][varname, 'est']
		}) - kLogHR

		if (sign(mean(bias1)) == -1){
			bias1 <- -bias1
		}
		if (sign(mean(bias2)) == -1){
			bias2 <- -bias2
		}
		
		paste(formatC(mean(bias1) - mean(bias2), format = 'fg', digits = 3),
			pstar(t.test(bias1 - bias2)$p.value))
	}
	
	sapply(variables, compareBiasVar)
}

compareVariance <- function(method1, method2){
	# Generates a table comparing precision between two methods
	# Comparison statistic is ratio of variance
	# (smaller means first method is better)
	
	compareVarianceVar <- function(varname){
		e1 <- sapply(results, function(x){
			x[[method1]][varname, 'est']
		})
		e2 <- sapply(results, function(x){
			x[[method2]][varname, 'est']
		})
		paste(formatC(var(e1) / var(e2), format = 'fg', digits = 3),
			pstar(var.test(e1, e2)$p.value))
	}
	
	sapply(variables, compareVarianceVar)
}

compareCIlength <- function(method1, method2){
	# Generates a table comparing coverage percentage between two methods
	# Comparison statistic is the ratio of confidence interval lengths
	# (less than 1 = first better)
	
	compareCIlengthVar <- function(varname){
		# Paired t test for bias (difference in estimate)
		len1 <- sapply(results, function(x){
			x[[method1]][varname, 'hi95'] -
				x[[method1]][varname, 'lo95']
		})
		len2 <- sapply(results, function(x){
			x[[method2]][varname, 'hi95'] -
				x[[method2]][varname, 'lo95']
		})
		
		paste(formatC(mean(len1) / mean(len2),
			format = 'fg', digits = 4),
			pstar(t.test(len1 - len2)$p.value))
	}
	
	sapply(variables, compareCIlengthVar)	
}

compareCoverage <- function(method1, method2){
	# Generates a table comparing coverage percentage between two methods
	# Comparison statistic is the difference in coverage
	# (positive = first better)

	compareCoverageVar <- function(varname){
		# Paired t test for bias (difference in estimate)
		
		cov1 <- sapply(results, function(x){
			x[[method1]][varname, 'cover']
		})
		cov2 <- sapply(results, function(x){
			x[[method2]][varname, 'cover']
		})
				
		paste(formatC(100 * (mean(cov1) - mean(cov2)), format = 'f',
			digits = 1),
			pstar(binom.test(c(sum(cov1 == TRUE  & cov2 == FALSE),
			sum(cov1 == FALSE & cov2 == TRUE)))$p.value))
	}
	
	sapply(variables, compareCoverageVar)	
}

maketable <- function(comparison){
	# comparison is a function such as compareCoverage, compareBias
	compare <- cbind(comparison('rf10', 'mice'),
		comparison('rf100', 'mice'),
		comparison('rf10', 'rf100'))
	compare <- cbind(rownames(compare), compare)
	compare <- rbind(
		c('', 'RFcont MICE 10 vs', 'RFcont MICE 100 vs',
			'RFcont MICE 10 vs'),
		c('Coefficient', 'parametric MICE',
			'parametric MICE', 'RFcont MICE 100'),
		compare)
	rownames(compare) <- NULL
	print(xtable(compare), include.rownames = FALSE,
		include.colnames = FALSE, floating = FALSE,
		hline.after = c(0, 2, nrow(compare)))
	
	cat('\n\\vspace{1em}\n')
	
	compare <- cbind(comparison('rfdoove10', 'rf10'),
		comparison('rfdoove10', 'cart'),
		comparison('rfdoove10', 'rfdoove100'))
	compare <- cbind(rownames(compare), compare)
	compare <- rbind(
		c('', 'RF Doove MICE 10 vs', 'RF Doove MICE 10 vs',
			'RF Doove MICE 10 vs'),
		c('Coefficient', 'RFcont MICE 10',
			'CART MICE', 'RF Doove MICE 100'),
		compare)
	rownames(compare) <- NULL
	print(xtable(compare), include.rownames = FALSE,
		include.colnames = FALSE, floating = FALSE,
		hline.after = c(0, 2, nrow(compare)))
}


###################################################
### code chunk number 14: simstudy_survival.Rnw:699-702
###################################################
# Chunk 14

maketable(compareBias)


###################################################
### code chunk number 15: simstudy_survival.Rnw:711-714
###################################################
# Chunk 15

maketable(compareVariance)


###################################################
### code chunk number 16: simstudy_survival.Rnw:724-727
###################################################
# Chunk 16

maketable(compareCIlength)


###################################################
### code chunk number 17: simstudy_survival.Rnw:736-739
###################################################
# Chunk 17

maketable(compareCoverage)


###################################################
### code chunk number 18: simstudy_survival.Rnw:773-784
###################################################
# Chunk 18

showfunction <- function(functionname){
	cat(paste(functionname, '<-',
		paste(capture.output(print(get(functionname))),
		collapse = '\n')))
	cat('\n')
	invisible(NULL)
}
showfunction('makeSurv')
showfunction('makeMarSurv')


###################################################
### code chunk number 19: simstudy_survival.Rnw:789-803
###################################################
# Chunk 19

showfunction('coxfull')
showfunction('coximpute')
showfunction('domissf')
showfunction('mice.impute.cart')
showfunction('mice.impute.rfdoove10')
showfunction('mice.impute.rfdoove100')
showfunction('mice.impute.rfcont5')
showfunction('mice.impute.rfcont10')
showfunction('mice.impute.rfcont20')
showfunction('mice.impute.rfcont100')
showfunction('domice')
showfunction('doanalysis')


###################################################
### code chunk number 20: simstudy_survival.Rnw:808-815
###################################################
# Chunk 20

showfunction('pstar')
showfunction('compareBias')
showfunction('compareVariance')
showfunction('compareCIlength')
showfunction('compareCoverage')


###################################################
### code chunk number 21: simstudy_survival.Rnw:820-825
###################################################
# Chunk 21

showfunction('getParams')
showfunction('showTable')
showfunction('maketable')