inst/doc/simstudy_survival.R

### R code from vignette source 'simstudy_survival.Rnw'
### Encoding: UTF-8

###################################################
### code chunk number 1: simstudy_survival.Rnw:13-43
###################################################
# Chunk 1
library(CALIBERrfimpute)
library(missForest)
library(survival)
library(xtable)
library(rpart)
library(mice)
library(ranger)

kPmiss <- 0.2 # probability of missingness
kLogHR <- 0.5 # true log hazard ratio

# To analyse samples of more than 200 patients, (recommend about 2000,
# but this will slow down the program), set NPATS before running
# this vignette.
if (!exists('NPATS')){
	kSampleSize <- 200 # number of patients in simulated datasets
} else {
	kSampleSize <- NPATS
}
# e.g.
# NPATS <- 2000

# To analyse more than 3 samples, set N to a number greater than 3
# e.g.
# N <- 1000

# To use more than 4 imputations, set NIMPS to a number greater than 4
# e.g.
# NIMPS <- 10


###################################################
### code chunk number 2: simstudy_survival.Rnw:86-280
###################################################
# Chunk 2

#### DATA GENERATING FUNCTIONS ####

makeSurv <- function(n = 2000, loghr = kLogHR){
	# Creates a survival cohort of n patients. Assumes that censoring is
	# independent of all other variables
	
	# x1 and x2 are random normal variables
	data <- data.frame(x1 = rnorm(n), x2 = rnorm(n))
	
	# Create the x3 variable
	data$x3 <- 0.5 * (data$x1 + data$x2 - data$x1 * data$x2) + rnorm(n)
	
	# Underlying log hazard ratio for all variables is the same
	data$y <- with(data, loghr * (x1 + x2 + x3))
	data$survtime <- rexp(n, exp(data$y))

	# Censoring - assume uniform distribution of observation times
	# up to a maximum
	obstime <- runif(nrow(data), min = 0,
		max = quantile(data$survtime, 0.5))
	data$event <- as.integer(data$survtime <= obstime)
	# Generate integer survival times
	data$time <- ceiling(100 * pmin(data$survtime, obstime))
	
	# Observed marginal cumulative hazard for imputation models
	data$cumhaz <- nelsonaalen(data, time, event) 
	
	# True log hazard and survival time are not seen in the data
	# so remove them
	data$y <- NULL
	data$survtime <- NULL
	
	return(data)
}

makeMarSurv <- function(data, pmissing = kPmiss){
	# Introduces missing data dependent on event indicator
	# and cumulative hazard and x1 and x2
	
	logistic <- function(x){
		exp(x) / (1 + exp(x))
	}

	predictions <- function(lp, n){
		# uses the vector of linear predictions (lp) from a logistic model
		# and the expected number of positive responses (n) to generate
		# a set of predictions by modifying the baseline
	
		trialn <- function(lptrial){
			sum(logistic(lptrial))
		}
		stepsize <- 32
		lptrial <- lp
		# To avoid errors due to missing linear predictors (ideally
		# there should not be any missing), replace with the mean
		if (any(is.na(lptrial))){
			lp[is.na(lptrial)] <- mean(lptrial, na.rm = TRUE)
		}
		while(abs(trialn(lptrial) - n) > 1){
			if (trialn(lptrial) > n){
				# trialn bigger than required 
				lptrial <- lptrial - stepsize
			} else {
				lptrial <- lptrial + stepsize
			}
			stepsize <- stepsize / 2
		}
		# Generate predictions from binomial distribution
		as.logical(rbinom(logical(length(lp)), 1, logistic(lptrial)))
	}
	data$x3[predictions(0.1 * data$x1 + 0.1 * data$x2 +
		0.1 * data$cumhaz + 0.1 * data$event, nrow(data) * pmissing)] <- NA
	return(data)
}

#### IMPUTATION FUNCTIONS FROM DOOVE AND VAN BUUREN ####

mice.impute.rfdoove10 <- function(y, ry, x, ...){
	mice::mice.impute.rf(y = y, ry = ry, x = x, ntrees = 10)
}

mice.impute.rfdoove100 <- function(y, ry, x, ...){
	mice::mice.impute.rf(y = y, ry = ry, x = x, ntrees = 100)
}

#### OUR MICE RANDOM FOREST FUNCTIONS ####

mice.impute.rfcont5 <- function(y, ry, x, ...){
	CALIBERrfimpute::mice.impute.rfcont(
		y = y, ry = ry, x = x, ntree_cont = 5)
}

mice.impute.rfcont10 <- function(y, ry, x, ...){
	CALIBERrfimpute::mice.impute.rfcont(
		y = y, ry = ry, x = x, ntree_cont = 10)
}

mice.impute.rfcont20 <- function(y, ry, x, ...){
	CALIBERrfimpute::mice.impute.rfcont(
		y = y, ry = ry, x = x, ntree_cont = 20)
}

mice.impute.rfcont50 <- function(y, ry, x, ...){
	CALIBERrfimpute::mice.impute.rfcont(
		y = y, ry = ry, x = x, ntree_cont = 50)
}

mice.impute.rfcont100 <- function(y, ry, x, ...){
	CALIBERrfimpute::mice.impute.rfcont(
		y = y, ry = ry, x = x, ntree_cont = 100)
}

#### FUNCTIONS TO DO THE ANALYSIS ####

coxfull <- function(data){
	# Full data analysis
	coefs <- as.data.frame(summary(coxph(myformula, data = data))$coef)
	# return a data.frame of coefficients (est), upper and lower 95% limits
	out <- data.frame(est = coefs[, 'coef'], 
		lo95 = coefs[, 'coef'] + qnorm(0.025) * coefs[, 'se(coef)'],
		hi95 = coefs[, 'coef'] + qnorm(0.975) * coefs[, 'se(coef)'],
		row.names = row.names(coefs))
	out$cover <- kLogHR >= out$lo95 & kLogHR <= out$hi95
	out
}

coximpute <- function(imputed_datasets){
	# Analyses a list of imputed datasets
	docoxmodel <- function(data){
		coxph(myformula, data = data)
	}
	mirafits <- as.mira(lapply(imputed_datasets, docoxmodel))
	coefs <- as.data.frame(summary(pool(mirafits)))
	if ('term' %in% colnames(coefs)){
		row.names(coefs) <- as.character(coefs$term)
	}
	if (!('lo 95' %in% colnames(coefs))){
		# newer version of mice
		# use normal approximation for now, as assume large sample
		# and large degrees of freedom for t distribution
		out <- data.frame(est = coefs$estimate,
			lo95 = coefs$estimate + qnorm(0.025) * coefs$std.error,
			hi95 = coefs$estimate + qnorm(0.975) * coefs$std.error,
			row.names = row.names(coefs))
	} else if ('lo 95' %in% colnames(coefs)){
		# older version of mice
		out <- data.frame(est = coefs$est,
			lo95 = coefs[, 'lo 95'], hi95 = coefs[, 'hi 95'],
			row.names = row.names(coefs))
	} else {
		stop('Unable to handle format of summary.mipo object')
	}
	# Whether this confidence interval contains the true hazard ratio
	out$cover <- kLogHR >= out$lo95 & kLogHR <= out$hi95
	out
}

domissf <- function(missdata, reps = NIMPS){
	# Imputation by missForest
	out <- list()
	for (i in 1:reps){
		invisible(capture.output(
			out[[i]] <- missForest(missdata)$ximp))
	}
	out
}

domice <- function(missdata, functions, reps = NIMPS){
	mids <- mice(missdata, defaultMethod = functions,
		m = reps, visitSequence = 'monotone',
		printFlag = FALSE, maxit = 10)
	lapply(1:reps, function(x) complete(mids, x))
}

doanalysis <- function(x){
	# Creates a dataset, analyses it using different methods, and outputs
	# the result as a matrix of coefficients / SE and coverage 
	data <- makeSurv(kSampleSize)
	missdata <- makeMarSurv(data)
	out <- list()
	out$full <- coxfull(data)
	out$missf <- coximpute(domissf(missdata))
	out$rf5 <- coximpute(domice(missdata, 'rfcont5'))
	out$rf10 <- coximpute(domice(missdata, 'rfcont10'))
	out$rf20 <- coximpute(domice(missdata, 'rfcont20'))
	out$rf100 <- coximpute(domice(missdata, 'rfcont100'))
	out$rfdoove10 <- coximpute(domice(missdata, 'rfdoove10'))
	out$rfdoove100 <- coximpute(domice(missdata, 'rfdoove100'))
	out$cart <- coximpute(domice(missdata, 'cart'))
	out$mice <- coximpute(domice(missdata, 'norm'))
	out
}


###################################################
### code chunk number 3: simstudy_survival.Rnw:284-290
###################################################
# Chunk 3

mydata <- makeSurv(200)
plot(mydata[, c('x1', 'x2', 'x3')],
	main = "Associations between predictor variables in a sample dataset")
mydata <- makeSurv(20000)


###################################################
### code chunk number 4: simstudy_survival.Rnw:295-298
###################################################
# Chunk 4

summary(lm(x3 ~ x1*x2, data = mydata))


###################################################
### code chunk number 5: simstudy_survival.Rnw:301-314
###################################################
# Chunk 5

mydata <- makeSurv(2000)
mydata2 <- makeMarSurv(mydata)
# Plot non-missing data
plot(mydata$x1[!is.na(mydata2$x3)], mydata$x3[!is.na(mydata2$x3)],
	pch = 19, xlab = 'x1', ylab = 'x3')
# Plot missing data
points(mydata$x1[is.na(mydata2$x3)], mydata$x3[is.na(mydata2$x3)],
	col = 'red', pch = 19)
legend('bottomright', legend = c('x3 observed', 'x3 missing'),
	col = c('black', 'red'), pch = 19)
title('Association of predictor variables x1 and x3')


###################################################
### code chunk number 6: simstudy_survival.Rnw:319-345
###################################################
# Chunk 6

# Cox proportional hazards analysis
myformula <- as.formula(Surv(time, event) ~ x1 + x2 + x3)

# Analysis with 10,000 simulated patients (or more
# if the variable REFERENCE_SAMPLESIZE exists)
if (!exists('REFERENCE_SAMPLESIZE')){
	REFERENCE_SAMPLESIZE <- 10000
}

# Use parallel processing, if available, to create
# datasets more quickly.
if ('parallel' %in% loadedNamespaces() &&
	!is.null(getOption('mc.cores')) &&
	.Platform$OS.type == 'unix'){
	REFERENCE_SAMPLESIZE <- REFERENCE_SAMPLESIZE %/%
		getOption('mc.cores')
	simdata <- parallel::mclapply(1:getOption('mc.cores'),
		function(x) makeSurv(REFERENCE_SAMPLESIZE))
	simdata <- do.call('rbind', simdata)
} else {
	simdata <- makeSurv(REFERENCE_SAMPLESIZE)
}

summary(coxph(myformula, data = simdata))


###################################################
### code chunk number 7: simstudy_survival.Rnw:367-387
###################################################
# Chunk 7

# Setting analysis parameters: To analyse more than 3 samples,
# set N to the desired number before running this program
if (!exists('N')){
	N <- 3
}
# Number of imputations (set to at least 10 when
# running an actual simulation)
if (!exists('NIMPS')){
	NIMPS <- 4
}
# Use parallel processing if the 'parallel' package is loaded
if ('parallel' %in% loadedNamespaces() &&
	.Platform$OS.type == 'unix'){
	cat('Using parallel processing\n')
	results <- parallel::mclapply(1:N, doanalysis)
} else {
	results <- lapply(1:N, doanalysis)
}


###################################################
### code chunk number 8: simstudy_survival.Rnw:416-455
###################################################
# Chunk 8

getParams <- function(coef, method){
	estimates <- sapply(results, function(x){
		x[[method]][coef, 'est']
	})
	bias <- mean(estimates) - kLogHR
	se_bias <- sd(estimates) / sqrt(length(estimates))
	mse <- mean((estimates - kLogHR) ^ 2)
	ci_len <- mean(sapply(results, function(x){
		x[[method]][coef, 'hi95'] - x[[method]][coef, 'lo95']
	}))
	ci_cov <- mean(sapply(results, function(x){
		x[[method]][coef, 'cover']
	}))
	out <- c(bias, se_bias, mse, sd(estimates), ci_len, ci_cov)
	names(out) <- c('bias', 'se_bias', 'mse', 'sd', 'ci_len', 'ci_cov')
	out
}

showTable <- function(coef){
	methods <- c('full', 'missf', 'cart', 'rfdoove10',
		'rfdoove100', 'rf5', 'rf10', 'rf20', 'rf100', 'mice')
	methodnames <- c('Full data', 'missForest', 'CART MICE',
		'RF Doove MICE 10', 'RF Doove MICE 100',
		paste('RFcont MICE', c(5, 10, 20, 100)),
		'Parametric MICE')
	out <- t(sapply(methods, function(x){
		getParams(coef, x)
	}))
	out <- formatC(out, digits = 3, format = 'fg')
	out <- rbind(c('', 'Standard', 'Mean', 'SD of', 'Mean 95%',
		'95% CI'), c('Bias', 'error of bias', 'square error', 'estimate',
		'CI length', 'coverage'), out)
	out <- cbind(c('', '', methodnames), out)
	rownames(out) <- NULL
	print(xtable(out), floating = FALSE, include.rownames = FALSE,
		include.colnames = FALSE, hline.after = c(0, 2, nrow(out)))
}


###################################################
### code chunk number 9: simstudy_survival.Rnw:468-471
###################################################
# Chunk 9

showTable('x1')


###################################################
### code chunk number 10: simstudy_survival.Rnw:480-483
###################################################
# Chunk 10

showTable('x2')


###################################################
### code chunk number 11: simstudy_survival.Rnw:493-496
###################################################
# Chunk 11

showTable('x3')


###################################################
### code chunk number 12: simstudy_survival.Rnw:506-530
###################################################
# Chunk 12

numtrees <- c(5, 10, 20, 100)
bias <- sapply(numtrees, function(x){
	getParams('x3', paste('rf', x, sep=''))['bias']
})
se_bias <- sapply(numtrees, function(x){
	getParams('x3', paste('rf', x, sep=''))['se_bias']
})
lower_bias <- bias - 1.96*se_bias
upper_bias <- bias + 1.96*se_bias

# Blank plot
plot(-100, 0, type = 'p', pch = 15, cex = 1.3, ylab = 'Bias', 
	xlab = 'Number of trees', xlim = c(0,100),
	ylim = c(min(lower_bias), max(upper_bias)))
# Zero bias line
lines(c(0,100), c(0,0), lty = 2, col = 'gray')
# Confidence interval lines
for (i in 1:5){lines(rep(numtrees[i], 2),
	c(lower_bias[i], upper_bias[i]))}
# Points
points(numtrees, bias, pch = 15, cex = 1.3)
title('Bias in estimate of x3 coefficient after\nmultiple imputation using RFcont MICE')


###################################################
### code chunk number 13: simstudy_survival.Rnw:535-690
###################################################
# Chunk 13

# Comparing confidence interval coverage and bias between:
#    RF MICE 100 trees
#    RF MICE 10 trees
#    Parametric MICE

# Names of the variables in the comparison
variables <- c('x1', 'x2', 'x3')

pstar <- function(x){
	if (!is.na(x)){
		if (x < 0.001){
			'***'
		} else if (x < 0.01){
			'**'
		} else if (x < 0.05){
			'*'
		} else {
			''
		}
	} else {
		''
	}
}

compareBias <- function(method1, method2){
	# Generates a table comparing bias
	# Comparison statistic is the difference in absolute bias
	# (negative means first method is better)
	
	compareBiasVar <- function(varname){
		# All coefficients should be kLogHR
		bias1 <- sapply(results, function(x){
			x[[method1]][varname, 'est']
		}) - kLogHR
		bias2 <- sapply(results, function(x){
			x[[method2]][varname, 'est']
		}) - kLogHR

		if (sign(mean(bias1)) == -1){
			bias1 <- -bias1
		}
		if (sign(mean(bias2)) == -1){
			bias2 <- -bias2
		}
		
		paste(formatC(mean(bias1) - mean(bias2), format = 'fg', digits = 3),
			pstar(t.test(bias1 - bias2)$p.value))
	}
	
	sapply(variables, compareBiasVar)
}

compareVariance <- function(method1, method2){
	# Generates a table comparing precision between two methods
	# Comparison statistic is ratio of variance
	# (smaller means first method is better)
	
	compareVarianceVar <- function(varname){
		e1 <- sapply(results, function(x){
			x[[method1]][varname, 'est']
		})
		e2 <- sapply(results, function(x){
			x[[method2]][varname, 'est']
		})
		paste(formatC(var(e1) / var(e2), format = 'fg', digits = 3),
			pstar(var.test(e1, e2)$p.value))
	}
	
	sapply(variables, compareVarianceVar)
}

compareCIlength <- function(method1, method2){
	# Generates a table comparing coverage percentage between two methods
	# Comparison statistic is the ratio of confidence interval lengths
	# (less than 1 = first better)
	
	compareCIlengthVar <- function(varname){
		# Paired t test for bias (difference in estimate)
		len1 <- sapply(results, function(x){
			x[[method1]][varname, 'hi95'] -
				x[[method1]][varname, 'lo95']
		})
		len2 <- sapply(results, function(x){
			x[[method2]][varname, 'hi95'] -
				x[[method2]][varname, 'lo95']
		})
		
		paste(formatC(mean(len1) / mean(len2),
			format = 'fg', digits = 4),
			pstar(t.test(len1 - len2)$p.value))
	}
	
	sapply(variables, compareCIlengthVar)	
}

compareCoverage <- function(method1, method2){
	# Generates a table comparing coverage percentage between two methods
	# Comparison statistic is the difference in coverage
	# (positive = first better)

	compareCoverageVar <- function(varname){
		# Paired t test for bias (difference in estimate)
		
		cov1 <- sapply(results, function(x){
			x[[method1]][varname, 'cover']
		})
		cov2 <- sapply(results, function(x){
			x[[method2]][varname, 'cover']
		})
				
		paste(formatC(100 * (mean(cov1) - mean(cov2)), format = 'f',
			digits = 1),
			pstar(binom.test(c(sum(cov1 == TRUE  & cov2 == FALSE),
			sum(cov1 == FALSE & cov2 == TRUE)))$p.value))
	}
	
	sapply(variables, compareCoverageVar)	
}

maketable <- function(comparison){
	# comparison is a function such as compareCoverage, compareBias
	compare <- cbind(comparison('rf10', 'mice'),
		comparison('rf100', 'mice'),
		comparison('rf10', 'rf100'))
	compare <- cbind(rownames(compare), compare)
	compare <- rbind(
		c('', 'RFcont MICE 10 vs', 'RFcont MICE 100 vs',
			'RFcont MICE 10 vs'),
		c('Coefficient', 'parametric MICE',
			'parametric MICE', 'RFcont MICE 100'),
		compare)
	rownames(compare) <- NULL
	print(xtable(compare), include.rownames = FALSE,
		include.colnames = FALSE, floating = FALSE,
		hline.after = c(0, 2, nrow(compare)))
	
	cat('\n\\vspace{1em}\n')
	
	compare <- cbind(comparison('rfdoove10', 'rf10'),
		comparison('rfdoove10', 'cart'),
		comparison('rfdoove10', 'rfdoove100'))
	compare <- cbind(rownames(compare), compare)
	compare <- rbind(
		c('', 'RF Doove MICE 10 vs', 'RF Doove MICE 10 vs',
			'RF Doove MICE 10 vs'),
		c('Coefficient', 'RFcont MICE 10',
			'CART MICE', 'RF Doove MICE 100'),
		compare)
	rownames(compare) <- NULL
	print(xtable(compare), include.rownames = FALSE,
		include.colnames = FALSE, floating = FALSE,
		hline.after = c(0, 2, nrow(compare)))
}


###################################################
### code chunk number 14: simstudy_survival.Rnw:699-702
###################################################
# Chunk 14

maketable(compareBias)


###################################################
### code chunk number 15: simstudy_survival.Rnw:711-714
###################################################
# Chunk 15

maketable(compareVariance)


###################################################
### code chunk number 16: simstudy_survival.Rnw:724-727
###################################################
# Chunk 16

maketable(compareCIlength)


###################################################
### code chunk number 17: simstudy_survival.Rnw:736-739
###################################################
# Chunk 17

maketable(compareCoverage)


###################################################
### code chunk number 18: simstudy_survival.Rnw:773-784
###################################################
# Chunk 18

showfunction <- function(functionname){
	cat(paste(functionname, '<-',
		paste(capture.output(print(get(functionname))),
		collapse = '\n')))
	cat('\n')
	invisible(NULL)
}
showfunction('makeSurv')
showfunction('makeMarSurv')


###################################################
### code chunk number 19: simstudy_survival.Rnw:789-803
###################################################
# Chunk 19

showfunction('coxfull')
showfunction('coximpute')
showfunction('domissf')
showfunction('mice.impute.cart')
showfunction('mice.impute.rfdoove10')
showfunction('mice.impute.rfdoove100')
showfunction('mice.impute.rfcont5')
showfunction('mice.impute.rfcont10')
showfunction('mice.impute.rfcont20')
showfunction('mice.impute.rfcont100')
showfunction('domice')
showfunction('doanalysis')


###################################################
### code chunk number 20: simstudy_survival.Rnw:808-815
###################################################
# Chunk 20

showfunction('pstar')
showfunction('compareBias')
showfunction('compareVariance')
showfunction('compareCIlength')
showfunction('compareCoverage')


###################################################
### code chunk number 21: simstudy_survival.Rnw:820-825
###################################################
# Chunk 21

showfunction('getParams')
showfunction('showTable')
showfunction('maketable')

Try the CALIBERrfimpute package in your browser

Any scripts or data that you put into this service are public.

CALIBERrfimpute documentation built on Dec. 5, 2022, 1:07 a.m.