## this file is for running ODAC assuming heterogeous baseline hazard among sites
## please update ODACO package to v2.0,
## DistCox() now accept input data as data frame
## please make the first 3 columns of the input data as: site id, time, status
## and the rest numeric covariates
## I'm still using the example of AMI / Stroke
## Db.names <- c('ccae', 'Jmdc', 'mdcd', 'optum')
## please also specify the var names for (time, status, X1, X2) in the script below
## output are .csv files of beta and var est, and also a pdf of cumulative hazards plot
require(KernSmooth)
Db.names <- c('ccae', 'mdcr', 'mdcd', 'optum') # 'Jmdc',
evaluateCoxHetero <- function(studyFolder, data, outcomeName,
sub.prop=1, sub.min=30){ # proportion of a subsample
# the first 3 columns of the input data frame are: site id, time, status
# and the rest are covariates (numeric)
n.total <- nrow(data)
Db.names <- unique(data$database)
## randomly select a subsample of each site, require number of cases > sub.min
tab.site.case <- matrix(NA, length(Db.names), 3)
if(sub.prop < 1){
id.sub <- c()
for(idb in 1:length(Db.names)){
id.Db <- which(data$database == Db.names[idb])
tab.site.case[idb, ] <- c(Db.names[idb], sum(data$y[id.Db]), length(id.Db))
# if(sum(data$y[id.Db])*sub.prop < sub.min){
# warning(paste0(Db.names[idb], ' too small to subsample, removed!'))
# next
# }
# nsub <- round(length(id.Db)*sub.prop)
## sub.prop adaptive for each site so that the avg sampled cases in each site is sub.min
nsub <- round(length(id.Db)* sub.min / sum(data$y[id.Db]) )
id.sub1 <- sample(id.Db, nsub, replace = F)
iss <- 1
while(sum(data$y[id.sub1]) < sub.min & iss < 10){
id.sub1 <- sample(id.Db, nsub, replace = F)
iss <- iss + 1
}
id.sub <- c(id.sub, id.sub1)
}
data <- data[id.sub,]
}
colnames(tab.site.case) <- c(outcomeName, 'n.cases', 'n.subjects')
print(tab.site.case)
write.csv(tab.site.case, file.path(studyFolder, sprintf("tab.site.case_%s.csv", outcomeName)), row.names = FALSE)
n.total <- nrow(data)
if (n.total == 0) {
warning("No data left after sampling. Terminating")
return()
}
Db.names <- unique(as.character(data$database))
colnames(data) <- gsub("=", "", colnames(data))
predictors <- colnames(data)
predictors <- predictors[!(predictors %in% c("time", "y", "database"))]
px <- length(predictors)
form <- as.formula(paste("Surv(time, y) ~", paste(predictors, collapse = " + ")))
pooledCox <- coxph(form, data=data)
## stratified Cox will be used as golden standard
stratForm <- as.formula(paste("Surv(time, y) ~", paste(predictors, collapse = " + "), "+ strata(database)"))
stratCox <- coxph(stratForm, data=data)
# get localCox
local.all <- list()
time.max <- H0.max <- h0.max <- 0
sum_inv_var <- matrix(0, px, px)
sum_inv_var_b <- rep(0, px)
for(idb in 1:length(Db.names)){
localDb <- Db.names[idb]
cat(localDb, '...\n')
# Now don't need to transfer data frame to list
localCox <- coxph(form, data[data$database == localDb, ])
# prepare weighted est
sum_inv_var <- sum_inv_var + solve(localCox$var)
sum_inv_var_b <- sum_inv_var_b + solve(localCox$var, localCox$coef)
local.all[[localDb]]$localCox <- localCox
local.all[[localDb]]$n.localDb <- sum(data$database == localDb)
# cumulative baseline haz
H0 <- basehaz(localCox, centered=FALSE)
local.all[[localDb]]$H0.localCox <- H0
time.max <- max(time.max, max(H0$time))
H0.max <- max(H0.max, max(H0$hazard))
# set a common beta to calculate basehaz at each site, must be $coefficients, not $coef
localCox$coefficients <- stratCox$coef
H0 <- basehaz(localCox, centered=FALSE)
local.all[[localDb]]$H0.stratCox <- H0
# get baseline haz, keep h0 only at event times (h0=0 at censored times)
h0 <- diff(H0$hazard)
time0 <- H0$time
time0 <- time0[-length(time0)]
time0 = time0[h0!=0]
h0 = h0[h0!=0]
local.all[[localDb]]$h0 <- list(y=h0, x=time0)
# kernal smoothing h0
local.all[[localDb]]$h0.ks.20 <- ksmooth(time0, h0, bandwidth = 20)
local.all[[localDb]]$h0.ks.50 <- ksmooth(time0, h0, bandwidth = 50)
local.all[[localDb]]$h0.ks.normal <- ksmooth(time0, h0, bandwidth = 50, kernel = 'normal')
h0.max <- max(h0.max, max(h0))
# ss <- survfit(localCox)
}
# get avgCox: inverse-variance (i.e. hessian) weighted est
var_avgCox <- solve(sum_inv_var)
beta_avgCox <- var_avgCox %*% (sum_inv_var_b)
# plot the cumulative hazard curve of each site
# pdf(file.path(studyFolder, sprintf('cumhaz_all_sites_localCox_%s.pdf', outcomeName)))
# for(idb in 1:length(Db.names)){
# localDb <- Db.names[idb]
# if(localDb==Db.names[1])
# plot(hazard ~ time, data=local.all[[localDb]]$H0.localCox, type='l',
# xlim=c(0, time.max), ylim=c(0, H0.max),
# xlab='time', ylab='cumhaz', main=paste0(outcomeName, ', beta_localCox') )
# else
# lines(hazard ~ time, data=local.all[[localDb]]$H0.localCox, col=idb, lty=idb)
#
# }
# legend('topleft', legend=Db.names, # x=0.1*time.max, y=0.9*H0.max
# cex=.8, col=1:length(Db.names), lty=1:length(Db.names))
# dev.off()
pdf(file.path(studyFolder, sprintf('basehaz_stratCox_%s.pdf', outcomeName)))
for(idb in 1:length(Db.names)){
localDb <- Db.names[idb]
if(localDb==Db.names[1])
# plot(hazard ~ time, data=local.all[[localDb]]$H0.stratCox, type='l',
# xlim=c(0, time.max), ylim=c(0, H0.max),
# xlab='time', ylab='cumhaz', main=paste0(outcomeName, ', beta_stratCox'))
plot(local.all[[localDb]]$h0$y ~ local.all[[localDb]]$h0$x, type='s',
xlim=c(0, time.max), ylim=c(0, h0.max),
xlab='time', ylab='baseline hazard', main=paste0(outcomeName, ', beta_stratCox'))
else
lines(local.all[[localDb]]$h0$y ~ local.all[[localDb]]$h0$x, type='s',
col=idb, lty=idb)
}
legend('topleft', legend=Db.names,
cex=.8, col=1:length(Db.names), lty=1:length(Db.names))
dev.off()
## ksmooth(bandwidth = 20), box kernel
pdf(file.path(studyFolder, sprintf('basehaz_ks20_stratCox_%s.pdf', outcomeName)))
for(idb in 1:length(Db.names)){
localDb <- Db.names[idb]
if(localDb==Db.names[1])
plot(local.all[[localDb]]$h0.ks.20$y ~ local.all[[localDb]]$h0.ks.20$x, type='l',
xlim=c(0, time.max), ylim=c(0, h0.max),
xlab='time', ylab='baseline hazard', main=paste0(outcomeName, ', beta_stratCox (ksmooth, bw=20)'))
else
lines(local.all[[localDb]]$h0.ks.20$y ~ local.all[[localDb]]$h0.ks.20$x, type='l',
col=idb, lty=idb)
}
legend('topleft', legend=Db.names,
cex=.8, col=1:length(Db.names), lty=1:length(Db.names))
dev.off()
## ksmooth(bandwidth = 50)
pdf(file.path(studyFolder, sprintf('basehaz_ks50_stratCox_%s.pdf', outcomeName)))
for(idb in 1:length(Db.names)){
localDb <- Db.names[idb]
if(localDb==Db.names[1])
plot(local.all[[localDb]]$h0.ks.50$y ~ local.all[[localDb]]$h0.ks.50$x, type='l',
xlim=c(0, time.max), ylim=c(0, h0.max),
xlab='time', ylab='baseline hazard', main=paste0(outcomeName, ', beta_stratCox (ksmooth, bw=50)'))
else
lines(local.all[[localDb]]$h0.ks.50$y ~ local.all[[localDb]]$h0.ks.50$x, type='l',
col=idb, lty=idb) #
}
legend('topleft', legend=Db.names,
cex=.8, col=1:length(Db.names), lty=1:length(Db.names))
dev.off()
## ksmooth(bandwidth = 50) using normal kernel
pdf(file.path(studyFolder, sprintf('basehaz_ks50_normal_stratCox_%s.pdf', outcomeName)))
for(idb in 1:length(Db.names)){
localDb <- Db.names[idb]
if(localDb==Db.names[1])
plot(local.all[[localDb]]$h0.ks.normal$y ~ local.all[[localDb]]$h0.ks.normal$x, type='l',
xlim=c(0, time.max), ylim=c(0, h0.max),
xlab='time', ylab='baseline hazard', main=paste0(outcomeName, ', beta_stratCox (ksmooth, bw=50, normal)'))
else
lines(local.all[[localDb]]$h0.ks.normal$y ~ local.all[[localDb]]$h0.ks.normal$x, type='l',
col=idb, lty=idb) #
}
legend('topleft', legend=Db.names,
cex=.8, col=1:length(Db.names), lty=1:length(Db.names))
dev.off()
## using each Db as Local and run DistCox
## now assume heterogeneous baseline hazards
# Reformat data:
sites <- unique(data$database)
sites <- data.frame(database = sites,
id = 1:length(sites),
stringsAsFactors = FALSE)
reformData <- merge(data, sites)
reformData <- reformData[, c("id", "time", "y", predictors)]
for (localId in sites$id) {
localDb <- sites$database[sites$id == localId]
cat(localDb, '...\n')
# proposed DistCox, assume common baseline hazards (this is already studied...)
distCoxAvgInit <- ODACO::DistCox(mydata = reformData,
id.local = localId,
init_est = beta_avgCox,
output.ODACO1 = T,
strat = F)
# proposed DistCox, assume heterogeneous baseline hazards
distCoxAvgInit.H <- ODACO::DistCox(mydata = reformData,
id.local = localId,
init_est = beta_avgCox,
output.ODACO1 = T,
strat = T)
# assign(paste0('distCoxAvgInit.H.', localDb), distCoxAvgInit.H)
tmp = rbind(local.all[[localDb]]$localCox$coef,
c(beta_avgCox),
c(distCoxAvgInit$beta_tilde1),
c(distCoxAvgInit$beta_tilde),
c(distCoxAvgInit.H$beta_tilde1),
c(distCoxAvgInit.H$beta_tilde),
pooledCox$coef,
stratCox$coef,
local.all[[localDb]]$n.localDb)
results <- data.frame(description = c("Local",
"Meta",
"ODAC1",
"ODAC2",
"ODACH1",
"ODACH2",
"pooledCox",
"stratCox",
"sampleSize"), tmp)
write.csv(results, file.path(studyFolder, sprintf("%s_beta_%s.csv", localDb, outcomeName)), row.names = FALSE)
res.var <- rbind(local.all[[localDb]]$localCox$var,
var_avgCox,
distCoxAvgInit$sol1$hessian,
distCoxAvgInit$sol$hessian,
distCoxAvgInit.H$sol1$hessian,
distCoxAvgInit.H$sol$hessian,
pooledCox$var,
stratCox$var )
write.csv(res.var, file.path(studyFolder, sprintf("%s_var_%s.csv", localDb, outcomeName)), row.names = FALSE)
}
}
evaluateCoxHeteroAllOutcomes <- function(){
library(survival)
studyFolder <- "r:/DistributedCoxHeteroEval"
pathToCsv <- system.file("settings", "KnownPredictors.csv", package = "DistributedRegressionEval")
outcomes <- read.csv(pathToCsv)
outcomes <- outcomes[!duplicated(outcomes$outcomeId), c("outcomeId", "outcomeName")]
for (outcomeId in outcomes$outcomeId) {
# outcomeId <- 6 # 3 = stroke, 5 = AMI
outcomeName <- outcomes$outcomeName[outcomes$outcomeId == outcomeId]
writeLines(paste("Evaluating outcome", outcomeName))
data <- readRDS(file.path(studyFolder, sprintf("data_o%s.rds", outcomeId)))
data$age_in_years <- (data$age_in_years - 50) / 20
# if (outcomeId == 3 | outcomeId == 6) {
data <- data[data$database != "Jmdc", ]
# }
# Remove for stroke model:
# if (outcomeId == 3) {
# data$age_in_years <- NULL
# data$`gender_=_FEMALE` <- NULL
# data$Major_depressive_disorder <- NULL
# }
if (outcomeId == 4) {
data$Endocarditis <- NULL
}
evaluateCoxHetero(file.path(studyFolder, "subProp01"), data, outcomeName, sub.prop = 0.1, sub.min =20)
evaluateCoxHetero(file.path(studyFolder, "subProp02"), data, outcomeName, sub.prop = 0.1, sub.min =15)
evaluateCoxHetero(file.path(studyFolder, "subProp03"), data, outcomeName, sub.prop = 0.1, sub.min =10)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.