```r
# Libraries
library(futile.logger)
library(parallel)
library(glmnet)
library(loose.rock)
library(digest)
library(ggplot2)
library(survminer)
library(reshape2)
library(survival)
library(brca.data)
library(Vennerable)
library(limma)
library(tidyverse)
library(forcats)
library(survival.owl)
library(biclust)
# library(glmSparseNet)
require(doMC)
registerDoMC(cores = params$n.cores)

# devtools::install_github('averissimo/glmnet')
# devtools::load_all()
# .Last.value <- base.dir(path = '/ssd_home/averissimo/work/rpackages/network.cox-cache')
.Last.value <- show.message(FALSE)
.Last.value <- flog.layout(layout.format('[~l ~t] ~m'))
.Last.value <- flog.appender(appender.tee(file.path('logs',
  sprintf('logger-optimal-%s.txt', format(Sys.time(), '%Y%m%d-%Hh%Mm%S')))))
theme_set(theme_minimal())
```
The `prepare.tcga.survival.data.old` function. The chunk below echoes the call and its parameters:

```r
cat("```\n")
cat("# Number of resampling folds: ", params$seed.additional, '\n', sep = '')
cat('prepare.tcga.survival.data(\'', params$project, '\',\n', sep = '')
cat(' \'', params$tissue, '\',\n', sep = '')
cat(' normalization = \'', params$normalize, '\',\n', sep = '')
cat(' log2.pre.normalize = ', params$log2, ',\n', sep = '')
cat(' handle.duplicates = \'', params$handle.duplicates, '\',\n', sep = '')
cat(' coding.genes = ', params$coding.genes, ')\n', sep = '')
cat("```\n")
```
```r
my.data <- run.cache(prepare.tcga.survival.data, params$project, params$tissue,
                     # input.type = 'rna',
                     normalization = params$normalize,
                     log2.pre.normalize = params$log2,
                     handle.duplicates = params$handle.duplicates,
                     coding.genes = params$coding.genes,
                     subtract.surv.column = params$subtract.surv.column,
                     cache.prefix = 'tcga-data.new',
                     show.message = TRUE)
#
clinical <- my.data$clinical
#
xdata <- my.data$xdata
ydata <- my.data$ydata
xdata.raw <- my.data$xdata.raw
#
xdata.digest.cache <- my.data$xdata.digest
xdata.raw.digest.cache <- my.data$xdata.raw.digest
ydata.digest.cache <- my.data$ydata.digest
#
rm(my.data)
```
```r
ggsurv <- survfit(Surv(time, status) ~ 1, data = ydata %>% mutate(time = time / 365 * 12)) %>%
  ggsurvplot(data = ydata,
             conf.int = FALSE,
             risk.table = TRUE,
             cumcensor = TRUE,
             xlab = "Time in months",
             tables.height = .1,
             surv.median.line = "hv",  # add the median survival pointer
             break.x.by = 12,
             # ncensor.plot = TRUE,
             ggtheme = theme_minimal())
#
ggsurv$plot <- ggsurv$plot + theme(legend.position = 'none')
# + scale_x_continuous('x', breaks = seq(0, max((ydata %>% mutate(time = time/365*12))$time) + 20, 20))
ggsurv$ncensor.plot <- ggpar(ggsurv$ncensor.plot, font.y = c(0, "bold.italic", "darkgreen"))
ggsurv$table <- ggpar(ggsurv$table, font.y = c(0, "bold.italic", "darkgreen"))
ggsurv
```
Using the `r params$degree.type` network.
```r
if (params$degree.type == 'oldie') {
  #
  # Load old matrix used in SPARSA conference
  load(sprintf('/home/averissimo/work/rpackages/brca.analysis/data/degree-%.6f.RData',
               params$degree.cutoff))
} else if (params$degree.type == 'correlation') {
  #
  # Load degree of network calculated summing the inverse of each weight
  degree <- degree.cor(xdata.raw,
                       consider.unweighted = params$degree.unweighted,
                       cutoff = params$degree.cutoff,
                       method = params$degree.correlation,
                       n.cores = params$n.cores)
} else if (params$degree.type == 'covariance') {
  #
  # Load degree of network calculated using covariance
  degree <- degree.cov(xdata.raw,
                       consider.unweighted = params$degree.unweighted,
                       cutoff = params$degree.cutoff,
                       method = params$degree.correlation,
                       n.cores = params$n.cores)
} else if (params$degree.type == 'sparsebn') {
  #
  # Load degree of network from sparsebn package (calculates a Bayesian network)
  degree <- degree.sparsebn(xdata.raw,
                            cutoff = params$degree.cutoff,
                            consider.unweighted = params$degree.unweighted,
                            n.cores = params$n.cores)
} else if (params$degree.type == 'string') {
  #
  # Load degree of STRING network downloaded from external sources
  degree <- run.cache(stringDBhomoSapiens, cache.prefix = "stringDB") %>%
    { run.cache(buildStringNetwork, ., use.names = 'ensembl', cache.prefix = 'string-network') } %>%
    {
      .@x[.@x <= params$degree.cutoff] <- 0
      if (params$degree.unweighted) { .@x <- (.@x != 0) * 1 }
      Matrix::colSums(.) + Matrix::rowSums(.)
    }
  #
  # Adds missing genes as nodes with 0 degree
  degree <- colnames(xdata.raw)[!colnames(xdata.raw) %in% names(degree)] %>%
    { c(degree, array(0, length(.), dimnames = list(.))) } %>%
    { .[colnames(xdata.raw)] }
} else if (params$degree.type == 'string.pedro') {
  #
  # Load degree of STRING network downloaded from external sources
  load('/ssd_home/averissimo/work/rpackages/network.cox-big.files/saves/degree-string.RData')
  ix.names <- colnames(xdata) %in% names(degree_uw)
  degree <- degree_uw[colnames(xdata)]
}
```
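For reference, a minimal sketch of what a correlation-based node degree looks like, assuming edges are absolute pairwise correlations thresholded at the cutoff. This is illustrative only; the actual `degree.cor` adds caching and parallel computation.

```r
# Minimal sketch of a correlation-based node degree (assumption: degree.cor
# follows this idea, plus caching and parallelism)
degree.cor.sketch <- function(xdata, cutoff = 0.05, consider.unweighted = FALSE) {
  w <- abs(stats::cor(xdata))   # edge weight = |correlation| between genes
  diag(w) <- 0                  # no self-loops
  w[w <= cutoff] <- 0           # drop weak edges below the cutoff
  if (consider.unweighted) {
    w <- (w != 0) * 1           # binarize: count edges instead of weights
  }
  colSums(w)                    # degree = sum of incident edge weights
}

# Toy usage on random data
degree.cor.sketch(matrix(rnorm(100 * 4), ncol = 4), cutoff = 0.1)
```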
`trans.fun` is a double power used to scale the values; see `?glmSparseNet::heuristic.scale` or `?glmSparseNet::hub.heuristic`.
```r
# see ?glmSparseNet::heuristic.scale
trans.fun <- function(x) { heuristicScale(x) + 0.2 }
```
```r
original.penalty.factor <- degree
names(original.penalty.factor) <- colnames(xdata)

##########################
#                        #
#   SUPER IMPORTANT!!!!  #
#                        #
##########################
original.penalty.factor[is.na(original.penalty.factor)] <- 0

norm.orig.penalty.factor <- original.penalty.factor /
  max(original.penalty.factor[!is.na(original.penalty.factor)])
#
# DegreeCox (old and log(old))
#
penalty.factor.degree.log <- penalty.factor.degree.old <- 1 / norm.orig.penalty.factor
inf.ix <- is.infinite(penalty.factor.degree.old)  # index for infinite values
# log(old)
log.penal <- log(penalty.factor.degree.old[!inf.ix]) + 1
penalty.factor.degree.log[!inf.ix] <- log.penal
penalty.factor.degree.log[inf.ix] <- max(log.penal) + 1
# old
non.log <- penalty.factor.degree.old[!inf.ix]
penalty.factor.degree.old[inf.ix] <- max(non.log) + 1
#
# DegreeCox heuristic
#
penalty.factor.degree.new <- trans.fun(1 - norm.orig.penalty.factor)
#
# OrphanCox
#
penalty.factor.orphan <- trans.fun(norm.orig.penalty.factor)
```
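As a quick sanity check, with hypothetical normalized degrees and assuming `heuristicScale` increases from 0 (as in glmSparseNet's hub heuristic), the hub penalty `trans.fun(1 - degree)` should be smallest for hubs, while the orphan penalty `trans.fun(degree)` is smallest for orphans:

```r
# Hypothetical normalized degrees: an orphan (0), an average node (0.5), a hub (1)
deg <- c(orphan = 0, mid = 0.5, hub = 1)
rbind(hub.penalty    = trans.fun(1 - deg),  # hubs are penalized the least
      orphan.penalty = trans.fun(deg))      # orphans are penalized the least
```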
Testing with cross-validation to find the optimal number of variables for the comparison models.
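`my.cv.glmnet` is a helper defined elsewhere in this project. A minimal sketch of what it presumably wraps, assuming a cross-validated Cox elastic net with per-gene penalty factors (the function name and exact arguments below are hypothetical):

```r
library(glmnet)
library(survival)

# Hypothetical sketch: cross-validated Cox elastic net where `penalty.factor`
# carries the network-degree transform for each gene
my.cv.glmnet.sketch <- function(alpha, xdata, ydata, penalty.factor, seed = 1985) {
  set.seed(seed)
  cv.glmnet(xdata, Surv(ydata$time, ydata$status), family = 'cox',
            alpha = alpha, penalty.factor = penalty.factor, nfolds = 10)
}
```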
```r
alpha.models <- tibble(alpha = numeric(), seed = numeric(),
                       glmnet = list(), hub = list(), orphan = list())
for (ix.seed in seq(params$seed, params$seed + params$seed.additional)) {
  flog.info('Starting run with seed %d', ix.seed)
  for (ix.alpha in params$alpha) {
    # aux.call <- function(xdata, ydata, network, alpha, seed) {
    #   my.cv.glmnet(alpha, xdata, ydata, network, parallel = TRUE, seed = seed)
    # }
    aux.call <- function(xdata, ydata, network, alpha, seed) {
      my.cv.glmnet(alpha, xdata, ydata, network, parallel = params$n.cores, seed = seed)
    }
    cv.classic <- run.cache(aux.call, xdata, ydata, rep(1, ncol(xdata)), ix.alpha, ix.seed,
                            cache.prefix = 'cv.glmSparseNet.elastic.net.glmnet',
                            cache.digest = list(xdata.digest.cache))
    flog.info(' - Finished elastic net (%d)', sum(cv.classic$coef != 0))
    cv.hub <- run.cache(aux.call, xdata, ydata, penalty.factor.degree.new, ix.alpha, ix.seed,
                        cache.prefix = 'cv.glmSparseNet.hub.glmnet',
                        cache.digest = list(xdata.digest.cache))
    flog.info(' - Finished hub (%d)', sum(cv.hub$coef != 0))
    cv.orphan <- run.cache(aux.call, xdata, ydata, penalty.factor.orphan, ix.alpha, ix.seed,
                           cache.prefix = 'cv.glmSparseNet.orphan.glmnet',
                           cache.digest = list(xdata.digest.cache))
    flog.info(' - Finished orphan (%d)', sum(cv.orphan$coef != 0))
    new.line <- tibble(alpha = ix.alpha, seed = ix.seed,
                       glmnet = list(cv.classic$model),
                       hub = list(cv.hub$model),
                       orphan = list(cv.orphan$model))
    alpha.models <- rbind(alpha.models, new.line)
    save(new.line, file = sprintf('cv.value/cv.value_%s_xdata-%s_ydata-%s',
                                  loose.rock::digest.cache(new.line),
                                  xdata.digest.cache,
                                  loose.rock::digest.cache(ydata)))
    flog.info(' * Finished run with alpha %.3f', ix.alpha)
  }
}
```
```r
sel.all <- apply(alpha.models[, -2], 1:2, function(ix.name) {
  if (is.numeric(ix.name[[1]])) {
    return(ix.name[[1]])
  } else {
    best.coef <- coef(ix.name[[1]], s = 'lambda.min')[, 1]
    return(names(best.coef[best.coef != 0]))
  }
}) %>% as.data.frame %>% as.tbl

sel.all$alpha <- as.numeric(sel.all$alpha)

sel.all %>%
  as.data.frame %>%
  group_by(alpha) %>%
  summarise_all(funs(mean = mean(sapply(., length)),
                     sd = sd(sapply(., length)),
                     unique = length(unique(unlist(.))))) %>%
  knitr::kable()
```
Kaplan-Meier estimator (per `alpha` parameter). Individuals are separated into two groups, with high and low risk of the event (death), according to the coefficients estimated by the model; the median relative risk is used as the cut point.
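A minimal, self-contained sketch of that median split on toy data (the variables below are simulated stand-ins for `xdata`/`ydata`; `separate2GroupsCox` handles this internally):

```r
library(survival)

set.seed(42)
# Toy stand-ins for xdata/ydata
x <- matrix(rnorm(100 * 5), ncol = 5)
beta <- c(1, -1, 0.5, 0, 0)               # pretend these are the fitted coefficients
y <- data.frame(time = rexp(100, rate = exp(drop(x %*% beta)) / 50),
                status = rbinom(100, 1, 0.7))

# Median split on the relative risk, then a log-rank test between the groups
relative.risk <- exp(drop(x %*% beta))
group <- factor(relative.risk > median(relative.risk),
                levels = c(FALSE, TRUE), labels = c('Low risk', 'High risk'))
survdiff(Surv(time, status) ~ group, data = cbind(y, group = group))
```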
```r
km.all <- apply(alpha.models[, -2], 1:2, function(ix.name) {
  if (is.numeric(ix.name[[1]])) {
    return(ix.name[[1]])
  } else {
    best.coef <- coef(ix.name[[1]], s = 'lambda.min')[, 1]
    if (sum(best.coef != 0) == 0) {  # no variables were selected
      return(NA)
    }
    return(separate2GroupsCox(best.coef, xdata, ydata)$pvalue)
  }
})

km.all %>%
  as.data.frame %>%
  group_by(alpha) %>%
  summarise_all(funs(mean = mean(., na.rm = TRUE),
                     SE = sd(., na.rm = TRUE))) %>%
  knitr::kable()
```
```r
km.all %>%
  as.data.frame %>%
  melt(id.vars = 'alpha') %>%
  mutate(Alpha = factor(alpha), Model = variable) %>%
  ggplot(aes(x = Alpha, y = value, color = Model)) +
  geom_boxplot(alpha = .5) +
  # geom_jitter(width = 0.2, size = .5) +
  # expand_limits(y = .0025) +
  ggtitle('Kaplan-Meier estimator (per alpha parameter)',
          subtitle = "Two groups (High/Low risk of event) are estimated based on Cox model's coefficients")
```
Concordance C-index (per `alpha` parameter). The concordance C-index is a value between 0 and 1 that represents the pairwise concordance between survival time and the relative risk calculated from the Cox model. In other words, it assesses, for each pair of individuals, whether the individual with the lower risk survives longer than the one with the higher risk.
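For intuition, a naive O(n²) implementation of that pairwise definition (Harrell's C, ignoring ties in time; the report itself uses `fit.risk`):

```r
# Naive concordance index: among comparable pairs (the earlier time is an
# observed event), count pairs where the higher-risk individual dies first
c.index.naive <- function(time, status, risk) {
  concordant <- 0
  comparable <- 0
  for (i in seq_along(time)) {
    for (j in seq_along(time)) {
      if (time[i] < time[j] && status[i] == 1) {
        comparable <- comparable + 1
        if (risk[i] > risk[j]) {
          concordant <- concordant + 1
        } else if (risk[i] == risk[j]) {
          concordant <- concordant + 0.5   # ties in risk count as half
        }
      }
    }
  }
  concordant / comparable
}

# Toy usage: the shortest survivor has the highest risk, so C = 1
c.index.naive(time = c(5, 3, 8), status = c(1, 1, 0), risk = c(0.2, 0.9, 0.1))
```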
```r
c.index.all <- apply(alpha.models[, -2], 1:2, function(ix.name) {
  if (is.numeric(ix.name[[1]])) {
    return(ix.name[[1]])
  } else {
    best.coef <- coef(ix.name[[1]], s = 'lambda.min')[, 1]
    if (sum(best.coef != 0) == 0) {  # no variables were selected
      return(NA)
    }
    return(fit.risk(best.coef, xdata, ydata, NULL))
  }
})

c.index.all %>%
  as.data.frame %>%
  group_by(alpha) %>%
  summarise_all(funs(mean = mean(., na.rm = TRUE),
                     SE = sd(., na.rm = TRUE))) %>%
  knitr::kable()
```
```r
c.index.all %>%
  as.data.frame %>%
  melt(id.vars = 'alpha') %>%
  mutate(Alpha = factor(alpha), Model = variable) %>%
  ggplot(aes(x = Alpha, y = value, color = Model)) +
  geom_boxplot(alpha = .5) +
  expand_limits(y = c(0, 1)) +
  # geom_jitter(width = 0.2) +
  ggtitle('Concordance C-Index (per alpha parameter)')
```
```r
show.params(params)
```
```r
set.seed(1985)
foldid <- loose.rock::balanced.cv.folds(ydata$status, nfolds = 10)$output
#
new.model <- my.cv.glmnet(.2, xdata, ydata, penalty.factor.degree.new,
                          parallel = params$n.cores, seed = 1985)
new.model2 <- my.cv.glmnet(.2, xdata, ydata, penalty.factor.degree.new,
                           parallel = params$n.cores, seed = 1985)
```