# Libraries

library(futile.logger)
library(parallel)
library(glmnet)
library(loose.rock)
library(digest)
library(ggplot2)
library(survminer)
library(reshape2)
library(survival)
library(brca.data)
library(Vennerable)
library(limma)
library(tidyverse)
library(forcats)
library(survival.owl)
library(biclust)
#
library(glmSparseNet)

# Register a parallel backend used by the fitting routines below
require(doMC)
registerDoMC(cores = params$n.cores)

#devtools::install_github('averissimo/glmnet')
#
devtools::load_all()
#
# Set the loose.rock cache directory and silence cache messages
# (assigning to .Last.value just discards the returned values)
.Last.value <- base.dir(path = '/ssd_home/averissimo/work/rpackages/network.cox-cache')
.Last.value <- show.message(FALSE)
# Log to the console and to a timestamped file under logs/
.Last.value <- flog.layout(layout.format('[~l ~t] ~m'))
.Last.value <- flog.appender(appender.tee(file.path('logs', sprintf('logger-optimal-%s.txt', format(Sys.time(), '%Y%m%d-%Hh%Mm%S')))))
theme_set(theme_minimal())

# Load and normalize data

Load TCGA data using the prepare.tcga.survival.data function.

cat("```\n")
cat("# Number of resampling folds: ", params$seed.additional, '\n', sep = '')
cat('prepare.tcga.survival.data(\'', params$project, '\',\n', sep = '')
cat('                           \'', params$tissue, '\',\n', sep = '')
cat('                            normalization      = \'', params$normalize, '\',\n', sep = '')
cat('                            log2.pre.normalize = ', params$log2, ',\n', sep = '')
cat('                            handle.duplicates  = \'', params$handle.duplicates, '\',\n', sep = '')
cat('                            coding.genes       = ', params$coding.genes, ')\n', sep = '')
cat("```\n")
my.data <- run.cache(prepare.tcga.survival.data,
                     params$project,
                     params$tissue,
                     #
                     #input.type                = 'rna',
                     normalization             = params$normalize,
                     log2.pre.normalize        = params$log2,
                     #
                     handle.duplicates         = params$handle.duplicates,
                     coding.genes              = params$coding.genes,
                     subtract.surv.column      = params$subtract.surv.column,
                     cache.prefix              = 'tcga-data.new',
                     show.message              = TRUE)

#
clinical <- my.data$clinical
#
xdata     <- my.data$xdata
ydata     <- my.data$ydata
xdata.raw <- my.data$xdata.raw
#
#
xdata.digest.cache     <- my.data$xdata.digest
xdata.raw.digest.cache <- my.data$xdata.raw.digest
ydata.digest.cache     <- my.data$ydata.digest
#
rm(my.data)

# Survival plot

ggsurv <- survfit(Surv(time, status) ~ 1, data = ydata %>% mutate(time = time/365*12)) %>% 
  ggsurvplot(data = ydata,
             conf.int = FALSE,
             risk.table = TRUE,
             cumcensor = TRUE,
             xlab = "Time in months",
             tables.height = .1,
             surv.median.line = "hv",  # add the median survival pointer.
             break.x.by = 12,
             #ncensor.plot = TRUE,
             ggtheme = theme_minimal())
#
ggsurv$plot <- ggsurv$plot + theme(legend.position = 'none') #+ scale_x_continuous('x', breaks = seq(0, max((ydata %>% mutate(time = time/365*12))$time) + 20, 20))
ggsurv$ncensor.plot <- ggpar(ggsurv$ncensor.plot, font.y = c(0, "bold.italic", "darkgreen"))
ggsurv$table <- ggpar(ggsurv$table, font.y = c(0, "bold.italic", "darkgreen"))
ggsurv

# Load degree data

Using the network type given by `params$degree.type`.

if (params$degree.type == 'oldie') {
  #
  # Load old matrix used in SPARSA conference
  load(sprintf('/home/averissimo/work/rpackages/brca.analysis/data/degree-%.6f.RData', params$degree.cutoff))

} else if (params$degree.type == 'correlation') {
  #
  # Degree of the network, calculated by summing the inverse of each weight
  degree <- degree.cor(xdata.raw, 
                       consider.unweighted = params$degree.unweighted, 
                       cutoff              = params$degree.cutoff, 
                       method              = params$degree.correlation,
                       n.cores = params$n.cores)
} else if (params$degree.type == 'covariance') {
  #
  # Load degree of network calculated using covariance
  degree <- degree.cov(xdata.raw, 
                       consider.unweighted = params$degree.unweighted, 
                       cutoff              = params$degree.cutoff, 
                       method              = params$degree.correlation,
                       n.cores = params$n.cores)
} else if (params$degree.type == 'sparsebn') {
  #
  # Degree of the network from the sparsebn package (estimates a Bayesian network)
  degree <- degree.sparsebn(xdata.raw,
                            cutoff              = params$degree.cutoff, 
                            consider.unweighted = params$degree.unweighted,
                            n.cores = params$n.cores)
} else if (params$degree.type == 'string') {
  #
  # Load degree of STRING network downloaded from external sources
  degree <- run.cache(stringDBhomoSapiens, cache.prefix = "stringDB") %>% {
    run.cache(buildStringNetwork, ., use.names = 'ensembl', 
              cache.prefix = 'string-network')
    } %>% {
      .@x[.@x <= params$degree.cutoff] <- 0
      if (params$degree.unweighted) {
        .@x <- (.@x != 0) * 1
      }
      Matrix::colSums(.) + Matrix::rowSums(.)
    }

  #
  # Adds missing genes as nodes with 0 degree

  degree <- colnames(xdata.raw)[!colnames(xdata.raw) %in% names(degree)] %>% {
   c(degree, array(0, length(.), dimnames = list(.))) 
  } %>% { .[colnames(xdata.raw)] }
} else if (params$degree.type == 'string.pedro') {
  #
  # Load degree of STRING network downloaded from external sources
  load('/ssd_home/averissimo/work/rpackages/network.cox-big.files/saves/degree-string.RData')
  ix.names <- colnames(xdata) %in% names(degree_uw)
  degree <- degree_uw[colnames(xdata)]
}
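
As a quick sanity check (an illustrative snippet, not part of the original pipeline), the resulting degree vector should be named and aligned with the columns of xdata; any NA entries are zeroed in the penalty step below.

# Illustrative sanity check on the degree vector
summary(degree)
head(sort(degree, decreasing = TRUE))     # top hub genes
mean(colnames(xdata) %in% names(degree))  # fraction of genes with a degree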

# Penalty

## Preparing degree vector

See ?glmSparseNet::heuristic.scale or ?glmSparseNet::hub.heuristic.
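
As a toy illustration of the transformation applied below (this assumes heuristicScale accepts a numeric vector scaled to [0, 1]; see the help pages above for the exact signature and defaults):

# Toy illustration (assumes heuristicScale() takes a numeric vector in [0, 1]);
# hubs get small penalties via 1 - x, orphans get small penalties via x
toy.norm.degree <- c(hub = 1, mid = 0.5, orphan = 0)
heuristicScale(1 - toy.norm.degree) + 0.2  # hub-favouring penalties (DegreeCox heuristic)
heuristicScale(toy.norm.degree) + 0.2      # orphan-favouring penalties (OrphanCox)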

# see ?glmSparseNet::heuristic.scale
trans.fun <- function(x) { heuristicScale(x) + 0.2 }
original.penalty.factor        <- degree
names(original.penalty.factor) <- colnames(xdata)

##########################
#                        #
#   SUPER IMPORTANT!!!!  #
#                        #
##########################
original.penalty.factor[is.na(original.penalty.factor)] <- 0
norm.orig.penalty.factor <- original.penalty.factor / max(original.penalty.factor[!is.na(original.penalty.factor)])

#
#
# DegreeCox (old and log(old))
#
penalty.factor.degree.log <- penalty.factor.degree.old <- 1 / norm.orig.penalty.factor

inf.ix <- is.infinite(penalty.factor.degree.old) # index for infinite values

# log(old)
log.penal                          <- log(penalty.factor.degree.old[!inf.ix]) + 1
penalty.factor.degree.log[!inf.ix] <- log.penal
penalty.factor.degree.log[inf.ix]  <- max(log.penal) + 1

# old
non.log                           <- penalty.factor.degree.old[!inf.ix]
penalty.factor.degree.old[inf.ix] <- max(non.log) + 1

#
# DegreeCox heuristic
#
penalty.factor.degree.new <- trans.fun(1 - norm.orig.penalty.factor)

#
# OrphanCox
#
penalty.factor.orphan <- trans.fun(norm.orig.penalty.factor)

# Cross-validation

Testing with cross-validation to find the optimal number of variables for the comparison models.
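
The my.cv.glmnet helper is defined in this package (loaded via devtools::load_all() above). As a rough sketch only, an equivalent call with plain glmnet could look like the code below; the Cox family, the response encoding and the fold handling are assumptions, not the wrapper's actual implementation.

# Hypothetical sketch of what my.cv.glmnet wraps (assumptions: Cox family,
# time/status response matrix, network degree used as penalty.factor)
library(glmnet)
set.seed(params$seed)
cv.sketch <- cv.glmnet(xdata,
                       cbind(time = ydata$time, status = ydata$status),
                       family         = 'cox',
                       alpha          = params$alpha[1],
                       penalty.factor = penalty.factor.degree.new,
                       parallel       = TRUE)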

alpha.models <- tibble(alpha = numeric(), seed = numeric(), glmnet = list(), hub = list(), orphan = list())

for (ix.seed in seq(params$seed, params$seed + params$seed.additional)) {
  flog.info('Starting run with seed %d', ix.seed)
  for (ix.alpha in params$alpha) {
    #aux.call <- function(xdata, ydata, network, alpha, seed) { my.cv.glmnet(alpha, xdata, ydata, network, parallel = TRUE, seed = seed) }
    aux.call <- function(xdata, ydata, network, alpha, seed) { my.cv.glmnet(alpha, xdata, ydata, network, parallel = params$n.cores, seed = seed) }

    cv.classic <- run.cache(aux.call, xdata, ydata, rep(1, ncol(xdata)), ix.alpha, ix.seed, 
                            cache.prefix = 'cv.glmSparseNet.elastic.net.glmnet', cache.digest = list(xdata.digest.cache))
    flog.info('    - Finished elastic net (%d)', sum(cv.classic$coef != 0))

    cv.hub     <- run.cache(aux.call, xdata, ydata, penalty.factor.degree.new, ix.alpha, ix.seed, 
                            cache.prefix = 'cv.glmSparseNet.hub.glmnet', cache.digest = list(xdata.digest.cache))
    flog.info('    - Finished hub (%d)', sum(cv.hub$coef != 0))

    cv.orphan  <- run.cache(aux.call, xdata, ydata, penalty.factor.orphan, ix.alpha, ix.seed, 
                            cache.prefix = 'cv.glmSparseNet.orphan.glmnet', cache.digest = list(xdata.digest.cache))
    flog.info('    - Finished orphan (%d)', sum(cv.orphan$coef != 0))

    new.line <- tibble(alpha  = ix.alpha, 
                       seed   = ix.seed,
                       glmnet = list(cv.classic$model), 
                       hub    = list(cv.hub$model), 
                       orphan = list(cv.orphan$model))

    alpha.models <- rbind(alpha.models, new.line)

    save(new.line, file = sprintf('cv.value/cv.value_%s_xdata-%s_ydata-%s', loose.rock::digest.cache(new.line), xdata.digest.cache, loose.rock::digest.cache(ydata)))

    flog.info('  * Finished run with alpha %.3f', ix.alpha)
  }
}

# Selected variables

# Drop the seed column and extract the selected variable names per model
sel.all <- apply(alpha.models[,-2], 1:2, function(ix.name) {
  if (is.numeric(ix.name[[1]])) {
    return(ix.name[[1]])
  } else {
    best.coef <- coef(ix.name[[1]], s = 'lambda.min')[,1]
    return(names(best.coef[best.coef != 0]))
  }
}) %>% as.data.frame %>% as.tbl

sel.all$alpha <- as.numeric(sel.all$alpha)

sel.all %>% as.data.frame %>% group_by(alpha) %>%
  summarise_all(funs(mean   = mean(sapply(., length)),
                     sd     = sd(sapply(., length)),
                     unique = length(unique(unlist(.))))) %>%
  knitr::kable()

# Kaplan-Meier Results per alpha parameter

Individuals are separated into two groups, at high and low risk of the event (death), based on the relative risk estimated from the model's coefficients; the median risk is used as the cutoff.
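
For intuition, the median split performed by separate2GroupsCox can be sketched directly (an illustrative approximation using the first fitted hub model, not the function's actual code):

# Illustrative sketch of the median split behind separate2GroupsCox()
coefs    <- coef(alpha.models$hub[[1]], s = 'lambda.min')[, 1]
rel.risk <- as.vector(xdata %*% coefs)  # linear predictor per individual
group    <- ifelse(rel.risk > median(rel.risk), 'High risk', 'Low risk')
survival::survdiff(Surv(time, status) ~ group,
                   data = data.frame(ydata, group = group))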

Summary table for each model type (mean and standard error across seeds).

km.all <- apply(alpha.models[,-2], 1:2, function(ix.name) {
  if (is.numeric(ix.name[[1]])) {
    return(ix.name[[1]])
  } else {
    best.coef <- coef(ix.name[[1]], s = 'lambda.min')[,1]
    # Skip models where no variables were selected
    if (sum(best.coef != 0) == 0) {
      return(NA)
    }
    return(separate2GroupsCox(best.coef, xdata, ydata)$pvalue)
  }
})

km.all %>% as.data.frame %>%
  group_by(alpha) %>% 
  summarise_all(funs(mean = mean(., na.rm = TRUE), 
                     SE = sd(., na.rm = TRUE))) %>% 
  knitr::kable()

## Kaplan-Meier estimator plot

km.all %>% as.data.frame %>%
  melt(id.vars = 'alpha') %>%
  mutate(Alpha = factor(alpha), Model = variable) %>%
  ggplot(aes(x = Alpha, y = value, color = Model)) + 
  geom_boxplot(alpha = .5) + 
  # geom_jitter(width = 0.2, size = .5) +
  # expand_limits(y = .0025) +
  ggtitle('Kaplan-Meier estimator (per alpha parameter)', 
          subtitle = 'Two groups (High/Low risk of event) are estimated based on Cox model\'s coefficients')

# Concordance C-Index Results per alpha parameter

The concordance C-index is a value between 0 and 1 that measures the pairwise concordance between survival time and the relative risk computed from the Cox model. In other words, it assesses, for each comparable pair of individuals, whether the individual with the lower risk survives longer than the individual with the higher risk.
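
As a small worked example of the idea (computed with survival::concordance directly; the fit.risk helper used below is assumed to return a comparable statistic):

# Toy example of pairwise concordance (not this package's fit.risk helper)
toy <- data.frame(time   = c(5, 10, 2, 8),
                  status = c(1, 1, 1, 0),
                  risk   = c(0.9, 0.2, 1.5, 0.4))
# reverse = TRUE: higher risk scores should pair with shorter survival
survival::concordance(Surv(time, status) ~ risk, data = toy, reverse = TRUE)$concordance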

## Summary of results

c.index.all <- apply(alpha.models[,-2], 1:2, function(ix.name) {
  if (is.numeric(ix.name[[1]])) {
    return(ix.name[[1]])
  } else {
    best.coef <- coef(ix.name[[1]], s = 'lambda.min')[,1]
    # Skip models where no variables were selected
    if (sum(best.coef != 0) == 0) {
      return(NA)
    }
    return(fit.risk(best.coef, xdata, ydata, NULL))
  }
})

c.index.all %>% as.data.frame %>%  group_by(alpha) %>% 
  summarise_all(funs(mean = mean(., na.rm = TRUE), 
                     SE   = sd(., na.rm = TRUE))) %>% 
  knitr::kable()

## C-Index plot

c.index.all %>% as.data.frame %>%
  melt(id.vars = 'alpha') %>%
  mutate(Alpha = factor(alpha), Model = variable) %>%
  ggplot(aes(x = Alpha, y = value, color = Model)) + 
  geom_boxplot(alpha = .5) + 
  expand_limits(y = c(1,0)) +
  # geom_jitter(width = 0.2) +
  ggtitle('Concordance C-Index (per alpha parameter)')

# Parameters

show.params(params)
set.seed(1985)
foldid    <- loose.rock::balanced.cv.folds(ydata$status, nfolds = 10)$output
#

# Final hub-penalized model (alpha = 0.2); fitted twice with identical
# arguments, presumably to confirm the seeded/cached fit is reproducible
new.model  <- my.cv.glmnet(.2, xdata, ydata, penalty.factor.degree.new, parallel = params$n.cores, seed = 1985)
new.model2 <- my.cv.glmnet(.2, xdata, ydata, penalty.factor.degree.new, parallel = params$n.cores, seed = 1985)

