In averissimo/glmSparseNetPaper:

Overview

Data

TCGA project: r params$project
Tissue type: r params$tissue
- r if(params$coding.genes) {'Only using coding genes'} else {'Using all genes (coding and non-coding)'}
Subtract survival time: r if(!is.null(params$subtract.surv.column)){params$subtract.surv.column} else{'Using survival time from diagnostics'}
Degree calculation: r params$degree.type

Normalization

LOG2 transform: r if (params$log2) { 'yes' } else { 'no' }
Gene by gene: r params$normalization

Model options

Train / Test: r params$train * 100%
Model (Alpha / Target variables): r paste(sapply(names(params$target.vars), function(ix) {sprintf('%s (%.2f / %d)', ix, params$target.vars[[ix]]$alpha, params$target.vars[[ix]]$vars)}), collapse = ', ')

Network processing

Degree calculation: r params$degree.type
Edge cutoff: r params$degree.cutoff
r if (params$degree.unweighted) { 'Unweighted' } else { 'Weighted' } edges

cor/cov method: r params$degree.correlation

only applies if network type is correlation/covariance

# ComBat(matrix and category_id)
# plotMDS(matrix and some color stuff)
# inSilicoDB is good to understand that and validate results
my.out.width <- if (knitr::is_latex_output()) { 
  '60%'
} else if (knitr::is_html_output()) { 
    '100%' 
} else {
  NULL
}

message_warning <- if (knitr::is_latex_output() || knitr::is_html_output()) {
  FALSE
} else {
  TRUE
}

knitr::opts_chunk$set(echo = TRUE, collapse = TRUE, tidy = TRUE, out.width = my.out.width, warning=message_warning, message=message_warning)

# Libraries
library(futile.logger)
library(parallel)
library(glmnet)
library(survminer)
library(loose.rock)
library(digest)
library(ggplot2)
library(reshape2)
library(survival)
library(Vennerable)
library(limma)
library(tidyverse)
library(forcats)
library(biclust)
#
library(glmSparseNet)

# In case classic glmnet is used
library(doMC)
registerDoMC(cores=params$n.cores)
globalenv() %>% {.$global.n.cores <- params$n.cores }

#
devtools::load_all()
#
.Last.value <- base.dir(path = '/ssd_home/averissimo/work/rpackages/network.cox-cache')
.Last.value <- show.message(FALSE)
.Last.value <- flog.layout(layout.format('[~l] ~m'))
.Last.value <- flog.appender(appender.tee(file.path('logs', sprintf('logger-analysis-%s.txt', format(Sys.time(), '%Y%m%d-%Hh%Mm%S')))))
theme_set(theme_minimal())

Load and normalize data

Data

TCGA project: r params$project
Tissue type: r params$tissue
- r if(params$coding.genes) {'Only using coding genes'} else {'Using all genes (coding and non-coding)'}
Degree calculation: r params$degree.type
Subtract survival time: r params$subtract.surv.column

Normalization

LOG2 transform: r if (params$log2) { 'yes' } else { 'no' }
Gene by gene: r params$normalization

Load TCGA data

See prepare.tcga.survival.data function for how to configure parameters.

This uses data packages from obtained from 'github:averissimo/tcga' that contain FPKM expression levels.

cat("```\n")
cat('prepare.tcga.survival.data(\'', params$project, '\',\n', sep = '')
cat('                           \'', params$tissue, '\',\n', sep = '')
cat('                            normalization        = \'', params$normalization, '\',\n', sep = '')
cat('                            log2.pre.normalize   = ', params$log2, ',\n', sep = '')
cat('                            handle.duplicates    = \'', params$handle.duplicates, '\',\n', sep = '')
cat('                            coding.genes         = ', params$coding.genes, ',\n', sep = '')
cat('                            subtract.surv.column = \'', params$subtract.surv.column, '\')\n', sep = '')
cat("```\n")

my.data <- run.cache(prepare.tcga.survival.data, 
                     params$project, 
                     params$tissue, 
                     #
                     #input.type                = 'rna',
                     normalization             = params$normalization,
                     log2.pre.normalize        = params$log2,
                     #
                     handle.duplicates         = 'keep_first', 
                     coding.genes              = TRUE,
                     subtract.surv.column      = params$subtract.surv.column,
                     cache.prefix              = 'tcga-data.new',
                     show.message = TRUE)
#
clinical <- my.data$clinical
#
xdata     <- my.data$xdata
ydata     <- my.data$ydata
xdata.raw <- my.data$xdata.raw
#
#
xdata.digest.cache     <- my.data$xdata.digest
xdata.raw.digest.cache <- my.data$xdata.raw.digest
ydata.digest.cache     <- my.data$ydata.digest
#
rm(my.data)

Summary of data

flog.info('       Loaded data from TCGA: %s', params$project)
flog.info('              type of tissue: %s', params$tissue)
flog.info('  observations (individuals): %d (%d event / %d censored)', nrow(xdata), sum(ydata$status), sum(!ydata$status))
flog.info('           variables (genes): %d', ncol(xdata))

Survival curve

ggsurv <- survfit(Surv(time, status) ~ 1, data = ydata %>% mutate(time = time/365*12)) %>% 
  ggsurvplot(data = ydata,
             conf.int = FALSE,
             risk.table = TRUE,
             cumcensor = TRUE,
             xlab = "Time in months",
             tables.height = .1,
             surv.median.line = "hv",  # add the median survival pointer.
             break.x.by = 12,
             #ncensor.plot = TRUE,
             ggtheme = theme_minimal())
#
ggsurv$plot <- ggsurv$plot + theme(legend.position = 'none') #+ scale_x_continuous('x', breaks = seq(0, max((ydata %>% mutate(time = time/365*12))$time) + 20, 20))
ggsurv$ncensor.plot <- ggpar(ggsurv$ncensor.plot, font.y = c(0, "bold.italic", "darkgreen"))
ggsurv$table <- ggpar(ggsurv$table, font.y = c(0, "bold.italic", "darkgreen"))
ggsurv

# Test physiological Variables

# *note:* Plots with p-value `<= 0.05` are shown. Two types of p-values are used as criteria:

# 1. Univariate cox model
# 1. Log rank test when separating between high and low risk group

all.plots <- run.cache(test.all.clinical, clinical, ydata, cache.prefix = 'allplots')

multiplot(plotlist = all.plots$plot.list, layout = all.plots$layout)
knitr::kable(dplyr::arrange(dplyr::select(all.plots$df, name, p.value, desc), p.value))

Load degree data

Network processing

Degree calculation: r params$degree.type
Edge cutoff: r params$degree.cutoff
r if (params$degree.unweighted) { 'Unweighted' } else { 'Weighted' } edges

cor/cov method: r params$degree.correlation

only applies if network type is correlation/covariance

if (params$degree.type == 'oldie') {
  #
  # Load old matrix used in SPARSA conference
  load(sprintf('/home/averissimo/work/rpackages/brca.analysis/data/degree-%.6f.RData', params$degree.cutoff))

} else if (params$degree.type == 'correlation') {
  #
  # Load degree of network calculated summing the inverse of each weight
  degree <- degree.cor(xdata.raw, 
                       consider.unweighted = params$degree.unweighted, 
                       cutoff              = params$degree.cutoff, 
                       method              = params$degree.correlation,
                       n.cores = params$n.cores)
} else if (params$degree.type == 'covariance') {
  #
  # Load degree of network calculated using covariance
  degree <- degree.cov(xdata.raw, 
                       consider.unweighted = params$degree.unweighted, 
                       cutoff              = params$degree.cutoff, 
                       method              = params$degree.correlation,
                       n.cores = params$n.cores)
} else if (params$degree.type == 'sparsebn') {
  #
  # Load degree of network from sparsebn package (calulate bayesian network)
  degree <- degree.sparsebn(xdata.raw,
                            cutoff              = params$degree.cutoff, 
                            consider.unweighted = params$degree.unweighted,
                            n.cores = params$n.cores)
} else if (params$degree.type == 'string') {
  #
  # Load degree of STRING network downloaded from external sources
  degree <- run.cache(stringDBhomoSapiens, cache.prefix = "stringDB") %>% {
    run.cache(buildStringNetwork, ., use.names = 'ensembl', 
              cache.prefix = 'string-network')
    } %>% {
      .@x[.@x <= params$degree.cutoff] <- 0
      if (params$degree.unweighted) {
        .@x <- (.@x != 0) * 1
      }
      Matrix::colSums(.) + Matrix::rowSums(.)
    }

  #
  # Adds missing genes as nodes with 0 degree

  degree <- colnames(xdata.raw)[!colnames(xdata.raw) %in% names(degree)] %>% {
   c(degree, array(0, length(.), dimnames = list(.))) 
  } %>% { .[colnames(xdata.raw)] }
} else if (params$degree.type == 'string.pedro') {
  #
  # Load degree of STRING network downloaded from external sources
  load('/ssd_home/averissimo/work/rpackages/network.cox-big.files/saves/degree-string.RData')
  ix.names <- colnames(xdata) %in% names(degree_uw)
  degree <- degree_uw[colnames(xdata)]
}
rm(xdata.raw)
.Last.value <- gc(reset = TRUE, verbose = FALSE)

Preparing degree vector

Normalize degree between 0 and 1
Hub: heuristic( 1 - degree )
Orphan: heuristic( degree )
trans.fun is a double power to scale the values

see ?glmSparseNet::heuristicScale or ?glmSparseNet::hubHeuristic

# see ?glmSparseNet::heuristic.scale
trans.fun <- function(x) { heuristicScale(x) + 0.2}

original.penalty.factor        <- degree
names(original.penalty.factor) <- colnames(xdata)

##########################
#                        #
#   SUPER IMPORTANT!!!!  #
#                        #
##########################
original.penalty.factor[is.na(original.penalty.factor)] <- 0
norm.orig.penalty.factor <- original.penalty.factor / max(original.penalty.factor[!is.na(original.penalty.factor)], 
                                                          na.rm = TRUE)
#
#
# DegreeCox (old and log(old))
#
penalty.factor.degree.log <- penalty.factor.degree.old <- 1 / norm.orig.penalty.factor

inf.ix <- is.infinite(penalty.factor.degree.old) # index for infinite values

# log(old)
log.penal                          <- log(penalty.factor.degree.old[!inf.ix]) + 1
penalty.factor.degree.log[!inf.ix] <- log.penal
penalty.factor.degree.log[inf.ix]  <- max(log.penal) + 1

# old
non.log                           <- penalty.factor.degree.old[!inf.ix]
penalty.factor.degree.old[inf.ix] <- max(non.log) + 1

#
# DegreeCox heuristic
#

tmp.degree_new                        <- norm.orig.penalty.factor
tmp.degree_new[is.na(tmp.degree_new)] <- 0
penalty.factor.degree.new             <-trans.fun(1 - tmp.degree_new)

#
# OrphanCox
#

tmp.orphan                    <- norm.orig.penalty.factor
tmp.orphan[is.na(tmp.orphan)] <- 1
penalty.factor.orphan         <- trans.fun(tmp.orphan)

my.df <- data.frame(ix                        = seq_along(penalty.factor.degree.old), 
                    penalty.factor.degree.old = penalty.factor.degree.old, 
                    penalty.factor.degree.new = penalty.factor.degree.new,
                    penalty.factor.degree.log = penalty.factor.degree.log,
                    penalty.factor.orphan     = penalty.factor.orphan,
                    original                  = original.penalty.factor)

Genes with degree above 4000

top4000 <- data.frame(ensembl_gene_id = names(original.penalty.factor), degree = original.penalty.factor, stringsAsFactors = FALSE) %>%
  arrange(desc(degree)) %>%
  filter(degree > 4000)

top4000.names <- geneNames(top4000$ensembl_gene_id)

top4000 %>% 
  inner_join(top4000.names, by='ensembl_gene_id') %>% 
  knitr::kable()

Original degree frequency {.tabset .tabset-fade .tabset-pills}

Original {-}

transf.d <- reshape2::melt(my.df[,c('ix', 
                 'original', 
                 'penalty.factor.degree.old',
                 'penalty.factor.degree.log',
                 'penalty.factor.degree.new',
                 'penalty.factor.orphan')] %>% as_tibble, id.vars = c('ix'), variable.name = 'Type')

levels(transf.d$Type) <- levels(transf.d$Type) %>%
  gsub('original', 'Original', .) %>%
  gsub('penalty.factor.degree.old', 'Degree (old)', .) %>%
  gsub('penalty.factor.degree.log', 'Degree (log(old))', .) %>%
  gsub('penalty.factor.degree.new', 'Degree (heuristic)', .) %>%
  gsub('penalty.factor.orphan', 'Orphan', .)

if (!params$calc.params.old) {
  transf.d <- transf.d %>% 
    filter(Type != 'Degree (old)') %>%
    filter(Type != 'Degree (log(old))')
}

#degree.factor.freq.plot <- 
ggplot(transf.d) +
  geom_freqpoly(aes(value, color = Type), alpha = 0.8, bins = 200, size = 1.5) +
  theme_minimal() + theme(legend.position = 'none') +
  facet_wrap( ~ Type, ncol = 1, scale = 'free_x') +
  scale_y_continuous(trans = 'log10') +
  xlab('Penalty') +
  ylab('Frequency count (log10 scale)') + 
  ggtitle('Non scale on X')

LOG10 Scale on X-axis {-}

transf.d <- reshape2::melt(my.df[,c('ix', 
                                     'original', 
                                     'penalty.factor.degree.old',
                                     'penalty.factor.degree.log',
                                     'penalty.factor.degree.new',
                                     'penalty.factor.orphan')], id.vars = c('ix'), variable.name = 'Type')

levels(transf.d$Type) <- levels(transf.d$Type) %>%
  gsub('original', 'Original', .) %>%
  gsub('penalty.factor.degree.old', 'Degree (old)', .) %>%
  gsub('penalty.factor.degree.log', 'Degree (log(old))', .) %>%
  gsub('penalty.factor.degree.new', 'Degree (heuristic)', .) %>%
  gsub('penalty.factor.orphan', 'Orphan', .)

if (!params$calc.params.old) {
  transf.d <- transf.d %>% 
    filter(Type != 'Degree (old)') %>%
    filter(Type != 'Degree (log(old))')
}

ggplot(transf.d) +
  geom_freqpoly(aes(value, color = Type), alpha = 0.8, bins = 200, size = 1.5) +
  theme_minimal() + theme(legend.position = 'none') +
  facet_wrap( ~ Type, ncol = 1, scale = 'free_x') +
  scale_y_continuous(trans = 'log10') +
  scale_x_continuous(trans = 'log10') +
  xlab('Penalty (log10 scale)') +
  ylab('Frequency count (log10 scale)') + 
  ggtitle('Log scale on X')

Model Inference

calc.params <- params[c('subset', 'train', 'calc.params.old', 'seed', 
                        'target.vars', 'balanced.sets')]
model.results <- run.cache(call.results,
                           xdata,
                           ydata,
                           penalty.factor.degree.new,
                           penalty.factor.degree.old,
                           penalty.factor.orphan,
                           calc.params,
                           no.plot = TRUE,
                           no.models = TRUE,
                           cache.digest = list(xdata.digest.cache, 
                                               ydata.digest.cache),
                           cache.prefix = 'big-diff')
#
xdata.ix      <- model.results$xdata.ix
varnames      <- model.results$varnames
coefs         <- model.results$coefs
result        <- model.results$result
lambdas       <- model.results$lambdas
km.train      <- model.results$metric$km.train
km.test       <- model.results$metric$km.test
c.index.train <- model.results$metrics$c.index.train
c.index.test  <- model.results$metrics$c.index.test

Train and test sets

xdata.test <- xdata[model.results$ixs$test,]
ydata.test <- ydata[model.results$ixs$test,]
#
xdata.train <- xdata[model.results$ixs$train,]
ydata.train <- ydata[model.results$ixs$train,]


flog.info('Size of sets: (size/events)\n * Train: %.2f%% :: %4d / %4d\n *  Test: %.2f%% :: %4d /%4d', 
          params$train * 100,
          nrow(xdata.train), 
          sum(ydata.train$status), 
          (1 - params$train) * 100,
          nrow(xdata.test),
          sum(ydata.test$status))

flog.info('Number of variables per model:')
lapply(model.results$coefs, function(ix){ sum(ix != 0) }) %>%
  melt() %>%
  dplyr::rename(nvars = value,
         Model = L1) %>%
  mutate(BaseModel = prettify.labels(Model, 'base.model'),
         TargetVars = prettify.labels(Model, 'target.vars'),
         Alpha = prettify.labels(Model, 'alpha'),
         Model = prettify.labels(Model, 'model')) %>%
  dplyr::select(Model, BaseModel, Alpha, TargetVars, nvars) %>%
  knitr::kable()

flog.info('note, selected variables could be slightly different from target, to have more accuracy increase nlambda in code')

Results

Relative risk distribution {.tabset .tabset-fade .tabset-pills}

Calculated using the inferred models and the train/test datasets. The higher the better.

this.list <- list()
for (ix.name in names(model.results$coefs)) {
  this.list[[ix.name]] <- list(model = model.results$coefs[[ix.name]], target = lambdas[[ix.name]])
}

fit.df <- run.cache(build.fit.df, this.list, xdata.train, xdata.test, xdata.ix, 
                    cache.prefix = 'build.fit.df')

Test Set {-}

show.relative.risk.distribution(fit.df, 'Test')

Train Set {-}

show.relative.risk.distribution(fit.df, 'Train')

Kaplan-Meier Curves

my.km.train <- list()
my.km.test <- list()
ix <- 1
for (ix.name in names(model.results$coefs)) {
  nvars <- sum(0 != model.results$coefs[[ix.name]])

  ix.alpha <- prettify.labels(ix.name, 'alpha') %>% as.double

  my.km.train[[ix]] <- my.draw.kaplan(list(ix.name = coefs[[ix.name]]), 
                                      xdata.train[, xdata.ix],  ydata.train, 
                                      plot.title = 'Train set')$plot + 
    km.name('Train', ix.alpha, ix.name, nvars)
  my.km.train[[ix]]$base.model <- ix.name %>% prettify.labels(ret.value = 'base.model')
  my.km.train[[ix]]$model <- ix.name %>% prettify.labels(ret.value = 'model')

  my.km.test[[ix]]  <- my.draw.kaplan(list(ix.name = coefs[[ix.name]]), 
                                      xdata.test[, xdata.ix], ydata.test, 
                                      plot.title = 'Test set')$plot + 
    km.name('Test', ix.alpha, ix.name, nvars)
  my.km.test[[ix]]$base.model <- ix.name %>% prettify.labels(ret.value = 'base.model')
  my.km.test[[ix]]$model <- ix.name %>% prettify.labels(ret.value = 'model')
  ix <- ix + 1
}

Models on test sets {.tabset .tabset-fade .tabset-pills}

cat('\n\n')
for(ix in seq_along(my.km.train)) {
  cat('#### ', my.km.test[[ix]]$model, ' (base model: ', my.km.test[[ix]]$base.model, ')  {- .tabset .tabset-fade .tabset-pills}', sep = '')
  cat('\n\n')
  cat('##### Test set {-}')
  cat('\n\n')
  print(my.km.test[[ix]]$plot)
  cat('\n\n')
  cat('##### Training set {-}')
  cat('\n\n')
  print(my.km.train[[ix]]$plot)
  cat('\n\n')
}

Summary Table

table.data <- data.frame()
for (ix.name in names(model.results$coefs)) {
  table.data <- rbind(table.data, data.frame(
                           weighted      = !params$degree.unweighted,
                           penalization  = params$degree.type,
                           project       = params$project,
                           tissue        = params$tissue,
                           cutoff        = params$degree.cutoff,
                           coding.genes  = params$coding.genes,
                           alpha         = prettify.labels(ix.name, 'alpha'),
                           model         = prettify.labels(ix.name, 'model'),
                           nvars         = sum(model.results$coefs[[ix.name]] != 0),
                           km.train      = km.train[[ix.name]],
                           km.test       = km.test[[ix.name]],
                           c.index.train = c.index.train[[ix.name]],
                           c.index.test  = c.index.test[[ix.name]]))
}
knitr::kable(table.data)

Non-zero genes

coefs.v <- list()
non.zero.df.result <- NULL
for (ix.name in names(model.results$coefs)){
  # flog.info('Working on %s', ix.name)
  coefs.v[[ix.name]] <- model.results$coefs[[ix.name]]
  non.zero.ix        <- which(coefs.v[[ix.name]] != 0)
  #
  names.tmp <- varnames[non.zero.ix]
  coef.tmp  <- coefs.v[[ix.name]][non.zero.ix]
  if (length(coefs.v) == 1) {
    # flog.info('  * Len == 0')
    non.zero.df.result <- data.frame(gene.id = names.tmp, coef = coef.tmp)
    first.name <- ix.name
  } else {
    # flog.info('  * Len > 0')
    non.zero.df2 <- data.frame(gene.id = names.tmp, coef = coef.tmp)
    suffix <- c('', sprintf('.%s', ix.name))
    non.zero.df.result  <- dplyr::full_join(non.zero.df.result, non.zero.df2, by = c('gene.id'), suffix = suffix)
  }
}
colnames(non.zero.df.result)[2] <- sprintf('coef.%s', first.name)
#
non.zero.df.out <- cbind(non.zero.df.result, degree = as.vector(original.penalty.factor[non.zero.df.result$gene.id]))
gene.ids <- non.zero.df.out$gene.id

gene.names <- geneNames(gene.ids)
new.t      <- non.zero.df.out
ixs        <- gene.names[unlist(sapply(new.t$gene.id, function(s) { which(gene.names$ensembl_gene_id == s) })),]
new.t$gene.id[new.t$gene.id %in% ixs$ensembl_gene_id] <- ixs$external_gene_name

gene.ids                  <- non.zero.df.out$gene.id
non.zero.df.out$gene.id   <- gsub('ENSG0*', 'ENSG', non.zero.df.out$gene.id)
non.zero.df.out$gene.name <- new.t$gene.id
col.ix <- colnames(non.zero.df.out) %in% c('gene.id', 'gene.name', 'degree')
non.zero.df.out <- non.zero.df.out[,c(which(col.ix), which(!col.ix))]

Venn Diagram of selected genes

Overlap between models (with same target number of variables) {.tabset .tabset-fade .tabset-pills}

all.names  <- names(model.results$coefs)
all.names  <- all.names[grep('^(degree_new--)|^(orphan--)|^(glmnet--)', all.names)]
all.model  <- unique(gsub('^([a-zAZ]+)--(.+)$', '\\1', gsub('_new', '', all.names)))
all.origin <- gsub('^([a-zAZ]+)--(.+)$', '\\2', gsub('_new', '', all.names))

cat('\n\n')
for (type.ix in seq_along(unique(all.origin))) {
  type <- unique(all.origin)[type.ix]
  cat('\n\n')
  cat('##### Base model:', prettify.labels(type, 'base.model'), 'with Target Vars:', prettify.labels(type, 'alpha'), 'and Alpha:', prettify.labels(type, 'target.vars'), ' {-}')
  cat('\n\n')
  ll <- list()
  for (ix.name in all.names[which(grepl(type, all.names))]) {
     label <- sprintf('%s (%d)', prettify.labels(ix.name, 'model'), sum(!is.na(non.zero.df.out[, sprintf('coef.%s', ix.name)])))
     ll[[label]] <- sort(non.zero.df.out$gene.id[!is.na(non.zero.df.out[, sprintf('coef.%s', ix.name)])])
  }
  tryCatch({
    draw.3venn(ll, paste0(prettify.labels(type, 'base.model'), ' base model with Target Vars: ', prettify.labels(type, 'alpha'), ' and Alpha: ', prettify.labels(type, 'target.vars')))
    #vv <- Venn(ll)
    #gridExtra::grid.arrange(grid::grid.grabExpr(plot(vv, doWeights = FALSE)), top= prettify.labels(type, 'string'))
  }, error = function(err) { flog.error('Probel with %s', err)})
  cat('\n\n')
}

Table of genes in models {-}

(not running at the moment)

knitr::kable(non.zero.df.out)

Hallmarks of Cancer links

my.hallmarks <- hallmarks(non.zero.df.out$gene.name)

df.no.hallmarks <- data.frame(gene.name = my.hallmarks$no.hallmakrs, stringsAsFactors = FALSE)

for (ix.name in names(model.results$coefs)) {
  df.no.hallmarks[, ix.name] <- (df.no.hallmarks$gene.name %in% non.zero.df.out[!is.na(non.zero.df.out[[paste0('coef.',ix.name)]]), 'gene.name']) * 1
}
melt(df.no.hallmarks, id.vars = 'gene.name') %>%
  mutate(variable = prettify.labels(variable, 'string.short')) %>%
  filter(value > 0) %>%
  ggplot(aes(gene.name,variable, fill=value)) + geom_raster() + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = 'none')

#*(not running at the moment)

df.scaled <- my.hallmarks$hallmarks
df.scaled <- data.frame(gene.name = rownames(df.scaled), df.scaled, stringsAsFactors = FALSE)

for (ix.name in names(model.results$coefs)) {
  df.scaled[, ix.name] <- (df.scaled$gene.name %in% non.zero.df.out[!is.na(non.zero.df.out[[paste0('coef.',ix.name)]]), 'gene.name']) * 1
}

for (ix.name in names(model.results$coefs)) {
  if (sum(df.scaled[[ix.name]] == 1) == 0) {
    flog.warn('No hallmark genes for: %s', ix.name)
    next
  }
  df.scaled.filter <- df.scaled[df.scaled[[ix.name]] == 1, !colnames(df.scaled) %in% names(model.results$coefs)]
  df.melt <- melt(df.scaled.filter, 
                  id.vars = c('gene.name')) %>% filter(value > 0)
  print(
    ggplot(df.melt, aes(gene.name,variable, fill=value)) + 
      geom_raster() +
      theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
      ggtitle(prettify.labels(ix.name, 'string.short')) +
      ggplot2::xlab('External Gene Name') + ggplot2::ylab('') +
        ggplot2::scale_fill_gradientn(colours = rev(grDevices::topo.colors(2)))
  )
}

`r params$ntimes` Runs

if (params$train >= 1) {
  flog.info("Train and test sets are the same, won't calculate %d times with random seeds", params$train)
  big.df <- data.frame()
} else {
  suppressWarnings(rm(xdata.train, xdata.test, ydata.train, ydata.test))

  new.params <- calc.params
  new.params$ntimes <- max(c(1, params$ntimes))

  result.env <- new.env()
  if (grepl('brca', params$project)) {
    load('/ssd_home/averissimo/work/rpackages/network.cox-cache/1e53/cache-multiple.runs-H_1e53e21c7ba906c89169bacf3da11ae90369d3743c8ba78ec5867af3c1ebc882.RData', envir = result.env) # 960 - BRCA
  } else if (grepl('kirc', params$project)) {
    load('/ssd_home/averissimo/work/rpackages/network.cox-cache/8e28/cache-multiple.runs-H_8e2877da3b6a9792325bf66799a6afe5610fec894bea56decd50beb30a8a165d.RData', envir = result.env) # 463 - KIRC
  } else if (grepl('', params$project)) {
    load('/ssd_home/averissimo/work/rpackages/network.cox-cache/90ce/cache-multiple.runs-H_90ce3dd68b7db8d9b7bb4416c32f5d53b2ca8b89a85f4510b814ed61fe7f4ccd.RData', envir = result.env) # 445 - PRAD
  } else if (grepl('luad', params$project)) {
    load('/ssd_home/averissimo/work/rpackages/network.cox-cache/aaf0/cache-multiple.runs-H_aaf0da66649359d48a743131b629fc2196ba00f5ee6f27888d31401631ea1b69.RData', envir = result.env) # 451 - LUAD
  } else if (grepl('ov', params$project)) {
    load('/ssd_home/averissimo/work/rpackages/network.cox-cache/7324/cache-multiple.runs-H_73244f69140486d2e070daa1094a3645b1e41b5968de8c19400729e66ffe045b.RData', envir = result.env) # 370 - OV
  } else if (grepl('lgg', params$project)) {
    load('/ssd_home/averissimo/work/rpackages/network.cox-cache/bf2f/cache-multiple.runs-H_bf2ffd249afb1ac28b7940e8922e4a15d804a2422798b1299da6cdfb7fd56940.RData', envir = result.env) # 448 - LGG
  } else if (grepl('skcm', params$project)) {
    load('/ssd_home/averissimo/work/rpackages/network.cox-cache/cf52/cache-multiple.runs-H_cf52fc811e3de59c3a0db1e940807ca4b06e506bef317476f026e36e48cca234.RData', envir = result.env) # 351 - skcm
  } else if (grepl('ucec', params$project)) {
    load('/ssd_home/averissimo/work/rpackages/network.cox-cache/d9e4/cache-multiple.runs-H_d9e4cfbb3055b54907b8e2b0370adc5db7aaaf68137001899b31083a9ca16d92.RData', envir = result.env) # 517 - UCEC
  } else {
    result.env$result <- run.cache(build.multiple.runs,
                                   new.params, xdata, ydata,
                                   penalty.factor.degree.new, 
                                   penalty.factor.degree.old, 
                                   penalty.factor.orphan,
                                   xdata.digest.cache,
                                   ydata.digest.cache,
                                   #
                                   cache.prefix = 'multiple.runs')
  }
  ntimes.results.list <- result.env$result
  rm(result)

  ntimes.results <- ntimes.results.list$ntimes.results
  varnames       <- ntimes.results.list$varnames

  multiple.runs <- run.cache(handle.multiple,
                               ntimes.results,
                               cache.prefix = 'handle.multiple')

  big.df         <- multiple.runs$big.df
  rank.df        <- multiple.runs$rank.df
  values.df      <- multiple.runs$values.df
  rm(multiple.runs)
}

r if(nrow(big.df) == 0) {"<!--"}

C-Index distribution {.tabset .tabset-fade .tabset-pills}

Test set {- .tabset .tabset-fade .tabset-pills}

Summary of C-Index and Log-rank {-}

big.df %>% 
  dplyr::filter(grepl('Test set', metric)) %>% 
  dplyr::group_by(metric, model) %>% 
  dplyr::summarise(mean = round(mean(values),3), 
                   std = round(sd(values),3), 
                   median = round(median(values),3), 
                   min = format(min(values), digits = 3), 
                   .groups = 'keep') %>% 
  dplyr::mutate(base.model = prettify.labels(model, 'base.model'),
                target.vars = prettify.labels(model, 'target.vars'),
                alpha = prettify.labels(model, 'alpha'),
                model = prettify.labels(model, 'model')) %>% 
  dplyr::select(metric, model, base.model, target.vars, alpha, mean, std, median, min) %>% 
  knitr::kable()

C-Index Distribution {-}

big.df %>%
  mutate(base.model = prettify.labels(model, 'string.base'),
         string.model = prettify.labels(model, 'string.short'),
         model.name = prettify.labels(model, 'model')) %>%
  #filter(model %in% c('glmnet.classic.cv.28', 'degree_new.classic.cv.28', 'orphan.classic.cv.28')) %>%
  filter(metric %in% c('C-Index (Test set)')) %>%
  ggplot() +
    #geom_freqpoly(aes(values, color = model.name), alpha = .75, bins = 100) +
    #stat_density(aes(values, color = model.name),position="identity",geom="line", adjust = 1/5) +
    geom_density(aes(values, color = model.name, fill = model.name), alpha = .1, adjust = 1/5) + 
    facet_wrap( ~ base.model, ncol = 3) +
    ggtitle('Full distribution C-Index (Test set)') +
    theme_minimal() + theme(legend.position = 'bottom')

Rank {- .tabset .tabset-fade .tabset-pills}

filter.by <- 'C-Index (Test set)'
rank.df %>% 
  filter(type == filter.by) %>%
  dplyr::select(base.model, X1, X2, X3) %>%
  reshape2::melt(id.vars = 'base.model') %>%
  { 
    ggplot(.) + 
      geom_histogram(aes(value, fill = variable), stat = 'count') +
      ggtitle(filter.by, subtitle = 'Ranks for each of models, X1 is best, Xn is the worst') +
      theme(legend.position = 'right') +
      facet_wrap( ~ base.model, ncol = 1)
  }

Elastic Net vs Hub {-}

filter.by <- 'C-Index (Test set)'
types     <- c('Elastic Net', 'Hub')

values.df %>% 
  filter(type == filter.by) %>%
  dplyr::select(c('base.model', types)) %>%
  mutate(aa = UQ(as.name(types[1])),
         bb = UQ(as.name(types[2]))) %>%
  group_by(base.model) %>%
  dplyr::summarise(r.squared = (caret::postResample(pred = aa, obs = bb)['Rsquared']), .groups = 'keep') %>%
  knitr::kable()

values.df %>% 
  filter(type == filter.by) %>% {
    lims <- c(min(.[types]),
              max(.[types]))
    ggplot(.) + 
      geom_point(aes_(x = as.name(types[1]), y = as.name(types[2]), color = as.name('base.model')), 
                 alpha = .3) +
      expand_limits(x = lims, y = lims) + 
      facet_wrap( ~ base.model, ncol = 3) + 
      theme(legend.position = 'none') + 
      coord_equal()
  }

Elastic Net vs. Orphan {-}

filter.by <- 'C-Index (Test set)'
types     <- c('Elastic Net', 'Orphan')

values.df %>% 
  filter(type == filter.by) %>%
  dplyr::select(c('base.model', types)) %>%
  mutate(aa = UQ(as.name(types[1])),
         bb = UQ(as.name(types[2]))) %>%
  group_by(base.model) %>%
  dplyr::summarise(r.squared = (caret::postResample(pred = aa, obs = bb)['Rsquared']), .groups = 'keep') %>%
  knitr::kable()

values.df %>% 
  filter(type == filter.by) %>% {
    lims <- c(min(.[types]),
              max(.[types]))
    ggplot(.) + 
      geom_point(aes_(x = as.name(types[1]), y = as.name(types[2]), color = as.name('base.model')), 
                 alpha = .3) +
      expand_limits(x = lims, y = lims) + 
      facet_wrap( ~ base.model, ncol = 3) + 
      theme(legend.position = 'none') + 
      coord_equal()
  }

Hub vs. Orphan {-}

filter.by <- 'C-Index (Test set)'
types     <- c('Orphan', 'Hub')

values.df %>% 
  filter(type == filter.by) %>%
  dplyr::select(c('base.model', types)) %>%
  mutate(aa = UQ(as.name(types[1])),
         bb = UQ(as.name(types[2]))) %>%
  group_by(base.model) %>%
  dplyr::summarise(r.squared = (caret::postResample(pred = aa, obs = bb)['Rsquared']), .groups = 'keep') %>%
  knitr::kable()

values.df %>% 
  filter(type == filter.by) %>% {
    lims <- c(min(.[types]),
              max(.[types]))
    ggplot(.) + 
      geom_point(aes_(x = as.name(types[1]), y = as.name(types[2]), color = as.name('base.model')), 
                 alpha = .3) +
      expand_limits(x = lims, y = lims) + 
      facet_wrap( ~ base.model, ncol = 3) + 
      theme(legend.position = 'none') + 
      coord_equal()
  }

Train set {- .tabset .tabset-fade .tabset-pills}

Summary of C-Index and Log-rank {-}

big.df %>% 
  dplyr::filter(grepl('Train set', metric)) %>% 
  dplyr::group_by(metric, model) %>% 
  dplyr::summarise(mean = round(mean(values),3), 
                   std = round(sd(values),3), 
                   median = round(median(values),3), 
                   min = format(min(values), digits = 3), 
                   .groups = 'keep') %>% 
  dplyr::mutate(base.model = prettify.labels(model, 'base.model'),
                target.vars = prettify.labels(model, 'target.vars'),
                alpha = prettify.labels(model, 'alpha'),
                model = prettify.labels(model, 'model')) %>% 
  dplyr::select(metric, model, base.model, target.vars, alpha, mean, std, median, min) %>% 
  knitr::kable()

C-Index Distribution {-}

big.df %>%
  mutate(base.model = prettify.labels(model, 'string.base'),
         string.model = prettify.labels(model, 'string.short'),
         model.name = prettify.labels(model, 'model')) %>%
  filter(metric %in% c('C-Index (Train set)')) %>%
  ggplot() +
    #geom_freqpoly(aes(values, color = model.name), alpha = .75, bins = 100) +
    #stat_density(aes(values, color = model.name),position="identity",geom="line", adjust = 1/5) +
    geom_density(aes(values, color = model.name, fill = model.name), alpha = .1, adjust = 1/5) + 
    facet_wrap( ~ base.model, ncol = 3) +
    ggtitle('Full distribution C-Index (Train set)') +
    theme_minimal() + theme(legend.position = 'bottom', strip.text = element_text(size=7))

C-Index Rank {-}

filter.by <- 'C-Index (Train set)'
rank.df %>% 
  filter(type == filter.by) %>%
  dplyr::select(base.model, X1, X2, X3) %>%
  reshape2::melt(id.vars = 'base.model') %>%
  { 
    ggplot(.) + 
      geom_histogram(aes(value, fill = variable), stat = 'count') +
      ggtitle(filter.by, subtitle = 'Ranks for each of models, X1 is best, Xn is the worst') +
      theme(legend.position = 'right') +
      facet_wrap( ~ base.model, ncol = 1)
  }

Log-rank test on Kaplan-Meier models

Log-Rank distribution {.tabset .tabset-fade .tabset-pills}

Cumulative {-}

Distribution of Log-rank test with groups separated by high and low risk groups

big.df %>%
    mutate(base.model = prettify.labels(model, 'string.base'),
           string.model = prettify.labels(model, 'string.short'),
           model.name = prettify.labels(model, 'model')) %>%
    filter(metric %in% c('Log-rank (Test set)')) %>%
    ggplot() +
      geom_vline(xintercept = .05, linetype = 2,  alpha = .2) +
      annotate("text", x = 0.05, y = Inf, label = ' 0.05', alpha = .2, hjust = 0, vjust = 2) +
      stat_ecdf(aes(values, color = model.name)) + 
      facet_wrap( ~ base.model, ncol = 1) +
      ggtitle('Cumulative distribution Log-rank') +
      theme_minimal() + theme(legend.position = 'bottom')

Distribution {-}

Distribution of Log-rank test with groups separated by high and low risk groups

big.df %>%
  mutate(base.model = prettify.labels(model, 'string.base'),
         string.model = prettify.labels(model, 'string.short'),
         model.name = prettify.labels(model, 'model')) %>%
  filter(metric %in% c('Log-rank (Test set)')) %>%
  ggplot() +
    geom_vline(xintercept = .05, linetype = 2,  alpha = .2) +
    annotate("text", x = 0.05, y = Inf, label = ' 0.05', alpha = .2, hjust = 0, vjust = 2) +
    #geom_freqpoly(aes(values, color = model.name), alpha = .75, bins = 100) +
    #stat_density(aes(values, color = model.name),position="identity",geom="line", adjust = 1/5) +
    geom_density(aes(values, color = model.name, fill = model.name), alpha = .1, adjust = 1/5) + 
    facet_wrap( ~ base.model, ncol = 3) +
    ggtitle('Full distribution Log-rank') +
    theme_minimal() + theme(legend.position = 'bottom', strip.text = element_text(size=7))

Log-Rank Rank {.tabset .tabset-fade .tabset-pills}

filter.by <- 'Log-rank (Test set)'
rank.df %>% 
  filter(type == filter.by) %>%
  dplyr::select(base.model, X1, X2, X3) %>%
  reshape2::melt(id.vars = 'base.model') %>%
  { 
    ggplot(.) + 
      geom_histogram(aes(value, fill = variable), stat = 'count') +
      ggtitle(filter.by, subtitle = 'Ranks for each of models, X1 is best, Xn is the worst') +
      theme(legend.position = 'right') +
      facet_wrap( ~ base.model, ncol = 1)
  }

Elastic Net vs Hub

filter.by <- 'Log-rank (Test set)'
types     <- c('Elastic Net', 'Hub')

values.df %>% 
  filter(type == filter.by) %>%
  dplyr::select(c('base.model', types)) %>%
  mutate(aa = UQ(as.name(types[1])),
         bb = UQ(as.name(types[2]))) %>%
  group_by(base.model) %>%
  dplyr::summarise(r.squared = (caret::postResample(pred = aa, obs = bb)['Rsquared']), .groups = 'keep') %>%
  knitr::kable()

values.df %>% 
  filter(type == filter.by) %>% {
    lims <- c(min(.[types]),
              max(.[types]))
    ggplot(.) + 
      geom_point(aes_(x = as.name(types[1]), y = as.name(types[2]), color = as.name('base.model')), 
                 alpha = .3) +
      expand_limits(x = lims, y = lims) + 
      facet_wrap( ~ base.model, ncol = 3) + 
      theme(legend.position = 'none', strip.text = element_text(size=7)) + 
      coord_equal()
  }

Elastic Net vs. Orphan

filter.by <- 'Log-rank (Test set)'
types     <- c('Elastic Net', 'Orphan')

values.df %>% 
  filter(type == filter.by) %>%
  dplyr::select(c('base.model', types)) %>%
  mutate(aa = UQ(as.name(types[1])),
         bb = UQ(as.name(types[2]))) %>%
  group_by(base.model) %>%
  dplyr::summarise(r.squared = (caret::postResample(pred = aa, obs = bb)['Rsquared']), .groups = 'keep') %>%
  knitr::kable()

values.df %>% 
  filter(type == filter.by) %>% {
    lims <- c(min(.[types]),
              max(.[types]))
    ggplot(.) + 
      geom_point(aes_(x = as.name(types[1]), y = as.name(types[2]), color = as.name('base.model')), 
                 alpha = .3) +
      expand_limits(x = lims, y = lims) + 
      facet_wrap( ~ base.model, ncol = 3) + 
      theme(legend.position = 'none', strip.text = element_text(size=7)) + 
      coord_equal()
  }

Hub vs. Orphan

filter.by <- 'Log-rank (Test set)'
types     <- c('Orphan', 'Hub')

values.df %>% 
  filter(type == filter.by) %>%
  dplyr::select(c('base.model', types)) %>%
  mutate(aa = UQ(as.name(types[1])),
         bb = UQ(as.name(types[2]))) %>%
  group_by(base.model) %>%
  dplyr::summarise(r.squared = (caret::postResample(pred = aa, obs = bb)['Rsquared']), .groups = 'keep') %>%
  knitr::kable()

values.df %>% 
  filter(type == filter.by) %>% {
    lims <- c(min(.[types]),
              max(.[types]))
    ggplot(.) + 
      geom_point(aes_(x = as.name(types[1]), y = as.name(types[2]), color = as.name('base.model')), 
                 alpha = .3) +
      expand_limits(x = lims, y = lims) + 
      facet_wrap( ~ base.model, ncol = 3) + 
      theme(legend.position = 'none', strip.text = element_text(size=7)) + 
      coord_equal()
  }

Consesus genes

consensus <- list()
all.conse <- list()

all.conse.pvalue <- list()

var.names <- as.character(colnames(xdata[,xdata.ix]))

for (ix in ntimes.results) {
  if (is.list(ix)) {
    for (ix.2 in names(ix$coefs)) {
      if (ix$metrics$km.test[[ix.2]] < 0.05) {
        all.conse.pvalue[[ix.2]] <- c(all.conse.pvalue[[ix.2]], (ix$coefs[[ix.2]] %>% { 
          var.names[as.logical(. != 0)]
        } ))  
      }

      all.conse[[ix.2]] <- c(all.conse[[ix.2]], (ix$coefs[[ix.2]] %>% { 
        var.names[as.logical(. != 0)]
      } ))
    } 
  }
}
all.conse.tbl        <- lapply(all.conse, table)
all.conse.pvalue.tbl <- lapply(all.conse.pvalue, table)

Selected genes over all runs {.tabset .tabset-fade .tabset-pills}

Bar plots of frequencies {-}

melt(all.conse.tbl) %>%
  dplyr::mutate(L1.suffix = factor(prettify.labels(L1, 'string.base')),
         L1.prefix = factor(prettify.labels(L1, 'model'))) %>%
  dplyr::mutate(L1 = factor(L1)) %>%
  dplyr::group_by(L1.suffix) %>%
  dplyr::add_count(Var1) %>%
  filter(n > 1) %>%
  ggplot() +
  geom_col(aes(Var1, value, color = L1.prefix, fill = L1.prefix), alpha = .4) +
  facet_wrap( ~ L1.suffix, ncol = 3) +
  scale_x_discrete(breaks = c()) +
  scale_fill_discrete('Model') + scale_color_discrete('Model') +
  theme(legend.position = 'top', axis.text.x = element_blank()) +
  xlab('Genes') + ylab('Number of times gene is selected (stacked)')

Overlap between variables selected {-}

all.names  <- names(model.results$coefs)
all.names  <- all.names[grep('^(degree_new--)|^(orphan--)|^(glmnet--)', all.names)]
all.model  <- unique(gsub('^([a-zAZ]+)--(.+)$', '\\1', gsub('_new', '', all.names)))
all.origin <- gsub('^([a-zAZ]+)--(.+)$', '\\2', gsub('_new', '', all.names))

cat('\n\n')
for (type in unique(all.origin)) {
  cat('\n\n')
  cat('##### Base model:', prettify.labels(type, 'base.model'), 'with Target Vars:', prettify.labels(type, 'alpha'), 'and Alpha:', prettify.labels(type, 'target.vars'), ' {-}')
  cat('\n\n')
  ll <- list()
  for (ix.name in all.names[which(grepl(type, all.names))]) {
    label <- sprintf('%s (%d)', prettify.labels(ix.name, 'model'), length(all.conse.tbl[[ix.name]]))
    ll[[label]] <- sort(names(all.conse.tbl[[ix.name]]))
  }
  cat('\n\n')
  tryCatch({
    draw.3venn(ll, paste0(prettify.labels(type, 'base.model'), ' base model with Target Vars: ', prettify.labels(type, 'alpha'), ' and Alpha: ', prettify.labels(type, 'target.vars')))
    }, error = function(err) { flog.error('Probel with %s', err)})
  cat('\n\n')
}

Selected genes in significant runs {.tabset .tabset-fade .tabset-pills}

i.e. with pvalue < 0.05 in Log-rank test using the test set.

Bar plots of frequencies

if(length(all.conse.pvalue.tbl) == 0) {
  flog.warn('None of the models was statistical significant.')
} else {
  melt(all.conse.pvalue.tbl) %>%
    mutate(L1.suffix = factor(prettify.labels(L1, 'string.base')),
           L1.prefix = factor(prettify.labels(L1, 'model'))) %>%
    mutate(L1 = factor(L1)) %>%
    group_by(L1.suffix) %>%
    add_count(Var1) %>%
    filter(n > 1) %>%
    ggplot() +
    geom_col(aes(Var1, value, color = L1.prefix, fill = L1.prefix), alpha = .4) +
    facet_wrap( ~ L1.suffix, ncol = 3) +
    scale_x_discrete(breaks = c()) +
    scale_fill_discrete('Model') + scale_color_discrete('Model') +
    theme(legend.position = 'top', axis.text.x = element_blank()) +
    xlab('Genes') + ylab('Number of times gene is selected (stacked)')
}

Overlap between variables selected {.tabset .tabset-fade .tabset-pills}

if(length(all.conse.pvalue.tbl) == 0) {
  flog.warn('None of the models was statistical significant.')
} else {
  all.names  <- names(model.results$coefs)
  all.names  <- all.names[grep('^(degree_new--)|^(orphan--)|^(glmnet--)', all.names)]
  all.model  <- unique(gsub('^([a-zAZ]+)--(.+)$', '\\1', gsub('_new', '', all.names)))
  all.origin <- gsub('^([a-zAZ]+)--(.+)$', '\\2', gsub('_new', '', all.names))
  cat('\n\n')
  for (type in unique(all.origin)) {
    cat('\n\n')
    cat('##### Base model', prettify.labels(type, ret.value = 'string.base'))
    ll <- list()
    for (ix.name in all.names[which(grepl(type, all.names))]) {
      label <- sprintf('%s (%d)', prettify.labels(ix.name, 'model'), length(all.conse.pvalue.tbl[[ix.name]]))
      ll[[label]] <- sort(names(all.conse.pvalue.tbl[[ix.name]]))
    }
    tryCatch({
      draw.3venn(ll, prettify.labels(type, 'string.base'))
      }, error = function(err) { flog.error('Probel with %s', err)})
    cat('\n\n')
  }
}

Genes in Significant runs {.tabset .tabset-fade .tabset-pills}

i.e. with pvalue < 0.05 in Log-rank test using the test set.

Top consensus 20 genes {-}

overlapping.genes.names.tmp <- reshape2::melt(all.conse.pvalue.tbl) %>%
  dplyr::rename(Gene = Var1) %>% 
  dplyr::mutate(L1.suffix = factor(prettify.labels(L1, 'string.base')),
         L1.prefix = factor(prettify.labels(L1, 'model'))) %>%
  dplyr::mutate(L1 = factor(L1)) %>% 
  dplyr::group_by(L1.suffix) %>%
  dplyr::add_count(Gene, name = 'Overlap') %>% 
  dplyr::group_by(Gene, Overlap) %>% 
  dplyr::summarise(total = sum(value), .groups = 'drop') %>% 
  dplyr::arrange(-total)
  #dplyr::filter(total > 1000) %>% 

overlapping.genes.names <- loose.rock::run.cache(geneNames, overlapping.genes.names.tmp$Gene) %>% 
  dplyr::distinct() %>% 
  dplyr::right_join(overlapping.genes.names.tmp, by = c("ensembl_gene_id" = "Gene")) %>% 
  dplyr::rename(Gene = external_gene_name) %>% 
  dplyr::select(-ensembl_gene_id) %>% 
  dplyr::arrange(desc(total), desc(Overlap))

overlapping.genes.names  %>% 
  dplyr::top_n(20, total) %>% 
  knitr::kable()

All Genes {-}

cat('\n\n')
cat('<sub><sup>')
cat(overlapping.genes.names %>% 
  mutate(text = paste0(Gene, '(', total, ')')) %>% 
  pull(text) %>% 
  paste(collapse = ', '))
cat('</sup></sub>')
cat('\n\n')

Best C-index test in all runs {.tabset .tabset-fade .tabset-pills}

best.ones <- big.df %>%
  as.tibble() %>%
  filter(metric == 'C-Index (Test set)') %>%
  mutate(base.model = prettify.labels(model, 'string.base'),
         model.name = prettify.labels(model, 'model')) %>%
  group_by(model.name) %>%
  arrange(desc(values)) %>%
  dplyr::slice(1)

ydata.month <- ydata %>% mutate(time = time / 365 * 12)

Best Elastic Net model (C-Index test) {- .tabset .tabset-fade .tabset-pills}

filter.model <- 'Elastic Net'
best.model.print(filter.model, best.ones, xdata, ydata.month, ntimes.results)

Best Hub model (C-Index test) {- .tabset .tabset-fade .tabset-pills}

filter.model <- 'Hub'
best.model.print(filter.model, best.ones, xdata, ydata.month, ntimes.results)

Best Orphan model (C-Index test) {- .tabset .tabset-fade .tabset-pills}

filter.model <- 'Orphan'
best.model.print(filter.model, best.ones, xdata, ydata.month, ntimes.results)

Best Log-rank test in all runs {.tabset .tabset-fade .tabset-pills}

best.ones <- big.df %>%
  as.tibble() %>%
  filter(metric == 'Log-rank (Test set)') %>%
  mutate(base.model = prettify.labels(model, 'string.base'),
         model.name = prettify.labels(model, 'model')) %>%
  group_by(model.name) %>%
  arrange(values) %>%
  dplyr::slice(1)

ydata.month <- ydata %>% mutate(time = time / 365 * 12)

Best Elastic net (Log-rank test) {- .tabset .tabset-fade .tabset-pills}

filter.model <- 'Elastic Net'
best.model.print(filter.model, best.ones, xdata, ydata.month, ntimes.results)

Best Hub model (Log-rank test) {- .tabset .tabset-fade .tabset-pills}

filter.model <- 'Hub'
best.model.print(filter.model, best.ones, xdata, ydata.month, ntimes.results)

Best Orphan model (Log-rank test) {- .tabset .tabset-fade .tabset-pills}

filter.model <- 'Orphan'
best.model.print(filter.model, best.ones, xdata, ydata.month, ntimes.results)$plot

r if(nrow(big.df) == 0) {"-->"}

Parameters for the report

show.params(params)

rmarkdown::render('Analysis.Rmd', output_file = 'test-render.html', output_format = 'html_document')

rmarkdown::render('Analysis.Rmd', output_file = 'test-render', output_format = 'pdf_document')

averissimo/glmSparseNetPaper documentation built on Jan. 25, 2021, 12:11 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

averissimo/glmSparseNetPaper

In averissimo/glmSparseNetPaper:

Overview

Load and normalize data

Load TCGA data

Summary of data

Survival curve

Load degree data

Preparing degree vector

Genes with degree above 4000

Original degree frequency {.tabset .tabset-fade .tabset-pills}

Original {-}

LOG10 Scale on X-axis {-}

Model Inference

Train and test sets

Results

Relative risk distribution {.tabset .tabset-fade .tabset-pills}

Test Set {-}

Train Set {-}

Kaplan-Meier Curves

Models on test sets {.tabset .tabset-fade .tabset-pills}

Summary Table

Non-zero genes

Venn Diagram of selected genes

Overlap between models (with same target number of variables) {.tabset .tabset-fade .tabset-pills}

Table of genes in models {-}

Hallmarks of Cancer links

r params$ntimes Runs

C-Index distribution {.tabset .tabset-fade .tabset-pills}

Test set {- .tabset .tabset-fade .tabset-pills}

Summary of C-Index and Log-rank {-}

C-Index Distribution {-}

Rank {- .tabset .tabset-fade .tabset-pills}

Elastic Net vs Hub {-}

Elastic Net vs. Orphan {-}

Hub vs. Orphan {-}

Train set {- .tabset .tabset-fade .tabset-pills}

Summary of C-Index and Log-rank {-}

C-Index Distribution {-}

C-Index Rank {-}

Log-rank test on Kaplan-Meier models

Log-Rank distribution {.tabset .tabset-fade .tabset-pills}

Cumulative {-}

Distribution {-}

Log-Rank Rank {.tabset .tabset-fade .tabset-pills}

Elastic Net vs Hub

Elastic Net vs. Orphan

Hub vs. Orphan

Consesus genes

Selected genes over all runs {.tabset .tabset-fade .tabset-pills}

Bar plots of frequencies {-}

Overlap between variables selected {-}

Selected genes in significant runs {.tabset .tabset-fade .tabset-pills}

Bar plots of frequencies

Overlap between variables selected {.tabset .tabset-fade .tabset-pills}

Genes in Significant runs {.tabset .tabset-fade .tabset-pills}

Top consensus 20 genes {-}

All Genes {-}

Best C-index test in all runs {.tabset .tabset-fade .tabset-pills}

Best Elastic Net model (C-Index test) {- .tabset .tabset-fade .tabset-pills}

Best Hub model (C-Index test) {- .tabset .tabset-fade .tabset-pills}

Best Orphan model (C-Index test) {- .tabset .tabset-fade .tabset-pills}

Best Log-rank test in all runs {.tabset .tabset-fade .tabset-pills}

Best Elastic net (Log-rank test) {- .tabset .tabset-fade .tabset-pills}

Best Hub model (Log-rank test) {- .tabset .tabset-fade .tabset-pills}

Best Orphan model (Log-rank test) {- .tabset .tabset-fade .tabset-pills}

Parameters for the report

R Package Documentation

Browse R Packages

We want your feedback!

`r params$ntimes` Runs