Data
r params$project
r params$tissue
r if(params$coding.genes) {'Only using coding genes'} else {'Using all genes (coding and non-coding)'}
r if(!is.null(params$subtract.surv.column)){params$subtract.surv.column} else{'Using survival time from diagnostics'}
r params$degree.type
Normalization
r if (params$log2) { 'yes' } else { 'no' }
r params$normalization
Model options
r params$train * 100
%r paste(sapply(names(params$target.vars), function(ix) {sprintf('%s (%.2f / %d)', ix, params$target.vars[[ix]]$alpha, params$target.vars[[ix]]$vars)}), collapse = ', ')
Network processing
r params$degree.type
r params$degree.cutoff
r if (params$degree.unweighted) { 'Unweighted' } else { 'Weighted' }
edgescor/cov method: r params$degree.correlation
# ComBat(matrix and category_id) # plotMDS(matrix and some color stuff) # inSilicoDB is good to understand that and validate results my.out.width <- if (knitr::is_latex_output()) { '60%' } else if (knitr::is_html_output()) { '100%' } else { NULL } message_warning <- if (knitr::is_latex_output() || knitr::is_html_output()) { FALSE } else { TRUE } knitr::opts_chunk$set(echo = TRUE, collapse = TRUE, tidy = TRUE, out.width = my.out.width, warning=message_warning, message=message_warning)
# Libraries library(futile.logger) library(parallel) library(glmnet) library(survminer) library(loose.rock) library(digest) library(ggplot2) library(reshape2) library(survival) library(Vennerable) library(limma) library(tidyverse) library(forcats) library(biclust) # library(glmSparseNet) # In case classic glmnet is used library(doMC) registerDoMC(cores=params$n.cores) globalenv() %>% {.$global.n.cores <- params$n.cores } # devtools::load_all() # .Last.value <- base.dir(path = '/ssd_home/averissimo/work/rpackages/network.cox-cache') .Last.value <- show.message(FALSE) .Last.value <- flog.layout(layout.format('[~l] ~m')) .Last.value <- flog.appender(appender.tee(file.path('logs', sprintf('logger-analysis-%s.txt', format(Sys.time(), '%Y%m%d-%Hh%Mm%S'))))) theme_set(theme_minimal())
Data
r params$project
r params$tissue
r if(params$coding.genes) {'Only using coding genes'} else {'Using all genes (coding and non-coding)'}
r params$degree.type
r params$subtract.surv.column
Normalization
r if (params$log2) { 'yes' } else { 'no' }
r params$normalization
See prepare.tcga.survival.data
function for how to configure parameters.
This uses data packages from obtained from 'github:averissimo/tcga' that contain FPKM expression levels.
cat("```\n") cat('prepare.tcga.survival.data(\'', params$project, '\',\n', sep = '') cat(' \'', params$tissue, '\',\n', sep = '') cat(' normalization = \'', params$normalization, '\',\n', sep = '') cat(' log2.pre.normalize = ', params$log2, ',\n', sep = '') cat(' handle.duplicates = \'', params$handle.duplicates, '\',\n', sep = '') cat(' coding.genes = ', params$coding.genes, ',\n', sep = '') cat(' subtract.surv.column = \'', params$subtract.surv.column, '\')\n', sep = '') cat("```\n")
my.data <- run.cache(prepare.tcga.survival.data, params$project, params$tissue, # #input.type = 'rna', normalization = params$normalization, log2.pre.normalize = params$log2, # handle.duplicates = 'keep_first', coding.genes = TRUE, subtract.surv.column = params$subtract.surv.column, cache.prefix = 'tcga-data.new', show.message = TRUE) # clinical <- my.data$clinical # xdata <- my.data$xdata ydata <- my.data$ydata xdata.raw <- my.data$xdata.raw # # xdata.digest.cache <- my.data$xdata.digest xdata.raw.digest.cache <- my.data$xdata.raw.digest ydata.digest.cache <- my.data$ydata.digest # rm(my.data)
flog.info(' Loaded data from TCGA: %s', params$project) flog.info(' type of tissue: %s', params$tissue) flog.info(' observations (individuals): %d (%d event / %d censored)', nrow(xdata), sum(ydata$status), sum(!ydata$status)) flog.info(' variables (genes): %d', ncol(xdata))
ggsurv <- survfit(Surv(time, status) ~ 1, data = ydata %>% mutate(time = time/365*12)) %>% ggsurvplot(data = ydata, conf.int = FALSE, risk.table = TRUE, cumcensor = TRUE, xlab = "Time in months", tables.height = .1, surv.median.line = "hv", # add the median survival pointer. break.x.by = 12, #ncensor.plot = TRUE, ggtheme = theme_minimal()) # ggsurv$plot <- ggsurv$plot + theme(legend.position = 'none') #+ scale_x_continuous('x', breaks = seq(0, max((ydata %>% mutate(time = time/365*12))$time) + 20, 20)) ggsurv$ncensor.plot <- ggpar(ggsurv$ncensor.plot, font.y = c(0, "bold.italic", "darkgreen")) ggsurv$table <- ggpar(ggsurv$table, font.y = c(0, "bold.italic", "darkgreen")) ggsurv
# Test physiological Variables # *note:* Plots with p-value `<= 0.05` are shown. Two types of p-values are used as criteria: # 1. Univariate cox model # 1. Log rank test when separating between high and low risk group all.plots <- run.cache(test.all.clinical, clinical, ydata, cache.prefix = 'allplots') multiplot(plotlist = all.plots$plot.list, layout = all.plots$layout) knitr::kable(dplyr::arrange(dplyr::select(all.plots$df, name, p.value, desc), p.value))
Network processing
r params$degree.type
r params$degree.cutoff
r if (params$degree.unweighted) { 'Unweighted' } else { 'Weighted' }
edgescor/cov method: r params$degree.correlation
if (params$degree.type == 'oldie') { # # Load old matrix used in SPARSA conference load(sprintf('/home/averissimo/work/rpackages/brca.analysis/data/degree-%.6f.RData', params$degree.cutoff)) } else if (params$degree.type == 'correlation') { # # Load degree of network calculated summing the inverse of each weight degree <- degree.cor(xdata.raw, consider.unweighted = params$degree.unweighted, cutoff = params$degree.cutoff, method = params$degree.correlation, n.cores = params$n.cores) } else if (params$degree.type == 'covariance') { # # Load degree of network calculated using covariance degree <- degree.cov(xdata.raw, consider.unweighted = params$degree.unweighted, cutoff = params$degree.cutoff, method = params$degree.correlation, n.cores = params$n.cores) } else if (params$degree.type == 'sparsebn') { # # Load degree of network from sparsebn package (calulate bayesian network) degree <- degree.sparsebn(xdata.raw, cutoff = params$degree.cutoff, consider.unweighted = params$degree.unweighted, n.cores = params$n.cores) } else if (params$degree.type == 'string') { # # Load degree of STRING network downloaded from external sources degree <- run.cache(stringDBhomoSapiens, cache.prefix = "stringDB") %>% { run.cache(buildStringNetwork, ., use.names = 'ensembl', cache.prefix = 'string-network') } %>% { .@x[.@x <= params$degree.cutoff] <- 0 if (params$degree.unweighted) { .@x <- (.@x != 0) * 1 } Matrix::colSums(.) + Matrix::rowSums(.) } # # Adds missing genes as nodes with 0 degree degree <- colnames(xdata.raw)[!colnames(xdata.raw) %in% names(degree)] %>% { c(degree, array(0, length(.), dimnames = list(.))) } %>% { .[colnames(xdata.raw)] } } else if (params$degree.type == 'string.pedro') { # # Load degree of STRING network downloaded from external sources load('/ssd_home/averissimo/work/rpackages/network.cox-big.files/saves/degree-string.RData') ix.names <- colnames(xdata) %in% names(degree_uw) degree <- degree_uw[colnames(xdata)] } rm(xdata.raw) .Last.value <- gc(reset = TRUE, verbose = FALSE)
trans.fun
is a double power to scale the valuessee ?glmSparseNet::heuristicScale
or ?glmSparseNet::hubHeuristic
# see ?glmSparseNet::heuristic.scale trans.fun <- function(x) { heuristicScale(x) + 0.2}
original.penalty.factor <- degree names(original.penalty.factor) <- colnames(xdata) ########################## # # # SUPER IMPORTANT!!!! # # # ########################## original.penalty.factor[is.na(original.penalty.factor)] <- 0 norm.orig.penalty.factor <- original.penalty.factor / max(original.penalty.factor[!is.na(original.penalty.factor)], na.rm = TRUE) # # # DegreeCox (old and log(old)) # penalty.factor.degree.log <- penalty.factor.degree.old <- 1 / norm.orig.penalty.factor inf.ix <- is.infinite(penalty.factor.degree.old) # index for infinite values # log(old) log.penal <- log(penalty.factor.degree.old[!inf.ix]) + 1 penalty.factor.degree.log[!inf.ix] <- log.penal penalty.factor.degree.log[inf.ix] <- max(log.penal) + 1 # old non.log <- penalty.factor.degree.old[!inf.ix] penalty.factor.degree.old[inf.ix] <- max(non.log) + 1 # # DegreeCox heuristic # tmp.degree_new <- norm.orig.penalty.factor tmp.degree_new[is.na(tmp.degree_new)] <- 0 penalty.factor.degree.new <-trans.fun(1 - tmp.degree_new) # # OrphanCox # tmp.orphan <- norm.orig.penalty.factor tmp.orphan[is.na(tmp.orphan)] <- 1 penalty.factor.orphan <- trans.fun(tmp.orphan) my.df <- data.frame(ix = seq_along(penalty.factor.degree.old), penalty.factor.degree.old = penalty.factor.degree.old, penalty.factor.degree.new = penalty.factor.degree.new, penalty.factor.degree.log = penalty.factor.degree.log, penalty.factor.orphan = penalty.factor.orphan, original = original.penalty.factor)
top4000 <- data.frame(ensembl_gene_id = names(original.penalty.factor), degree = original.penalty.factor, stringsAsFactors = FALSE) %>% arrange(desc(degree)) %>% filter(degree > 4000) top4000.names <- geneNames(top4000$ensembl_gene_id) top4000 %>% inner_join(top4000.names, by='ensembl_gene_id') %>% knitr::kable()
transf.d <- reshape2::melt(my.df[,c('ix', 'original', 'penalty.factor.degree.old', 'penalty.factor.degree.log', 'penalty.factor.degree.new', 'penalty.factor.orphan')] %>% as_tibble, id.vars = c('ix'), variable.name = 'Type') levels(transf.d$Type) <- levels(transf.d$Type) %>% gsub('original', 'Original', .) %>% gsub('penalty.factor.degree.old', 'Degree (old)', .) %>% gsub('penalty.factor.degree.log', 'Degree (log(old))', .) %>% gsub('penalty.factor.degree.new', 'Degree (heuristic)', .) %>% gsub('penalty.factor.orphan', 'Orphan', .) if (!params$calc.params.old) { transf.d <- transf.d %>% filter(Type != 'Degree (old)') %>% filter(Type != 'Degree (log(old))') } #degree.factor.freq.plot <- ggplot(transf.d) + geom_freqpoly(aes(value, color = Type), alpha = 0.8, bins = 200, size = 1.5) + theme_minimal() + theme(legend.position = 'none') + facet_wrap( ~ Type, ncol = 1, scale = 'free_x') + scale_y_continuous(trans = 'log10') + xlab('Penalty') + ylab('Frequency count (log10 scale)') + ggtitle('Non scale on X')
transf.d <- reshape2::melt(my.df[,c('ix', 'original', 'penalty.factor.degree.old', 'penalty.factor.degree.log', 'penalty.factor.degree.new', 'penalty.factor.orphan')], id.vars = c('ix'), variable.name = 'Type') levels(transf.d$Type) <- levels(transf.d$Type) %>% gsub('original', 'Original', .) %>% gsub('penalty.factor.degree.old', 'Degree (old)', .) %>% gsub('penalty.factor.degree.log', 'Degree (log(old))', .) %>% gsub('penalty.factor.degree.new', 'Degree (heuristic)', .) %>% gsub('penalty.factor.orphan', 'Orphan', .) if (!params$calc.params.old) { transf.d <- transf.d %>% filter(Type != 'Degree (old)') %>% filter(Type != 'Degree (log(old))') } ggplot(transf.d) + geom_freqpoly(aes(value, color = Type), alpha = 0.8, bins = 200, size = 1.5) + theme_minimal() + theme(legend.position = 'none') + facet_wrap( ~ Type, ncol = 1, scale = 'free_x') + scale_y_continuous(trans = 'log10') + scale_x_continuous(trans = 'log10') + xlab('Penalty (log10 scale)') + ylab('Frequency count (log10 scale)') + ggtitle('Log scale on X')
calc.params <- params[c('subset', 'train', 'calc.params.old', 'seed', 'target.vars', 'balanced.sets')] model.results <- run.cache(call.results, xdata, ydata, penalty.factor.degree.new, penalty.factor.degree.old, penalty.factor.orphan, calc.params, no.plot = TRUE, no.models = TRUE, cache.digest = list(xdata.digest.cache, ydata.digest.cache), cache.prefix = 'big-diff') # xdata.ix <- model.results$xdata.ix varnames <- model.results$varnames coefs <- model.results$coefs result <- model.results$result lambdas <- model.results$lambdas km.train <- model.results$metric$km.train km.test <- model.results$metric$km.test c.index.train <- model.results$metrics$c.index.train c.index.test <- model.results$metrics$c.index.test
xdata.test <- xdata[model.results$ixs$test,] ydata.test <- ydata[model.results$ixs$test,] # xdata.train <- xdata[model.results$ixs$train,] ydata.train <- ydata[model.results$ixs$train,] flog.info('Size of sets: (size/events)\n * Train: %.2f%% :: %4d / %4d\n * Test: %.2f%% :: %4d /%4d', params$train * 100, nrow(xdata.train), sum(ydata.train$status), (1 - params$train) * 100, nrow(xdata.test), sum(ydata.test$status))
flog.info('Number of variables per model:') lapply(model.results$coefs, function(ix){ sum(ix != 0) }) %>% melt() %>% dplyr::rename(nvars = value, Model = L1) %>% mutate(BaseModel = prettify.labels(Model, 'base.model'), TargetVars = prettify.labels(Model, 'target.vars'), Alpha = prettify.labels(Model, 'alpha'), Model = prettify.labels(Model, 'model')) %>% dplyr::select(Model, BaseModel, Alpha, TargetVars, nvars) %>% knitr::kable() flog.info('note, selected variables could be slightly different from target, to have more accuracy increase nlambda in code')
Calculated using the inferred models and the train/test datasets. The higher the better.
this.list <- list() for (ix.name in names(model.results$coefs)) { this.list[[ix.name]] <- list(model = model.results$coefs[[ix.name]], target = lambdas[[ix.name]]) } fit.df <- run.cache(build.fit.df, this.list, xdata.train, xdata.test, xdata.ix, cache.prefix = 'build.fit.df')
show.relative.risk.distribution(fit.df, 'Test')
show.relative.risk.distribution(fit.df, 'Train')
my.km.train <- list() my.km.test <- list() ix <- 1 for (ix.name in names(model.results$coefs)) { nvars <- sum(0 != model.results$coefs[[ix.name]]) ix.alpha <- prettify.labels(ix.name, 'alpha') %>% as.double my.km.train[[ix]] <- my.draw.kaplan(list(ix.name = coefs[[ix.name]]), xdata.train[, xdata.ix], ydata.train, plot.title = 'Train set')$plot + km.name('Train', ix.alpha, ix.name, nvars) my.km.train[[ix]]$base.model <- ix.name %>% prettify.labels(ret.value = 'base.model') my.km.train[[ix]]$model <- ix.name %>% prettify.labels(ret.value = 'model') my.km.test[[ix]] <- my.draw.kaplan(list(ix.name = coefs[[ix.name]]), xdata.test[, xdata.ix], ydata.test, plot.title = 'Test set')$plot + km.name('Test', ix.alpha, ix.name, nvars) my.km.test[[ix]]$base.model <- ix.name %>% prettify.labels(ret.value = 'base.model') my.km.test[[ix]]$model <- ix.name %>% prettify.labels(ret.value = 'model') ix <- ix + 1 }
cat('\n\n') for(ix in seq_along(my.km.train)) { cat('#### ', my.km.test[[ix]]$model, ' (base model: ', my.km.test[[ix]]$base.model, ') {- .tabset .tabset-fade .tabset-pills}', sep = '') cat('\n\n') cat('##### Test set {-}') cat('\n\n') print(my.km.test[[ix]]$plot) cat('\n\n') cat('##### Training set {-}') cat('\n\n') print(my.km.train[[ix]]$plot) cat('\n\n') }
table.data <- data.frame() for (ix.name in names(model.results$coefs)) { table.data <- rbind(table.data, data.frame( weighted = !params$degree.unweighted, penalization = params$degree.type, project = params$project, tissue = params$tissue, cutoff = params$degree.cutoff, coding.genes = params$coding.genes, alpha = prettify.labels(ix.name, 'alpha'), model = prettify.labels(ix.name, 'model'), nvars = sum(model.results$coefs[[ix.name]] != 0), km.train = km.train[[ix.name]], km.test = km.test[[ix.name]], c.index.train = c.index.train[[ix.name]], c.index.test = c.index.test[[ix.name]])) } knitr::kable(table.data)
coefs.v <- list() non.zero.df.result <- NULL for (ix.name in names(model.results$coefs)){ # flog.info('Working on %s', ix.name) coefs.v[[ix.name]] <- model.results$coefs[[ix.name]] non.zero.ix <- which(coefs.v[[ix.name]] != 0) # names.tmp <- varnames[non.zero.ix] coef.tmp <- coefs.v[[ix.name]][non.zero.ix] if (length(coefs.v) == 1) { # flog.info(' * Len == 0') non.zero.df.result <- data.frame(gene.id = names.tmp, coef = coef.tmp) first.name <- ix.name } else { # flog.info(' * Len > 0') non.zero.df2 <- data.frame(gene.id = names.tmp, coef = coef.tmp) suffix <- c('', sprintf('.%s', ix.name)) non.zero.df.result <- dplyr::full_join(non.zero.df.result, non.zero.df2, by = c('gene.id'), suffix = suffix) } } colnames(non.zero.df.result)[2] <- sprintf('coef.%s', first.name) # non.zero.df.out <- cbind(non.zero.df.result, degree = as.vector(original.penalty.factor[non.zero.df.result$gene.id])) gene.ids <- non.zero.df.out$gene.id
gene.names <- geneNames(gene.ids) new.t <- non.zero.df.out ixs <- gene.names[unlist(sapply(new.t$gene.id, function(s) { which(gene.names$ensembl_gene_id == s) })),] new.t$gene.id[new.t$gene.id %in% ixs$ensembl_gene_id] <- ixs$external_gene_name gene.ids <- non.zero.df.out$gene.id non.zero.df.out$gene.id <- gsub('ENSG0*', 'ENSG', non.zero.df.out$gene.id) non.zero.df.out$gene.name <- new.t$gene.id col.ix <- colnames(non.zero.df.out) %in% c('gene.id', 'gene.name', 'degree') non.zero.df.out <- non.zero.df.out[,c(which(col.ix), which(!col.ix))]
all.names <- names(model.results$coefs) all.names <- all.names[grep('^(degree_new--)|^(orphan--)|^(glmnet--)', all.names)] all.model <- unique(gsub('^([a-zAZ]+)--(.+)$', '\\1', gsub('_new', '', all.names))) all.origin <- gsub('^([a-zAZ]+)--(.+)$', '\\2', gsub('_new', '', all.names)) cat('\n\n') for (type.ix in seq_along(unique(all.origin))) { type <- unique(all.origin)[type.ix] cat('\n\n') cat('##### Base model:', prettify.labels(type, 'base.model'), 'with Target Vars:', prettify.labels(type, 'alpha'), 'and Alpha:', prettify.labels(type, 'target.vars'), ' {-}') cat('\n\n') ll <- list() for (ix.name in all.names[which(grepl(type, all.names))]) { label <- sprintf('%s (%d)', prettify.labels(ix.name, 'model'), sum(!is.na(non.zero.df.out[, sprintf('coef.%s', ix.name)]))) ll[[label]] <- sort(non.zero.df.out$gene.id[!is.na(non.zero.df.out[, sprintf('coef.%s', ix.name)])]) } tryCatch({ draw.3venn(ll, paste0(prettify.labels(type, 'base.model'), ' base model with Target Vars: ', prettify.labels(type, 'alpha'), ' and Alpha: ', prettify.labels(type, 'target.vars'))) #vv <- Venn(ll) #gridExtra::grid.arrange(grid::grid.grabExpr(plot(vv, doWeights = FALSE)), top= prettify.labels(type, 'string')) }, error = function(err) { flog.error('Probel with %s', err)}) cat('\n\n') }
(not running at the moment)
knitr::kable(non.zero.df.out)
my.hallmarks <- hallmarks(non.zero.df.out$gene.name) df.no.hallmarks <- data.frame(gene.name = my.hallmarks$no.hallmakrs, stringsAsFactors = FALSE) for (ix.name in names(model.results$coefs)) { df.no.hallmarks[, ix.name] <- (df.no.hallmarks$gene.name %in% non.zero.df.out[!is.na(non.zero.df.out[[paste0('coef.',ix.name)]]), 'gene.name']) * 1 } melt(df.no.hallmarks, id.vars = 'gene.name') %>% mutate(variable = prettify.labels(variable, 'string.short')) %>% filter(value > 0) %>% ggplot(aes(gene.name,variable, fill=value)) + geom_raster() + theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position = 'none')
#*(not running at the moment) df.scaled <- my.hallmarks$hallmarks df.scaled <- data.frame(gene.name = rownames(df.scaled), df.scaled, stringsAsFactors = FALSE) for (ix.name in names(model.results$coefs)) { df.scaled[, ix.name] <- (df.scaled$gene.name %in% non.zero.df.out[!is.na(non.zero.df.out[[paste0('coef.',ix.name)]]), 'gene.name']) * 1 } for (ix.name in names(model.results$coefs)) { if (sum(df.scaled[[ix.name]] == 1) == 0) { flog.warn('No hallmark genes for: %s', ix.name) next } df.scaled.filter <- df.scaled[df.scaled[[ix.name]] == 1, !colnames(df.scaled) %in% names(model.results$coefs)] df.melt <- melt(df.scaled.filter, id.vars = c('gene.name')) %>% filter(value > 0) print( ggplot(df.melt, aes(gene.name,variable, fill=value)) + geom_raster() + theme(axis.text.x = element_text(angle = 90, hjust = 1)) + ggtitle(prettify.labels(ix.name, 'string.short')) + ggplot2::xlab('External Gene Name') + ggplot2::ylab('') + ggplot2::scale_fill_gradientn(colours = rev(grDevices::topo.colors(2))) ) }
r params$ntimes
Runsif (params$train >= 1) { flog.info("Train and test sets are the same, won't calculate %d times with random seeds", params$train) big.df <- data.frame() } else { suppressWarnings(rm(xdata.train, xdata.test, ydata.train, ydata.test)) new.params <- calc.params new.params$ntimes <- max(c(1, params$ntimes)) result.env <- new.env() if (grepl('brca', params$project)) { load('/ssd_home/averissimo/work/rpackages/network.cox-cache/1e53/cache-multiple.runs-H_1e53e21c7ba906c89169bacf3da11ae90369d3743c8ba78ec5867af3c1ebc882.RData', envir = result.env) # 960 - BRCA } else if (grepl('kirc', params$project)) { load('/ssd_home/averissimo/work/rpackages/network.cox-cache/8e28/cache-multiple.runs-H_8e2877da3b6a9792325bf66799a6afe5610fec894bea56decd50beb30a8a165d.RData', envir = result.env) # 463 - KIRC } else if (grepl('', params$project)) { load('/ssd_home/averissimo/work/rpackages/network.cox-cache/90ce/cache-multiple.runs-H_90ce3dd68b7db8d9b7bb4416c32f5d53b2ca8b89a85f4510b814ed61fe7f4ccd.RData', envir = result.env) # 445 - PRAD } else if (grepl('luad', params$project)) { load('/ssd_home/averissimo/work/rpackages/network.cox-cache/aaf0/cache-multiple.runs-H_aaf0da66649359d48a743131b629fc2196ba00f5ee6f27888d31401631ea1b69.RData', envir = result.env) # 451 - LUAD } else if (grepl('ov', params$project)) { load('/ssd_home/averissimo/work/rpackages/network.cox-cache/7324/cache-multiple.runs-H_73244f69140486d2e070daa1094a3645b1e41b5968de8c19400729e66ffe045b.RData', envir = result.env) # 370 - OV } else if (grepl('lgg', params$project)) { load('/ssd_home/averissimo/work/rpackages/network.cox-cache/bf2f/cache-multiple.runs-H_bf2ffd249afb1ac28b7940e8922e4a15d804a2422798b1299da6cdfb7fd56940.RData', envir = result.env) # 448 - LGG } else if (grepl('skcm', params$project)) { load('/ssd_home/averissimo/work/rpackages/network.cox-cache/cf52/cache-multiple.runs-H_cf52fc811e3de59c3a0db1e940807ca4b06e506bef317476f026e36e48cca234.RData', envir = result.env) # 351 - skcm } else if (grepl('ucec', params$project)) { load('/ssd_home/averissimo/work/rpackages/network.cox-cache/d9e4/cache-multiple.runs-H_d9e4cfbb3055b54907b8e2b0370adc5db7aaaf68137001899b31083a9ca16d92.RData', envir = result.env) # 517 - UCEC } else { result.env$result <- run.cache(build.multiple.runs, new.params, xdata, ydata, penalty.factor.degree.new, penalty.factor.degree.old, penalty.factor.orphan, xdata.digest.cache, ydata.digest.cache, # cache.prefix = 'multiple.runs') } ntimes.results.list <- result.env$result rm(result) ntimes.results <- ntimes.results.list$ntimes.results varnames <- ntimes.results.list$varnames multiple.runs <- run.cache(handle.multiple, ntimes.results, cache.prefix = 'handle.multiple') big.df <- multiple.runs$big.df rank.df <- multiple.runs$rank.df values.df <- multiple.runs$values.df rm(multiple.runs) }
r if(nrow(big.df) == 0) {"<!--"}
big.df %>% dplyr::filter(grepl('Test set', metric)) %>% dplyr::group_by(metric, model) %>% dplyr::summarise(mean = round(mean(values),3), std = round(sd(values),3), median = round(median(values),3), min = format(min(values), digits = 3), .groups = 'keep') %>% dplyr::mutate(base.model = prettify.labels(model, 'base.model'), target.vars = prettify.labels(model, 'target.vars'), alpha = prettify.labels(model, 'alpha'), model = prettify.labels(model, 'model')) %>% dplyr::select(metric, model, base.model, target.vars, alpha, mean, std, median, min) %>% knitr::kable()
big.df %>% mutate(base.model = prettify.labels(model, 'string.base'), string.model = prettify.labels(model, 'string.short'), model.name = prettify.labels(model, 'model')) %>% #filter(model %in% c('glmnet.classic.cv.28', 'degree_new.classic.cv.28', 'orphan.classic.cv.28')) %>% filter(metric %in% c('C-Index (Test set)')) %>% ggplot() + #geom_freqpoly(aes(values, color = model.name), alpha = .75, bins = 100) + #stat_density(aes(values, color = model.name),position="identity",geom="line", adjust = 1/5) + geom_density(aes(values, color = model.name, fill = model.name), alpha = .1, adjust = 1/5) + facet_wrap( ~ base.model, ncol = 3) + ggtitle('Full distribution C-Index (Test set)') + theme_minimal() + theme(legend.position = 'bottom')
filter.by <- 'C-Index (Test set)' rank.df %>% filter(type == filter.by) %>% dplyr::select(base.model, X1, X2, X3) %>% reshape2::melt(id.vars = 'base.model') %>% { ggplot(.) + geom_histogram(aes(value, fill = variable), stat = 'count') + ggtitle(filter.by, subtitle = 'Ranks for each of models, X1 is best, Xn is the worst') + theme(legend.position = 'right') + facet_wrap( ~ base.model, ncol = 1) }
filter.by <- 'C-Index (Test set)' types <- c('Elastic Net', 'Hub') values.df %>% filter(type == filter.by) %>% dplyr::select(c('base.model', types)) %>% mutate(aa = UQ(as.name(types[1])), bb = UQ(as.name(types[2]))) %>% group_by(base.model) %>% dplyr::summarise(r.squared = (caret::postResample(pred = aa, obs = bb)['Rsquared']), .groups = 'keep') %>% knitr::kable() values.df %>% filter(type == filter.by) %>% { lims <- c(min(.[types]), max(.[types])) ggplot(.) + geom_point(aes_(x = as.name(types[1]), y = as.name(types[2]), color = as.name('base.model')), alpha = .3) + expand_limits(x = lims, y = lims) + facet_wrap( ~ base.model, ncol = 3) + theme(legend.position = 'none') + coord_equal() }
filter.by <- 'C-Index (Test set)' types <- c('Elastic Net', 'Orphan') values.df %>% filter(type == filter.by) %>% dplyr::select(c('base.model', types)) %>% mutate(aa = UQ(as.name(types[1])), bb = UQ(as.name(types[2]))) %>% group_by(base.model) %>% dplyr::summarise(r.squared = (caret::postResample(pred = aa, obs = bb)['Rsquared']), .groups = 'keep') %>% knitr::kable() values.df %>% filter(type == filter.by) %>% { lims <- c(min(.[types]), max(.[types])) ggplot(.) + geom_point(aes_(x = as.name(types[1]), y = as.name(types[2]), color = as.name('base.model')), alpha = .3) + expand_limits(x = lims, y = lims) + facet_wrap( ~ base.model, ncol = 3) + theme(legend.position = 'none') + coord_equal() }
filter.by <- 'C-Index (Test set)' types <- c('Orphan', 'Hub') values.df %>% filter(type == filter.by) %>% dplyr::select(c('base.model', types)) %>% mutate(aa = UQ(as.name(types[1])), bb = UQ(as.name(types[2]))) %>% group_by(base.model) %>% dplyr::summarise(r.squared = (caret::postResample(pred = aa, obs = bb)['Rsquared']), .groups = 'keep') %>% knitr::kable() values.df %>% filter(type == filter.by) %>% { lims <- c(min(.[types]), max(.[types])) ggplot(.) + geom_point(aes_(x = as.name(types[1]), y = as.name(types[2]), color = as.name('base.model')), alpha = .3) + expand_limits(x = lims, y = lims) + facet_wrap( ~ base.model, ncol = 3) + theme(legend.position = 'none') + coord_equal() }
big.df %>% dplyr::filter(grepl('Train set', metric)) %>% dplyr::group_by(metric, model) %>% dplyr::summarise(mean = round(mean(values),3), std = round(sd(values),3), median = round(median(values),3), min = format(min(values), digits = 3), .groups = 'keep') %>% dplyr::mutate(base.model = prettify.labels(model, 'base.model'), target.vars = prettify.labels(model, 'target.vars'), alpha = prettify.labels(model, 'alpha'), model = prettify.labels(model, 'model')) %>% dplyr::select(metric, model, base.model, target.vars, alpha, mean, std, median, min) %>% knitr::kable()
big.df %>% mutate(base.model = prettify.labels(model, 'string.base'), string.model = prettify.labels(model, 'string.short'), model.name = prettify.labels(model, 'model')) %>% filter(metric %in% c('C-Index (Train set)')) %>% ggplot() + #geom_freqpoly(aes(values, color = model.name), alpha = .75, bins = 100) + #stat_density(aes(values, color = model.name),position="identity",geom="line", adjust = 1/5) + geom_density(aes(values, color = model.name, fill = model.name), alpha = .1, adjust = 1/5) + facet_wrap( ~ base.model, ncol = 3) + ggtitle('Full distribution C-Index (Train set)') + theme_minimal() + theme(legend.position = 'bottom', strip.text = element_text(size=7))
filter.by <- 'C-Index (Train set)' rank.df %>% filter(type == filter.by) %>% dplyr::select(base.model, X1, X2, X3) %>% reshape2::melt(id.vars = 'base.model') %>% { ggplot(.) + geom_histogram(aes(value, fill = variable), stat = 'count') + ggtitle(filter.by, subtitle = 'Ranks for each of models, X1 is best, Xn is the worst') + theme(legend.position = 'right') + facet_wrap( ~ base.model, ncol = 1) }
Distribution of Log-rank test with groups separated by high and low risk groups
big.df %>% mutate(base.model = prettify.labels(model, 'string.base'), string.model = prettify.labels(model, 'string.short'), model.name = prettify.labels(model, 'model')) %>% filter(metric %in% c('Log-rank (Test set)')) %>% ggplot() + geom_vline(xintercept = .05, linetype = 2, alpha = .2) + annotate("text", x = 0.05, y = Inf, label = ' 0.05', alpha = .2, hjust = 0, vjust = 2) + stat_ecdf(aes(values, color = model.name)) + facet_wrap( ~ base.model, ncol = 1) + ggtitle('Cumulative distribution Log-rank') + theme_minimal() + theme(legend.position = 'bottom')
Distribution of Log-rank test with groups separated by high and low risk groups
big.df %>% mutate(base.model = prettify.labels(model, 'string.base'), string.model = prettify.labels(model, 'string.short'), model.name = prettify.labels(model, 'model')) %>% filter(metric %in% c('Log-rank (Test set)')) %>% ggplot() + geom_vline(xintercept = .05, linetype = 2, alpha = .2) + annotate("text", x = 0.05, y = Inf, label = ' 0.05', alpha = .2, hjust = 0, vjust = 2) + #geom_freqpoly(aes(values, color = model.name), alpha = .75, bins = 100) + #stat_density(aes(values, color = model.name),position="identity",geom="line", adjust = 1/5) + geom_density(aes(values, color = model.name, fill = model.name), alpha = .1, adjust = 1/5) + facet_wrap( ~ base.model, ncol = 3) + ggtitle('Full distribution Log-rank') + theme_minimal() + theme(legend.position = 'bottom', strip.text = element_text(size=7))
filter.by <- 'Log-rank (Test set)' rank.df %>% filter(type == filter.by) %>% dplyr::select(base.model, X1, X2, X3) %>% reshape2::melt(id.vars = 'base.model') %>% { ggplot(.) + geom_histogram(aes(value, fill = variable), stat = 'count') + ggtitle(filter.by, subtitle = 'Ranks for each of models, X1 is best, Xn is the worst') + theme(legend.position = 'right') + facet_wrap( ~ base.model, ncol = 1) }
filter.by <- 'Log-rank (Test set)' types <- c('Elastic Net', 'Hub') values.df %>% filter(type == filter.by) %>% dplyr::select(c('base.model', types)) %>% mutate(aa = UQ(as.name(types[1])), bb = UQ(as.name(types[2]))) %>% group_by(base.model) %>% dplyr::summarise(r.squared = (caret::postResample(pred = aa, obs = bb)['Rsquared']), .groups = 'keep') %>% knitr::kable() values.df %>% filter(type == filter.by) %>% { lims <- c(min(.[types]), max(.[types])) ggplot(.) + geom_point(aes_(x = as.name(types[1]), y = as.name(types[2]), color = as.name('base.model')), alpha = .3) + expand_limits(x = lims, y = lims) + facet_wrap( ~ base.model, ncol = 3) + theme(legend.position = 'none', strip.text = element_text(size=7)) + coord_equal() }
filter.by <- 'Log-rank (Test set)' types <- c('Elastic Net', 'Orphan') values.df %>% filter(type == filter.by) %>% dplyr::select(c('base.model', types)) %>% mutate(aa = UQ(as.name(types[1])), bb = UQ(as.name(types[2]))) %>% group_by(base.model) %>% dplyr::summarise(r.squared = (caret::postResample(pred = aa, obs = bb)['Rsquared']), .groups = 'keep') %>% knitr::kable() values.df %>% filter(type == filter.by) %>% { lims <- c(min(.[types]), max(.[types])) ggplot(.) + geom_point(aes_(x = as.name(types[1]), y = as.name(types[2]), color = as.name('base.model')), alpha = .3) + expand_limits(x = lims, y = lims) + facet_wrap( ~ base.model, ncol = 3) + theme(legend.position = 'none', strip.text = element_text(size=7)) + coord_equal() }
filter.by <- 'Log-rank (Test set)' types <- c('Orphan', 'Hub') values.df %>% filter(type == filter.by) %>% dplyr::select(c('base.model', types)) %>% mutate(aa = UQ(as.name(types[1])), bb = UQ(as.name(types[2]))) %>% group_by(base.model) %>% dplyr::summarise(r.squared = (caret::postResample(pred = aa, obs = bb)['Rsquared']), .groups = 'keep') %>% knitr::kable() values.df %>% filter(type == filter.by) %>% { lims <- c(min(.[types]), max(.[types])) ggplot(.) + geom_point(aes_(x = as.name(types[1]), y = as.name(types[2]), color = as.name('base.model')), alpha = .3) + expand_limits(x = lims, y = lims) + facet_wrap( ~ base.model, ncol = 3) + theme(legend.position = 'none', strip.text = element_text(size=7)) + coord_equal() }
consensus <- list() all.conse <- list() all.conse.pvalue <- list() var.names <- as.character(colnames(xdata[,xdata.ix])) for (ix in ntimes.results) { if (is.list(ix)) { for (ix.2 in names(ix$coefs)) { if (ix$metrics$km.test[[ix.2]] < 0.05) { all.conse.pvalue[[ix.2]] <- c(all.conse.pvalue[[ix.2]], (ix$coefs[[ix.2]] %>% { var.names[as.logical(. != 0)] } )) } all.conse[[ix.2]] <- c(all.conse[[ix.2]], (ix$coefs[[ix.2]] %>% { var.names[as.logical(. != 0)] } )) } } } all.conse.tbl <- lapply(all.conse, table) all.conse.pvalue.tbl <- lapply(all.conse.pvalue, table)
melt(all.conse.tbl) %>% dplyr::mutate(L1.suffix = factor(prettify.labels(L1, 'string.base')), L1.prefix = factor(prettify.labels(L1, 'model'))) %>% dplyr::mutate(L1 = factor(L1)) %>% dplyr::group_by(L1.suffix) %>% dplyr::add_count(Var1) %>% filter(n > 1) %>% ggplot() + geom_col(aes(Var1, value, color = L1.prefix, fill = L1.prefix), alpha = .4) + facet_wrap( ~ L1.suffix, ncol = 3) + scale_x_discrete(breaks = c()) + scale_fill_discrete('Model') + scale_color_discrete('Model') + theme(legend.position = 'top', axis.text.x = element_blank()) + xlab('Genes') + ylab('Number of times gene is selected (stacked)')
all.names <- names(model.results$coefs) all.names <- all.names[grep('^(degree_new--)|^(orphan--)|^(glmnet--)', all.names)] all.model <- unique(gsub('^([a-zAZ]+)--(.+)$', '\\1', gsub('_new', '', all.names))) all.origin <- gsub('^([a-zAZ]+)--(.+)$', '\\2', gsub('_new', '', all.names)) cat('\n\n') for (type in unique(all.origin)) { cat('\n\n') cat('##### Base model:', prettify.labels(type, 'base.model'), 'with Target Vars:', prettify.labels(type, 'alpha'), 'and Alpha:', prettify.labels(type, 'target.vars'), ' {-}') cat('\n\n') ll <- list() for (ix.name in all.names[which(grepl(type, all.names))]) { label <- sprintf('%s (%d)', prettify.labels(ix.name, 'model'), length(all.conse.tbl[[ix.name]])) ll[[label]] <- sort(names(all.conse.tbl[[ix.name]])) } cat('\n\n') tryCatch({ draw.3venn(ll, paste0(prettify.labels(type, 'base.model'), ' base model with Target Vars: ', prettify.labels(type, 'alpha'), ' and Alpha: ', prettify.labels(type, 'target.vars'))) }, error = function(err) { flog.error('Probel with %s', err)}) cat('\n\n') }
i.e. with pvalue < 0.05
in Log-rank test using the test set.
if(length(all.conse.pvalue.tbl) == 0) { flog.warn('None of the models was statistical significant.') } else { melt(all.conse.pvalue.tbl) %>% mutate(L1.suffix = factor(prettify.labels(L1, 'string.base')), L1.prefix = factor(prettify.labels(L1, 'model'))) %>% mutate(L1 = factor(L1)) %>% group_by(L1.suffix) %>% add_count(Var1) %>% filter(n > 1) %>% ggplot() + geom_col(aes(Var1, value, color = L1.prefix, fill = L1.prefix), alpha = .4) + facet_wrap( ~ L1.suffix, ncol = 3) + scale_x_discrete(breaks = c()) + scale_fill_discrete('Model') + scale_color_discrete('Model') + theme(legend.position = 'top', axis.text.x = element_blank()) + xlab('Genes') + ylab('Number of times gene is selected (stacked)') }
if(length(all.conse.pvalue.tbl) == 0) { flog.warn('None of the models was statistical significant.') } else { all.names <- names(model.results$coefs) all.names <- all.names[grep('^(degree_new--)|^(orphan--)|^(glmnet--)', all.names)] all.model <- unique(gsub('^([a-zAZ]+)--(.+)$', '\\1', gsub('_new', '', all.names))) all.origin <- gsub('^([a-zAZ]+)--(.+)$', '\\2', gsub('_new', '', all.names)) cat('\n\n') for (type in unique(all.origin)) { cat('\n\n') cat('##### Base model', prettify.labels(type, ret.value = 'string.base')) ll <- list() for (ix.name in all.names[which(grepl(type, all.names))]) { label <- sprintf('%s (%d)', prettify.labels(ix.name, 'model'), length(all.conse.pvalue.tbl[[ix.name]])) ll[[label]] <- sort(names(all.conse.pvalue.tbl[[ix.name]])) } tryCatch({ draw.3venn(ll, prettify.labels(type, 'string.base')) }, error = function(err) { flog.error('Probel with %s', err)}) cat('\n\n') } }
i.e. with pvalue < 0.05
in Log-rank test using the test set.
overlapping.genes.names.tmp <- reshape2::melt(all.conse.pvalue.tbl) %>% dplyr::rename(Gene = Var1) %>% dplyr::mutate(L1.suffix = factor(prettify.labels(L1, 'string.base')), L1.prefix = factor(prettify.labels(L1, 'model'))) %>% dplyr::mutate(L1 = factor(L1)) %>% dplyr::group_by(L1.suffix) %>% dplyr::add_count(Gene, name = 'Overlap') %>% dplyr::group_by(Gene, Overlap) %>% dplyr::summarise(total = sum(value), .groups = 'drop') %>% dplyr::arrange(-total) #dplyr::filter(total > 1000) %>% overlapping.genes.names <- loose.rock::run.cache(geneNames, overlapping.genes.names.tmp$Gene) %>% dplyr::distinct() %>% dplyr::right_join(overlapping.genes.names.tmp, by = c("ensembl_gene_id" = "Gene")) %>% dplyr::rename(Gene = external_gene_name) %>% dplyr::select(-ensembl_gene_id) %>% dplyr::arrange(desc(total), desc(Overlap)) overlapping.genes.names %>% dplyr::top_n(20, total) %>% knitr::kable()
cat('\n\n') cat('<sub><sup>') cat(overlapping.genes.names %>% mutate(text = paste0(Gene, '(', total, ')')) %>% pull(text) %>% paste(collapse = ', ')) cat('</sup></sub>') cat('\n\n')
best.ones <- big.df %>% as.tibble() %>% filter(metric == 'C-Index (Test set)') %>% mutate(base.model = prettify.labels(model, 'string.base'), model.name = prettify.labels(model, 'model')) %>% group_by(model.name) %>% arrange(desc(values)) %>% dplyr::slice(1) ydata.month <- ydata %>% mutate(time = time / 365 * 12)
filter.model <- 'Elastic Net' best.model.print(filter.model, best.ones, xdata, ydata.month, ntimes.results)
filter.model <- 'Hub' best.model.print(filter.model, best.ones, xdata, ydata.month, ntimes.results)
filter.model <- 'Orphan' best.model.print(filter.model, best.ones, xdata, ydata.month, ntimes.results)
best.ones <- big.df %>% as.tibble() %>% filter(metric == 'Log-rank (Test set)') %>% mutate(base.model = prettify.labels(model, 'string.base'), model.name = prettify.labels(model, 'model')) %>% group_by(model.name) %>% arrange(values) %>% dplyr::slice(1) ydata.month <- ydata %>% mutate(time = time / 365 * 12)
filter.model <- 'Elastic Net' best.model.print(filter.model, best.ones, xdata, ydata.month, ntimes.results)
filter.model <- 'Hub' best.model.print(filter.model, best.ones, xdata, ydata.month, ntimes.results)
filter.model <- 'Orphan' best.model.print(filter.model, best.ones, xdata, ydata.month, ntimes.results)$plot
r if(nrow(big.df) == 0) {"-->"}
show.params(params)
rmarkdown::render('Analysis.Rmd', output_file = 'test-render.html', output_format = 'html_document') rmarkdown::render('Analysis.Rmd', output_file = 'test-render', output_format = 'pdf_document')
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.