library(magrittr) library(ggplot2) library(parallel) library(Biobase) options(stringsAsFactors = FALSE) devtools::load_all('../') burd = colorRampPalette(colors = c("blue", "white", "red"))(n = 255) rdbu = colorRampPalette(colors = c("red", "white", "blue"))(n = 255) blues = colorRampPalette(colors = c('white', 'blue'))(n = 255) breakList = seq(-1,1,by = 0.01)
norms.simulation = readRDS('../data/simulations_normalized.RDS') METHODS_ORDERED = colnames(norms.simulation[[1]]$cdevMatrix)
Expression pattern of references
par(mfrow=c(1,8)) for (percentDE in names(norms.simulation)) { x = norms.simulation[[as.character(percentDE)]] if (is.null(x$refs.idx) | is.null(x$normResults)) next par(mar=c(1,1,1,1)) layout(matrix(c(rep(1,8), c(2:9)), nrow=2, byrow = TRUE), heights = c(1,8), widths = rep(1,8) ) # dev.off() plot.new() text(0.5, 0.5, sprintf("DE percentage: %s",percentDE)) for (processing in names(x$normResults)) { plot.matrix(log2(x$normResults[[processing]][x$refs.idx,]+1), asp = 5/3, main = processing) } }
Expression pattern of all genes
# par(mfrow=c(1,8)) for (percentDE in names(norms.simulation)) { x = norms.simulation[[as.character(percentDE)]] diff.idx = setdiff(1:nrow(x$normResults[[1]]), x$refs.idx) diff.idx.shuffle = sample(diff.idx,size =length(diff.idx), replace = FALSE) if (is.null(x$refs.idx) | is.null(x$normResults)) next par(mar=c(1,1,1,1)) layout(matrix(c(rep(1,8), c(2:9)), nrow=2, byrow = TRUE), heights = c(1,8), widths = rep(1,8) ) # dev.off() plot.new() text(0.5, 0.5, sprintf("DE percentage: %s",percentDE)) for (processing in names(x$normResults)) { plot.matrix(log2(x$normResults[[processing]][c(diff.idx.shuffle,x$refs.idx),]+1), asp = 5/3, main = processing) } }
breakList_cdev = seq(1,10,0.1) reds = colorRampPalette(c('white', 'red'))(n=length(breakList_cdev)) for (percentDE in names(norms.simulation)) { x = norms.simulation[[percentDE]] pheatmap::pheatmap(x$cdevMatrix, cellwidth = 30, cellheight = 30, display_numbers = TRUE, cluster_rows = FALSE, cluster_cols = FALSE, main = percentDE, color = reds, breaks = breakList_cdev # filename = sprintf('sim-percentDE_%s-cdevMatrix.png') ) }
percentDEs = as.numeric(names(norms.simulation)) p_cdev = sapply(names(norms.simulation), function(percentDE) { x = norms.simulation[[percentDE]] x$cdevMatrix['Ground-truth',] }) %>% reshape2::melt() %>% set_names(c('Method', 'percentDE', 'cdev')) %>% subset(.$Method != 'Ground-truth',TRUE) %>% ggplot(aes(x=percentDE,y=cdev,color=factor(Method, levels = METHODS_ORDERED))) + guides(color=guide_legend(title = 'Method')) + geom_line() + geom_point() + xlab('DE Fraction') + ggtitle('Simulation with heterogeneous FC') + theme_bw(base_size=13)
for (percentDE in names(norms.simulation)) { x = norms.simulation[[as.character(percentDE)]] x$fc_mse = lapply(names(x$normResults), function(norm_method) { mean(logFC(x$normResults[[norm_method]][x$refs.idx,],groups = c(rep(1,20), rep(2,20)), log.base = 2, pseudocount = 1)^2) }) %>% set_names(names(x$normResults)) norms.simulation[[percentDE]] = x } str(norms.simulation[['0.9']]) as.numeric(norms.simulation[['0.9']]$fc_mse)
sims_norms.de_results = lapply(names(norms.simulation), function(percentDE) { x = norms.simulation[[as.character(percentDE)]] x.cnt = x$normResults[['raw']] # Pick the true references with minimum counts > 0, # reverse-engineer the scaling factors an_eligible_ref = x$refs.idx[which(apply(x.cnt[x$refs.idx,], MARGIN = 1, min) > 0)][1] lapply(names(x$normResults), function(processing) { scalingFactors = x.cnt[an_eligible_ref,] / x$normResults[[processing]][an_eligible_ref,] de.test.limma(x.cnt, group = c(rep(1,20),rep(2,20)), norm.factors = scalingFactors) %>% limma::topTable(n = Inf, sort.by = 'none', adjust.method = 'BH') }) %>% set_names(names(x$normResults)) }) %>% set_names(names(norms.simulation))
sims_norms.de_performance = lapply(names(norms.simulation), function(percentDE) { x = norms.simulation[[percentDE]] lapply(names(x$normResults), function(processing) { y = x$normResults[[processing]] de = sims_norms.de_results[[percentDE]][[processing]] deg.idx = which(de$adj.P.Val < 0.01) true_deg.idx = setdiff(1:nrow(y), x$refs.idx) is.diff = rep(TRUE, nrow(y)) is.diff[x$refs.idx] = FALSE auc = auroc(1-de$adj.P.Val, is.diff) TP = length(intersect(true_deg.idx, deg.idx)) FP = length(setdiff(deg.idx, true_deg.idx)) FN = length(setdiff(setdiff(1:nrow(y), deg.idx), x$refs.idx)) data.frame('Method' = processing, 'percentDE' = as.numeric(percentDE), 'cdev.vs.oracle' = x$cdevMatrix[processing,'Ground-truth'], 'fc_mse' = x$fc_mse[[processing]], 'AUC' = auc, 'TPR' = TP/length(true_deg.idx), 'FDR' = FP / length(deg.idx), 'FPR' = FP / length(x$refs.idx) # 'FNR' = 1 - TPR ) }) }) %>% unlist(recursive = FALSE) %>% do.call(rbind, .) sims_norms.de_performance$FNR = 1 - sims_norms.de_performance$TPR p_auc = ggplot(sims_norms.de_performance,aes(x=percentDE,y=AUC,color=factor(Method,levels=METHODS_ORDERED))) + geom_point() + geom_line() + xlim(c(0,1)) + ylim(c(0,1)) + guides(color=guide_legend(title = 'Method')) + xlab('DE percentage') + # ggtitle('Simulation with heterogeneous FC') + theme_bw(base_size=13) p_tpr = ggplot(sims_norms.de_performance,aes(x=percentDE,y=TPR,color=factor(Method,levels=METHODS_ORDERED))) + geom_point() + geom_line() + xlim(c(0,1)) + ylim(c(0,1)) + guides(color=guide_legend(title = 'Method')) + xlab('DE percentage') + # ggtitle('Simulation with heterogeneous FC') + theme_bw(base_size=13) p_fdr = ggplot(sims_norms.de_performance,aes(x=percentDE,y=FDR,color=factor(Method,levels=METHODS_ORDERED))) + geom_point() + geom_line() + xlim(c(0,1)) + ylim(c(0,1)) + guides(color=guide_legend(title = 'Method')) + xlab('DE percentage') + # ggtitle('Simulation with heterogeneous FC') + theme_bw(base_size=13) p_fpr = ggplot(sims_norms.de_performance,aes(x=percentDE,y=FPR,color=factor(Method,levels=METHODS_ORDERED))) + geom_point() + geom_line() + xlim(c(0,1)) + ylim(c(0,1)) + guides(color=guide_legend(title = 'Method')) + xlab('DE percentage') + # ggtitle('Simulation with heterogeneous FC') + theme_bw(base_size=13) p_fnr = ggplot(sims_norms.de_performance,aes(x=percentDE,y=FNR,color=factor(Method,levels=METHODS_ORDERED))) + geom_point() + geom_line() + xlim(c(0,1)) + ylim(c(0,1)) + guides(color=guide_legend(title = 'Method')) + xlab('DE percentage') + # ggtitle('Simulation with heterogeneous FC') + theme_bw(base_size=13) print(p_auc) print(p_tpr) print(p_fdr) print(p_fpr) print(p_fnr)
Combining plots for publication
p = egg::ggarrange(p_cdev + theme(axis.title.x = element_blank()), p_fpr + theme(axis.title.x = element_blank()), p_auc, ncol = 1) ggsave('comparison-sims-cdev_FPR_AUC.png', plot = p)
egg::ggarrange(p_fpr + theme(legend.position = 'none') , p_fnr, nrow = 1)
ggplot(sims_norms.de_performance) + facet_wrap('percentDE') + geom_point(aes(x=cdev.vs.oracle,y=AUC,color=factor(Method,levels=METHODS_ORDERED))) + guides(color=guide_legend(title = 'Method')) + theme_bw(base_size=13) p_cdev_vs_auc = ggplot(sims_norms.de_performance[sims_norms.de_performance$percentDE == 0.9,]) + geom_point(aes(x=cdev.vs.oracle,y=AUC,color=factor(Method,levels=METHODS_ORDERED)),size=5) + guides(color=guide_legend(title = 'Method')) + ggtitle('Simulation with heterogeneous FC,\npercentDE = 0.9') + theme_bw(base_size=13) + coord_fixed(ratio = 4) + ylim(c(0,1)) + xlab('cdev(X,A)') print(p_cdev_vs_auc) ggsave('comparison-sims-cdev_vs_AUC-percentDE0.9.png', plot = p_cdev_vs_auc)
percentDEs = as.numeric(names(norms.simulation)) p_mse = lapply(names(norms.simulation), function(percentDE) { x = norms.simulation[[percentDE]] data.frame('percentDE' = percentDE, 'Method' = names(x$fc_mse), 'fc_mse' = as.numeric(x$fc_mse)) }) %>% do.call(rbind, .) %>% ggplot(aes(x=percentDE,y=fc_mse,group=Method,color=factor(Method, levels = METHODS_ORDERED))) + guides(color=guide_legend(title = 'Method')) + geom_line() + geom_point() + ggtitle('Simulation with heterogeneous FC') + xlab('DE Fraction') + ylab('Fold-change MSE') + theme_bw(base_size=13) print(p_mse)
ggplot(sims_norms.de_performance) + facet_wrap('percentDE') + geom_point(aes(x=fc_mse,y=AUC,color=factor(Method,levels=METHODS_ORDERED))) + guides(color=guide_legend(title = 'Method')) + theme_bw(base_size=13) p_mse_vs_auc = ggplot(sims_norms.de_performance[sims_norms.de_performance$percentDE == 0.9,]) + geom_point(aes(x=fc_mse,y=AUC,color=factor(Method,levels=METHODS_ORDERED)),size=5) + guides(color=guide_legend(title = 'Method')) + ggtitle('Simulation with heterogeneous FC,\npercentDE = 0.9') + theme_bw(base_size=13) + coord_fixed(ratio = 2) + ylim(c(0,1)) + xlab('Fold-change MSE') print(p_mse_vs_auc) ggsave('comparison-sims-fcmse_vs_AUC-percentDE0.9.png', plot = p_mse_vs_auc)
Juxtaposition of plots to compare cdev and MSE
p_cdev_mse = egg::ggarrange(p_mse + theme(plot.title = element_blank(), legend.position = "none"), p_cdev + theme(plot.title = element_blank()), p_mse_vs_auc + theme(plot.title = element_blank(), legend.position = "none"), p_cdev_vs_auc + theme(plot.title = element_blank()), ncol = 2, byrow = TRUE) ggsave('comparison-sims-mse+cdev.png', plot = p_cdev_mse)
env.norms = readRDS('../data/real_data_normalized.RDS')
load('../data/data_container.Rdata') DATASET_PREFIXES = c('rbm1', 'rbm2', 'rnor1', 'rnor2', 'mpreg1a', 'mpreg1b', 'mpreg2a', 'mpreg2b', 'dmelAC', 'dmelAV','sarcoma', 'lymphoma', 'yfv', 'xtro1m', 'xtro2')
Expression pattern of ERCC spike-ins
# par(mfrow=c(1,7)) for (prefix in DATASET_PREFIXES) { REFS.idx = get0(paste0(prefix,'.refs.idx'), envir = data.container) x.normResults = get0(paste0(prefix,'.normResults'), envir = env.norms) if (is.null(REFS.idx) | is.null(x.normResults)) next png(filename = sprintf("%s_matrices.png",prefix),width =1400,height = 400) par(mar=c(1,1,1,1)) layout(matrix(c(rep(1,8), c(2:9)), nrow=2, byrow = TRUE), heights = c(1,8), widths = rep(1,8) ) plot.new() text(0.5, 0.5, prefix) for (processing in names(x.normResults)) { plot.matrix(log2(x.normResults[[processing]][REFS.idx,]+1), asp = 5/3, main = processing) } dev.off() }
Expression pattern of ERCC spike-ins and random set of internal genes
# par(mfrow=c(1,7)) for (prefix in DATASET_PREFIXES) { REFS.idx = get0(paste0(prefix,'.refs.idx'), envir = data.container) x.normResults = get0(paste0(prefix,'.normResults'), envir = env.norms) rand.idx = sample(setdiff(1:nrow(x.normResults[[1]]), REFS.idx), size = length(REFS.idx)) if (is.null(REFS.idx) | is.null(x.normResults)) next par(mar=c(1,1,1,1)) layout(matrix(c(rep(1,8), c(2:9)), nrow=2, byrow = TRUE), heights = c(1,8), widths = rep(1,8) ) # dev.off() plot.new() text(0.5, 0.5, prefix) for (processing in names(x.normResults)) { plot.matrix(log2(x.normResults[[processing]][c(rand.idx,REFS.idx),]+1), asp = 5/3, main = processing) } }
sample_grouping = list( 'rbm1' = 'Organ', 'rbm2' = 'Organ', 'rnor1' = 'Chemical', 'rnor2' = 'Chemical', 'mpreg1a' = 'BrainRegion', 'mpreg1b' = 'Stage', 'mpreg2a' = 'BrainRegion', 'mpreg2b' = 'Stage', 'dmelAC' = c('Environment', 'FlowcellId'), 'dmelAV' = 'Environment', 'sarcoma' = c('Tissue', 'Sex', 'Metastasis'), 'lymphoma' = c('CellLine', 'Treatment'), 'yfv' = 'Infection' # 'xtro1m' = 'Time.hpf.', # Time-series data need to be visualized differently # 'xtro2' = 'Time.hpf.', ) for (data_prefix in names(sample_grouping)) { selectedPheno = sample_grouping[[data_prefix]] x.pheno = get(paste0(data_prefix, '.pheno'), envir = data.container) x.cnt = get(paste0(data_prefix, '.cnt'), envir = data.container) x.normed = get(paste0(data_prefix, '.normed'), envir = data.container) x.normResults = get(paste0(data_prefix, '.normResults'), envir = env.norms) x.refs.idx = get(paste0(data_prefix, '.refs.idx'), envir = data.container) x.pca.df = list( project.pca(t(x.cnt), selected.pc = c(1,2)) %>% cbind(x.pheno[,selectedPheno, drop=FALSE], 'Processing' = 'raw', 'Subset' = 'all'), project.pca(t(x.normed), selected.pc = c(1,2)) %>% cbind(x.pheno[,selectedPheno, drop=FALSE], 'Processing' = 'normalized', 'Subset' = 'all'), project.pca(t(x.normResults[['PoissonSeq']]), selected.pc = c(1,2)) %>% cbind(x.pheno[,selectedPheno, drop=FALSE], 'Processing' = 'PoissonSeq', 'Subset' = 'all'), project.pca(t(x.cnt[x.refs.idx,]), selected.pc = c(1,2)) %>% cbind(x.pheno[,selectedPheno, drop=FALSE], 'Processing' = 'raw', 'Subset' = 'ERCC spike-ins'), project.pca(t(x.normed[x.refs.idx,]), selected.pc = c(1,2)) %>% cbind(x.pheno[,selectedPheno, drop=FALSE], 'Processing' = 'normalized', 'Subset' = 'ERCC spike-ins'), project.pca(t(x.normResults[['PoissonSeq']][x.refs.idx,]), selected.pc = c(1,2)) %>% cbind(x.pheno[,selectedPheno, drop=FALSE], 'Processing' = 'PoissonSeq', 'Subset' = 'ERCC spike-ins') ) %>% do.call(rbind, .) for (field in selectedPheno) { p = ggplot(x.pca.df) + facet_grid(Subset ~ factor(Processing,levels=c('raw', 'normalized')) ) + geom_point(aes_string(x='PC1',y='PC2',color=field)) + ggtitle(data_prefix) print(p) ggsave(filename = sprintf('pca-raw_vs_norm-%s-%s.png', data_prefix, field), plot = p) } }
breakList_cdev = seq(1,50,0.2) reds = colorRampPalette(c('white', 'red'))(n=length(breakList_cdev)) for (prefix in DATASET_PREFIXES) { x.cdevMatrix = get(paste0(prefix, '.cdevMatrix'), envir = env.norms) pheatmap::pheatmap(x.cdevMatrix, cellwidth = 30, cellheight = 30, display_numbers = TRUE, main = prefix,cluster_rows = FALSE, cluster_cols = FALSE, color = reds, breaks = breakList_cdev, filename = paste0(prefix, '_cdevMatrix.png')) }
time_points = unique(xtro1m.pheno$Time.hpf.) xtro1m.cnt.pca = project.pca(t(xtro1m.cnt), selected.pc = c(1,2,3)) %>% cbind(xtro1m.pheno[,c('Time.hpf.'), drop=FALSE]) xtro1m.normed.pca = project.pca(t(xtro1m.normed), selected.pc = c(1,2,3)) %>% cbind(xtro1m.pheno[,c('Time.hpf.'), drop=FALSE]) ggplot(xtro1m.cnt.pca) + geom_point(aes(x=PC1,y=PC2,color=Time.hpf.),size=6, alpha=0.4) + scale_color_gradientn(colours = rainbow(length(time_points))) + geom_text(aes(x=PC1,y=PC2,label=Time.hpf.),size=3) + theme(legend.position = "none") ggplot(xtro1m.normed.pca) + geom_point(aes(x=PC1,y=PC2,color=Time.hpf.),size=6, alpha=0.4) + scale_color_gradientn(colours = rainbow(length(time_points))) + geom_text(aes(x=PC1,y=PC2,label=Time.hpf.),size=3) + theme(legend.position = "none")
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.