In ttdtrang/cdev-paper: cdev: a ground-truth based measure to evaluate RNA-seq normalization methods

library(magrittr)
library(ggplot2)
library(parallel)
library(Biobase)
options(stringsAsFactors = FALSE)
devtools::load_all('../')
burd = colorRampPalette(colors = c("blue", "white", "red"))(n = 255)
rdbu = colorRampPalette(colors = c("red", "white", "blue"))(n = 255)
blues = colorRampPalette(colors = c('white', 'blue'))(n = 255)
breakList = seq(-1,1,by = 0.01)

Comparison on simulation

norms.simulation = readRDS('../data/simulations_normalized.RDS')
METHODS_ORDERED = colnames(norms.simulation[[1]]$cdevMatrix)

Expression matrices

Expression pattern of references

par(mfrow=c(1,8))
for (percentDE in names(norms.simulation)) {
    x = norms.simulation[[as.character(percentDE)]] 

    if (is.null(x$refs.idx) | is.null(x$normResults)) next
    par(mar=c(1,1,1,1))
    layout(matrix(c(rep(1,8), c(2:9)), nrow=2, byrow = TRUE),
           heights = c(1,8),
           widths = rep(1,8)
           )
    # dev.off()
    plot.new()
    text(0.5, 0.5, sprintf("DE percentage: %s",percentDE))
    for (processing in names(x$normResults)) {
        plot.matrix(log2(x$normResults[[processing]][x$refs.idx,]+1), asp = 5/3, main = processing)
    }
}

Expression pattern of all genes

# par(mfrow=c(1,8))
for (percentDE in names(norms.simulation)) {
    x = norms.simulation[[as.character(percentDE)]] 
    diff.idx = setdiff(1:nrow(x$normResults[[1]]), x$refs.idx)
    diff.idx.shuffle = sample(diff.idx,size =length(diff.idx), replace = FALSE)
    if (is.null(x$refs.idx) | is.null(x$normResults)) next
    par(mar=c(1,1,1,1))
    layout(matrix(c(rep(1,8), c(2:9)), nrow=2, byrow = TRUE),
           heights = c(1,8),
           widths = rep(1,8)
           )
    # dev.off()
    plot.new()
    text(0.5, 0.5, sprintf("DE percentage: %s",percentDE))
    for (processing in names(x$normResults)) {
        plot.matrix(log2(x$normResults[[processing]][c(diff.idx.shuffle,x$refs.idx),]+1), asp = 5/3, main = processing)
    }
}

cdev matrix

breakList_cdev = seq(1,10,0.1)
reds = colorRampPalette(c('white', 'red'))(n=length(breakList_cdev))
for (percentDE in names(norms.simulation)) {
    x = norms.simulation[[percentDE]]
    pheatmap::pheatmap(x$cdevMatrix,
                       cellwidth = 30, cellheight = 30,
                       display_numbers = TRUE, cluster_rows = FALSE, cluster_cols = FALSE,
                       main = percentDE, color = reds, breaks = breakList_cdev
                       # filename = sprintf('sim-percentDE_%s-cdevMatrix.png')
                       )
}

cdev by DE percentage

percentDEs = as.numeric(names(norms.simulation))
p_cdev = sapply(names(norms.simulation), function(percentDE) {
    x = norms.simulation[[percentDE]]
    x$cdevMatrix['Ground-truth',]
}) %>%
    reshape2::melt() %>%
    set_names(c('Method', 'percentDE', 'cdev')) %>%
    subset(.$Method != 'Ground-truth',TRUE) %>%
    ggplot(aes(x=percentDE,y=cdev,color=factor(Method, levels = METHODS_ORDERED))) +
    guides(color=guide_legend(title = 'Method')) +
    geom_line() +
    geom_point() +
    xlab('DE Fraction') +
    ggtitle('Simulation with heterogeneous FC') +
    theme_bw(base_size=13)

Fold-change MSE by DE percentage

for (percentDE in names(norms.simulation)) {
    x = norms.simulation[[as.character(percentDE)]]
    x$fc_mse = lapply(names(x$normResults), function(norm_method) {
        mean(logFC(x$normResults[[norm_method]][x$refs.idx,],groups = c(rep(1,20), rep(2,20)), log.base = 2, pseudocount = 1)^2)
    }) %>%
        set_names(names(x$normResults))
    norms.simulation[[percentDE]] = x
}
str(norms.simulation[['0.9']])
as.numeric(norms.simulation[['0.9']]$fc_mse)

DE performance

sims_norms.de_results = lapply(names(norms.simulation), function(percentDE) {
    x = norms.simulation[[as.character(percentDE)]]
    x.cnt = x$normResults[['raw']]

    # Pick the true references with minimum counts > 0,
    # reverse-engineer the scaling factors
    an_eligible_ref = x$refs.idx[which(apply(x.cnt[x$refs.idx,], MARGIN = 1, min) > 0)][1]
    lapply(names(x$normResults), function(processing) {
        scalingFactors = x.cnt[an_eligible_ref,] / x$normResults[[processing]][an_eligible_ref,]
        de.test.limma(x.cnt,
                group = c(rep(1,20),rep(2,20)),
                norm.factors = scalingFactors) %>%
            limma::topTable(n = Inf, sort.by = 'none', adjust.method = 'BH')    
    }) %>%
        set_names(names(x$normResults))

}) %>%
    set_names(names(norms.simulation))

sims_norms.de_performance =
    lapply(names(norms.simulation), function(percentDE) {
        x = norms.simulation[[percentDE]]
        lapply(names(x$normResults), function(processing) {
            y = x$normResults[[processing]]

            de = sims_norms.de_results[[percentDE]][[processing]]
            deg.idx = which(de$adj.P.Val < 0.01)
            true_deg.idx = setdiff(1:nrow(y), x$refs.idx)
            is.diff = rep(TRUE, nrow(y))
            is.diff[x$refs.idx] = FALSE
            auc = auroc(1-de$adj.P.Val, is.diff)
            TP = length(intersect(true_deg.idx, deg.idx))
            FP = length(setdiff(deg.idx, true_deg.idx))
            FN = length(setdiff(setdiff(1:nrow(y), deg.idx), x$refs.idx))
            data.frame('Method' = processing,
                       'percentDE' = as.numeric(percentDE),
                       'cdev.vs.oracle' = x$cdevMatrix[processing,'Ground-truth'],
                      'fc_mse' = x$fc_mse[[processing]],
                       'AUC' = auc,
                       'TPR' = TP/length(true_deg.idx),
                       'FDR' = FP / length(deg.idx),
                       'FPR' = FP / length(x$refs.idx)
                       # 'FNR' = 1 - TPR 
                       )
        })
    }) %>%
        unlist(recursive = FALSE) %>%
        do.call(rbind, .)

sims_norms.de_performance$FNR = 1 - sims_norms.de_performance$TPR
p_auc = ggplot(sims_norms.de_performance,aes(x=percentDE,y=AUC,color=factor(Method,levels=METHODS_ORDERED))) +
    geom_point() +
    geom_line() +
    xlim(c(0,1)) +
    ylim(c(0,1)) +
    guides(color=guide_legend(title = 'Method')) +
    xlab('DE percentage') +
    # ggtitle('Simulation with heterogeneous FC') +
    theme_bw(base_size=13)

p_tpr = ggplot(sims_norms.de_performance,aes(x=percentDE,y=TPR,color=factor(Method,levels=METHODS_ORDERED))) +
    geom_point() +
    geom_line() +
    xlim(c(0,1)) +
    ylim(c(0,1)) +
    guides(color=guide_legend(title = 'Method')) +
    xlab('DE percentage') +
    # ggtitle('Simulation with heterogeneous FC') +
    theme_bw(base_size=13)


p_fdr = ggplot(sims_norms.de_performance,aes(x=percentDE,y=FDR,color=factor(Method,levels=METHODS_ORDERED))) +
    geom_point() +
    geom_line() +
    xlim(c(0,1)) +
    ylim(c(0,1)) +
    guides(color=guide_legend(title = 'Method')) +
    xlab('DE percentage') +
    # ggtitle('Simulation with heterogeneous FC') +
    theme_bw(base_size=13)

p_fpr = ggplot(sims_norms.de_performance,aes(x=percentDE,y=FPR,color=factor(Method,levels=METHODS_ORDERED))) +
    geom_point() +
    geom_line() +
    xlim(c(0,1)) +
    ylim(c(0,1)) +
    guides(color=guide_legend(title = 'Method')) +
    xlab('DE percentage') +
    # ggtitle('Simulation with heterogeneous FC') +
    theme_bw(base_size=13)

p_fnr = ggplot(sims_norms.de_performance,aes(x=percentDE,y=FNR,color=factor(Method,levels=METHODS_ORDERED))) +
    geom_point() +
    geom_line() +
    xlim(c(0,1)) +
    ylim(c(0,1)) +
    guides(color=guide_legend(title = 'Method')) +
    xlab('DE percentage') +
    # ggtitle('Simulation with heterogeneous FC') +
    theme_bw(base_size=13)
print(p_auc)
print(p_tpr)
print(p_fdr)
print(p_fpr)
print(p_fnr)

Combining plots for publication

p = egg::ggarrange(p_cdev + theme(axis.title.x = element_blank()),
               p_fpr + theme(axis.title.x = element_blank()),
               p_auc, ncol = 1)
ggsave('comparison-sims-cdev_FPR_AUC.png', plot = p)

egg::ggarrange(p_fpr + theme(legend.position = 'none') ,
               p_fnr, nrow = 1)

cdev vs AUC

ggplot(sims_norms.de_performance) +
    facet_wrap('percentDE') +
    geom_point(aes(x=cdev.vs.oracle,y=AUC,color=factor(Method,levels=METHODS_ORDERED))) + 
    guides(color=guide_legend(title = 'Method'))  +
    theme_bw(base_size=13)

p_cdev_vs_auc = ggplot(sims_norms.de_performance[sims_norms.de_performance$percentDE == 0.9,]) +
    geom_point(aes(x=cdev.vs.oracle,y=AUC,color=factor(Method,levels=METHODS_ORDERED)),size=5) + 
    guides(color=guide_legend(title = 'Method')) +
    ggtitle('Simulation with heterogeneous FC,\npercentDE = 0.9') +
    theme_bw(base_size=13) +
    coord_fixed(ratio = 4) +
    ylim(c(0,1)) +
    xlab('cdev(X,A)')
print(p_cdev_vs_auc)
ggsave('comparison-sims-cdev_vs_AUC-percentDE0.9.png', plot = p_cdev_vs_auc)

Equivalent use of fold-change MSE

Fold-change MSE of various methods

percentDEs = as.numeric(names(norms.simulation))
p_mse = lapply(names(norms.simulation), function(percentDE) {
    x = norms.simulation[[percentDE]]
    data.frame('percentDE' = percentDE,
               'Method' = names(x$fc_mse),
               'fc_mse' = as.numeric(x$fc_mse))
}) %>%
    do.call(rbind, .) %>%
    ggplot(aes(x=percentDE,y=fc_mse,group=Method,color=factor(Method, levels = METHODS_ORDERED))) +
    guides(color=guide_legend(title = 'Method')) +
    geom_line() +
    geom_point() +
    ggtitle('Simulation with heterogeneous FC') +
    xlab('DE Fraction') +
    ylab('Fold-change MSE') +
    theme_bw(base_size=13)
print(p_mse)

Fold-change MSE vs AUC

ggplot(sims_norms.de_performance) +
    facet_wrap('percentDE') +
    geom_point(aes(x=fc_mse,y=AUC,color=factor(Method,levels=METHODS_ORDERED))) + 
    guides(color=guide_legend(title = 'Method'))  +
    theme_bw(base_size=13)

p_mse_vs_auc = ggplot(sims_norms.de_performance[sims_norms.de_performance$percentDE == 0.9,]) +
    geom_point(aes(x=fc_mse,y=AUC,color=factor(Method,levels=METHODS_ORDERED)),size=5) + 
    guides(color=guide_legend(title = 'Method')) +
    ggtitle('Simulation with heterogeneous FC,\npercentDE = 0.9') +
    theme_bw(base_size=13) +
    coord_fixed(ratio = 2) +
    ylim(c(0,1)) +
    xlab('Fold-change MSE')
print(p_mse_vs_auc)
ggsave('comparison-sims-fcmse_vs_AUC-percentDE0.9.png', plot = p_mse_vs_auc)

Juxtaposition of plots to compare cdev and MSE

p_cdev_mse = egg::ggarrange(p_mse + theme(plot.title = element_blank(), legend.position = "none"),
                            p_cdev + theme(plot.title = element_blank()),
                            p_mse_vs_auc + theme(plot.title = element_blank(), legend.position = "none"),
                            p_cdev_vs_auc + theme(plot.title = element_blank()),
                            ncol = 2, byrow = TRUE)
ggsave('comparison-sims-mse+cdev.png', plot = p_cdev_mse)

Comparison on real data

Expression matrices

env.norms = readRDS('../data/real_data_normalized.RDS')

load('../data/data_container.Rdata')
DATASET_PREFIXES = c('rbm1', 'rbm2', 'rnor1', 'rnor2',
                     'mpreg1a', 'mpreg1b', 'mpreg2a', 'mpreg2b',
                     'dmelAC', 'dmelAV','sarcoma', 'lymphoma',
                     'yfv', 'xtro1m', 'xtro2')

Expression pattern of ERCC spike-ins

# par(mfrow=c(1,7))
for (prefix in DATASET_PREFIXES) {
    REFS.idx = get0(paste0(prefix,'.refs.idx'), envir = data.container)
    x.normResults = get0(paste0(prefix,'.normResults'), envir = env.norms)
    if (is.null(REFS.idx) | is.null(x.normResults)) next
    png(filename = sprintf("%s_matrices.png",prefix),width =1400,height = 400)
    par(mar=c(1,1,1,1))
    layout(matrix(c(rep(1,8), c(2:9)), nrow=2, byrow = TRUE),
           heights = c(1,8),
           widths = rep(1,8)
           )
    plot.new()
    text(0.5, 0.5, prefix)
    for (processing in names(x.normResults)) {
        plot.matrix(log2(x.normResults[[processing]][REFS.idx,]+1), asp = 5/3, main = processing)
    }
    dev.off()
}

Expression pattern of ERCC spike-ins and random set of internal genes

# par(mfrow=c(1,7))
for (prefix in DATASET_PREFIXES) {
    REFS.idx = get0(paste0(prefix,'.refs.idx'), envir = data.container)
    x.normResults = get0(paste0(prefix,'.normResults'), envir = env.norms)
    rand.idx = sample(setdiff(1:nrow(x.normResults[[1]]), REFS.idx), size = length(REFS.idx))
    if (is.null(REFS.idx) | is.null(x.normResults)) next
    par(mar=c(1,1,1,1))
    layout(matrix(c(rep(1,8), c(2:9)), nrow=2, byrow = TRUE),
           heights = c(1,8),
           widths = rep(1,8)
           )
    # dev.off()
    plot.new()
    text(0.5, 0.5, prefix)
    for (processing in names(x.normResults)) {
        plot.matrix(log2(x.normResults[[processing]][c(rand.idx,REFS.idx),]+1), asp = 5/3, main = processing)
    }
}

PCA

sample_grouping = list(
    'rbm1' = 'Organ',
    'rbm2' = 'Organ',
    'rnor1' = 'Chemical',
    'rnor2' = 'Chemical',
    'mpreg1a' = 'BrainRegion',
    'mpreg1b' = 'Stage',
    'mpreg2a' = 'BrainRegion',
    'mpreg2b' = 'Stage',
    'dmelAC' = c('Environment', 'FlowcellId'),
    'dmelAV' = 'Environment',
    'sarcoma' = c('Tissue', 'Sex', 'Metastasis'),
    'lymphoma' = c('CellLine', 'Treatment'),
    'yfv' = 'Infection'
    # 'xtro1m' = 'Time.hpf.', # Time-series data need to be visualized differently
    # 'xtro2' = 'Time.hpf.',

)
for (data_prefix in names(sample_grouping)) {
    selectedPheno = sample_grouping[[data_prefix]]
    x.pheno = get(paste0(data_prefix, '.pheno'), envir = data.container)
    x.cnt = get(paste0(data_prefix, '.cnt'), envir = data.container)
    x.normed  = get(paste0(data_prefix, '.normed'), envir = data.container)
    x.normResults = get(paste0(data_prefix, '.normResults'), envir = env.norms)
    x.refs.idx = get(paste0(data_prefix, '.refs.idx'), envir = data.container)
    x.pca.df = list(
        project.pca(t(x.cnt), selected.pc = c(1,2)) %>%
             cbind(x.pheno[,selectedPheno, drop=FALSE], 'Processing' = 'raw', 'Subset' = 'all'),
        project.pca(t(x.normed), selected.pc = c(1,2)) %>%
             cbind(x.pheno[,selectedPheno, drop=FALSE], 'Processing' = 'normalized', 'Subset' = 'all'),
        project.pca(t(x.normResults[['PoissonSeq']]), selected.pc = c(1,2)) %>%
             cbind(x.pheno[,selectedPheno, drop=FALSE], 'Processing' = 'PoissonSeq', 'Subset' = 'all'),
        project.pca(t(x.cnt[x.refs.idx,]), selected.pc = c(1,2)) %>%
             cbind(x.pheno[,selectedPheno, drop=FALSE], 'Processing' = 'raw', 'Subset' = 'ERCC spike-ins'),
        project.pca(t(x.normed[x.refs.idx,]), selected.pc = c(1,2)) %>%
             cbind(x.pheno[,selectedPheno, drop=FALSE], 'Processing' = 'normalized', 'Subset' = 'ERCC spike-ins'),
        project.pca(t(x.normResults[['PoissonSeq']][x.refs.idx,]), selected.pc = c(1,2)) %>%
             cbind(x.pheno[,selectedPheno, drop=FALSE], 'Processing' = 'PoissonSeq', 'Subset' = 'ERCC spike-ins')
    ) %>%
        do.call(rbind, .)

for (field in selectedPheno) {
        p = ggplot(x.pca.df) +
            facet_grid(Subset ~ factor(Processing,levels=c('raw', 'normalized')) ) +
            geom_point(aes_string(x='PC1',y='PC2',color=field)) +
            ggtitle(data_prefix)
        print(p)
        ggsave(filename = sprintf('pca-raw_vs_norm-%s-%s.png', data_prefix, field), plot = p)
    }
}

Summarized results

breakList_cdev = seq(1,50,0.2)
reds = colorRampPalette(c('white', 'red'))(n=length(breakList_cdev))
for (prefix in DATASET_PREFIXES) {
    x.cdevMatrix = get(paste0(prefix, '.cdevMatrix'), envir = env.norms)
    pheatmap::pheatmap(x.cdevMatrix,
                       cellwidth = 30, cellheight = 30, display_numbers = TRUE,
                       main = prefix,cluster_rows = FALSE, cluster_cols = FALSE, color = reds, breaks = breakList_cdev,
                       filename = paste0(prefix, '_cdevMatrix.png'))
}

time_points = unique(xtro1m.pheno$Time.hpf.)
xtro1m.cnt.pca = project.pca(t(xtro1m.cnt), selected.pc = c(1,2,3)) %>%
    cbind(xtro1m.pheno[,c('Time.hpf.'), drop=FALSE])
xtro1m.normed.pca = project.pca(t(xtro1m.normed), selected.pc = c(1,2,3)) %>%
    cbind(xtro1m.pheno[,c('Time.hpf.'), drop=FALSE])
ggplot(xtro1m.cnt.pca) +
    geom_point(aes(x=PC1,y=PC2,color=Time.hpf.),size=6, alpha=0.4) +
    scale_color_gradientn(colours = rainbow(length(time_points))) +
    geom_text(aes(x=PC1,y=PC2,label=Time.hpf.),size=3) +
    theme(legend.position = "none")

ggplot(xtro1m.normed.pca) +
    geom_point(aes(x=PC1,y=PC2,color=Time.hpf.),size=6, alpha=0.4) +
    scale_color_gradientn(colours = rainbow(length(time_points))) +
    geom_text(aes(x=PC1,y=PC2,label=Time.hpf.),size=3) +
    theme(legend.position = "none")

ttdtrang/cdev-paper documentation built on Dec. 23, 2021, 1:01 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

ttdtrang/cdev-paper
cdev: a ground-truth based measure to evaluate RNA-seq normalization methods

In ttdtrang/cdev-paper: cdev: a ground-truth based measure to evaluate RNA-seq normalization methods

Comparison on simulation

Expression matrices

cdev matrix

cdev by DE percentage

Fold-change MSE by DE percentage

DE performance

cdev vs AUC

Equivalent use of fold-change MSE

Fold-change MSE of various methods

Fold-change MSE vs AUC

Comparison on real data

Expression matrices

PCA

Summarized results

R Package Documentation

Browse R Packages

We want your feedback!

ttdtrang/cdev-paper cdev: a ground-truth based measure to evaluate RNA-seq normalization methods

In ttdtrang/cdev-paper: cdev: a ground-truth based measure to evaluate RNA-seq normalization methods

Comparison on simulation

Expression matrices

cdev matrix

cdev by DE percentage

Fold-change MSE by DE percentage

DE performance

cdev vs AUC

Equivalent use of fold-change MSE

Fold-change MSE of various methods

Fold-change MSE vs AUC

Comparison on real data

Expression matrices

PCA

Summarized results

R Package Documentation

Browse R Packages

We want your feedback!

ttdtrang/cdev-paper
cdev: a ground-truth based measure to evaluate RNA-seq normalization methods