In ttdtrang/cdev-paper: cdev: a ground-truth based measure to evaluate RNA-seq normalization methods

How does the value of cdev depend on different characteristics of the data set, such as the level of inter-group variation, the number of genes, the percentage of DE genes? We'll use simulation to explore those relationships.

library(magrittr)
library(ggplot2)
library(parallel)
devtools::load_all('../')

logFC = function(x, groups, log.base = 2, pseudocount = 0) {
    G = unique(groups)
    log(apply(x[,groups == G[2]], MARGIN = 1, mean) + pseudocount, base = log.base) - 
        log(apply(x[,groups == G[1]], MARGIN = 1, mean) + pseudocount, base = log.base)
}
THEME_BASE_SIZE = 13

Homogeneous fold-change

Varying number of genes

NSAMPLES = 40
nGenes = c(2000,10000, 20000, 50000)
percentDE = 0.5 
foldChange = 4
sims_g <- lapply(nGenes, function(nGene) {
    simulate.counts(nGene, NSAMPLES/2,NSAMPLES/2,fc = foldChange, percentDE = percentDE, completelyAsymm = TRUE)
}) %>%
    set_names(as.character(nGenes))

sims_g.ref = lapply(nGenes, function(nGene) {
    sim = sims_g[[as.character(nGene)]]
    normalize.by.refs(sim$counts, which(sim$differential == 0))
}) %>%
    set_names(as.character(nGenes))

n_data_points = 30

sims_g.rand.norms =
    lapply(nGenes, function(nGene) {
        sim = sims_g[[as.character(nGene)]]
        sim.norm.ref = sims_g.ref[[as.character(nGene)]]
        trueRef.idx = which(sim$differential == 0)
        trueDif.idx = which(sim$differential == 1)
        NSCALE = min(length(trueDif.idx), length(trueRef.idx))
        mclapply(seq(1,NSCALE, floor(NSCALE/n_data_points)), mc.cores = 1, FUN = function(n_diff) {
            lapply(1:10, function(i) {
                rand.dif = sample(trueDif.idx, size = n_diff)
                rand.ref = sample(trueRef.idx, size = NSCALE - n_diff)
                rand.subset = union(rand.ref, rand.dif)
                raw = colSums(sim$counts[rand.subset,,drop= FALSE])
                sim.norm.i = normalize.by.refs(sim$counts, rand.subset)
                scalingFactors = raw / geom.mean(raw)
                mse_ref = mean((logFC(sim.norm.i[trueRef.idx,], sim$conditions$condition, log.base = 2, pseudocount = 1))^2)
                return(list('scalingFactors' = scalingFactors,
                            'nGene' = nGene,
                            'rand.subset' = rand.subset,
                            'diffFraction' = n_diff/NSCALE,
                            'n_diffs_in_scale' = n_diff,
                            'diffVarTotal' = sum(var(sim$counts[rand.dif,])), # The total variance of the DE genes in the random set selected for scaling
                            'cdev' =  cdev(sim.norm.i, sim.norm.ref),
                            'mse' = mse_ref))
            }) 
        }) %>%
        unlist(recursive = FALSE)
    }) %>%
    unlist(recursive = FALSE)

DE tests

g_de.results = lapply(sims_g.rand.norms, function(sim.norm) {

    sim = sims_g[[as.character(sim.norm$nGene)]]
    de.test.limma(sim$counts,
            group = sim$conditions$condition,
            norm.factors = sim.norm$scalingFactors) %>%
        limma::topTable(n = Inf, sort.by = 'none', adjust.method = 'BH')
})

performance.df = lapply(1:length(sims_g.rand.norms), function(i) {

    sim.norm = sims_g.rand.norms[[i]]
    sim = sims_g[[as.character(sim.norm$nGene)]]
    de = g_de.results[[i]]
    deg.idx = which(de$adj.P.Val < 0.01)
    auc = auroc(1-de$adj.P.Val, !!sim$differential)
    TP = length(intersect(which(sim$differential == 1), deg.idx))
    FP = length(setdiff(deg.idx, which(sim$differential == 1)))
    idx.diff = sim.norm$rand.subset[which(sim$differential[sim.norm$rand.subset] == 1)]

    data.frame('cdev' = sim.norm$cdev,
               'mse' = sim.norm$mse,
               'nGene' = sim.norm$nGene,
               'diffFraction' = sim.norm$diffFraction,
               'n_diffs_in_scale' = sim.norm$n_diffs_in_scale,
               'diffVarTotal' = sim.norm$diffVarTotal,
               'AUC' = auc,
               'TPR' = TP/length(which(sim$differential == 1)),
               'FDR' = FP / length(deg.idx),
               'FPR' = FP / length(which(sim$differential == 0))
               )
}) %>%
    do.call(rbind, .)

ggplot(performance.df) +
    facet_grid(. ~ nGene) +
    geom_point(aes(x=AUC,y=cdev, color=nGene, group=nGene), alpha = 0.3)
ggplot(performance.df) +
    facet_grid(. ~ nGene) +
    geom_point(aes(x=cdev,y=AUC, color=nGene, group=nGene), alpha = 0.3)
# ggplot(performance.df) +
#     geom_point(aes(x=diffVarTotal,y=cdev, color=(nGene), group=nGene), alpha = 0.3)

Relating to normalization quality

cdev vs reference set quality

pl_cdev_vs_df.ngene = ggplot(performance.df) +
    geom_point(aes(x=diffFraction,y=cdev, color=nGene, group=nGene), alpha = 0.3) +
    theme_bw(base_size = THEME_BASE_SIZE)
print(pl_cdev_vs_df.ngene)

cdev vs DE performance

pl_cdev_vs_auc.ngene = ggplot(performance.df) +
    geom_point(aes(x=cdev,y=AUC, color=nGene, group=nGene), alpha = 0.3) +
    theme_bw(base_size = THEME_BASE_SIZE)
print(pl_cdev_vs_auc.ngene)

MSE vs reference set quality

pl_mse_vs_df.ngene = ggplot(performance.df) +
    geom_point(aes(x=diffFraction,y=mse, color=nGene, group=nGene), alpha = 0.3) +
    theme_bw(base_size = THEME_BASE_SIZE)
print(pl_mse_vs_df.ngene)

MSE vs DE performance

pl_mse_vs_auc.ngene = ggplot(performance.df) +
    geom_point(aes(x=mse,y=AUC, color=nGene, group=nGene), alpha = 0.3) +
    theme_bw(base_size = THEME_BASE_SIZE)
print(pl_mse_vs_auc.ngene)

Varying fold-change

NSAMPLES = 40
nGene = 20000
percentDE = 0.5 
foldChanges= c(1.5, 2, 3, 4, 5, 6, 7, 8) 
sims_f <- lapply(foldChanges, function(foldChange) {
    simulate.counts(nGene, NSAMPLES/2,NSAMPLES/2,fc = foldChange, percentDE = percentDE, completelyAsymm = TRUE)
}) %>%
    set_names(as.character(foldChanges))

sims_f.ref = lapply(foldChanges, function(foldChange) {
    sim = sims_f[[as.character(foldChange)]]
    normalize.by.refs(sim$counts, which(sim$differential == 0))
}) %>%
    set_names(as.character(foldChanges))

sims_f.rand.norms =
    lapply(foldChanges, function(foldChange) {
        sim = sims_f[[as.character(foldChange)]]
        sim.norm.ref = sims_f.ref[[as.character(foldChange)]]
        trueRef.idx = which(sim$differential == 0)
        trueDif.idx = which(sim$differential == 1)
        NSCALE = min(length(trueDif.idx), length(trueRef.idx))
        mclapply(seq(1,NSCALE, floor(NSCALE/n_data_points)), mc.cores = 1, FUN = function(n_diff) {
            lapply(1:10, function(i) {
                rand.dif = sample(trueDif.idx, size = n_diff)
                rand.ref = sample(trueRef.idx, size = NSCALE - n_diff)
                rand.subset = union(rand.ref, rand.dif)
                raw = colSums(sim$counts[rand.subset,,drop= FALSE])
                sim.norm.i = normalize.by.refs(sim$counts, rand.subset)
                scalingFactors = raw / geom.mean(raw)

                mseRefs = mean((logFC(sim.norm.i[trueRef.idx,], sim$conditions$condition, log.base = 2, pseudocount = 1))^2) 
                return(list('scalingFactors' = scalingFactors,
                            'foldChange' = foldChange,
                            'rand.subset' = rand.subset,
                            'diffFraction' = n_diff/NSCALE,
                            'n_diffs_in_scale' = n_diff,
                            'diffVarTotal' = sum(var(sim$counts[rand.dif,])), # The total variance of the DE genes in the random set selected for scaling
                            'cdev' =  cdev(sim.norm.i, sim.norm.ref),
                            'mseRefs' = mseRefs))
            }) 
        }) %>%
        unlist(recursive = FALSE)
    }) %>%
    unlist(recursive = FALSE)

DE tests

f_de.results = lapply(sims_f.rand.norms, function(sim.norm) {

    sim = sims_f[[as.character(sim.norm$foldChange)]]
    de.test.limma(sim$counts,
            group = sim$conditions$condition,
            norm.factors = sim.norm$scalingFactors) %>%
        limma::topTable(n = Inf, sort.by = 'none', adjust.method = 'BH')
})

Relating to normalization quality

performance.df = lapply(1:length(sims_f.rand.norms), function(i) {

    sim.norm = sims_f.rand.norms[[i]]
    sim = sims_f[[as.character(sim.norm$foldChange)]]
    de = f_de.results[[i]]
    deg.idx = which(de$adj.P.Val < 0.01)

    auc = auroc(1-de$adj.P.Val, !!sim$differential)
    TP = length(intersect(which(sim$differential == 1), deg.idx))
    FP = length(setdiff(deg.idx, which(sim$differential == 1)))
    idx.diff = sim.norm$rand.subset[which(sim$differential[sim.norm$rand.subset] == 1)]

    data.frame('cdev' = sim.norm$cdev,
               'mseRefs' = sim.norm$mseRefs,
               'foldChange' = sim.norm$foldChange,
               'diffFraction' = sim.norm$diffFraction,
               'n_diffs_in_scale' = sim.norm$n_diffs_in_scale,
               'diffVarTotal' = sim.norm$diffVarTotal,
               'AUC' = auc,
               'TPR' = TP/length(which(sim$differential == 1)),
               'FDR' = FP / length(deg.idx),
               'FPR' = FP / length(which(sim$differential == 0))
               )
}) %>%
    do.call(rbind, .)
pl_cdev_vs_df.foldchange = ggplot(performance.df) +
    geom_point(aes(x=diffFraction,y=cdev, color=foldChange, group=foldChange), alpha = 0.5) +
    # guides(color=guide_legend(title = 'Fold-change')) +
    theme_bw(base_size = THEME_BASE_SIZE)
pl_df_vs_auc.foldchange = ggplot(performance.df) +
    geom_point(aes(x=diffFraction,y=AUC, color=foldChange, group=foldChange), alpha = 0.5) +
    # guides(color=guide_legend(title = 'Fold-change')) +
    theme_bw(base_size = THEME_BASE_SIZE)
pl_cdev_vs_auc.foldchange = ggplot(performance.df) +
    geom_point(aes(x=cdev,y=AUC, color=foldChange, group=foldChange), alpha = 0.5) +
    # guides(color=guide_legend(title = 'Fold-change')) +
    theme_bw(base_size = THEME_BASE_SIZE)

print(pl_cdev_vs_df.foldchange)
print(pl_cdev_vs_auc.foldchange)
print(pl_df_vs_auc.foldchange)

pl_mse_vs_df.foldchange = ggplot(performance.df) +
    geom_point(aes(x=diffFraction,y=mseRefs, color=foldChange, group=foldChange), alpha = 0.5) +
    # guides(color=guide_legend(title = 'Fold-change')) +
    theme_bw(base_size = THEME_BASE_SIZE)
pl_mse_vs_auc.foldchange = ggplot(performance.df) +
    geom_point(aes(x=mseRefs,y=AUC, color=foldChange, group=foldChange), alpha = 0.5) +
    # guides(color=guide_legend(title = 'Fold-change')) +
    theme_bw(base_size = THEME_BASE_SIZE)

print(pl_mse_vs_df.foldchange)
print(pl_mse_vs_auc.foldchange)

Varying DE percentage

NSAMPLES = 40
nGene = 10000 
percentDEs = c(1:9/10) 
foldChange = 4
sims_p <- lapply(percentDEs, function(percentDE) {
    simulate.counts(nGene, NSAMPLES/2,NSAMPLES/2,fc = foldChange, percentDE = percentDE, completelyAsymm = TRUE)
}) %>%
    set_names(as.character(percentDEs))

sims_p.ref = lapply(percentDEs, function(percentDE) {
    sim = sims_p[[as.character(percentDE)]]
    normalize.by.refs(sim$counts, which(sim$differential == 0))
}) %>%
    set_names(as.character(percentDEs))

Normalizing by a random set of equivalent size to the true reference set

# diffFractions = c(0:50/50)

sims_p.rand.norms =
    lapply(percentDEs, function(percentDE) {
        sim = sims_p[[as.character(percentDE)]]
        sim.norm.ref = sims_p.ref[[as.character(percentDE)]]
        trueRef.idx = which(sim$differential == 0)
        trueDif.idx = which(sim$differential == 1)

        NSCALE = min(length(trueDif.idx), length(trueRef.idx))
        mclapply(seq(1,NSCALE, floor(NSCALE/n_data_points)), mc.cores = 1, FUN = function(n_diff) {
            lapply(1:10, function(i) {
                rand.dif = sample(trueDif.idx, size = n_diff)
                rand.ref = sample(trueRef.idx, size = NSCALE - n_diff)

                rand.subset = union(rand.ref, rand.dif)
                raw = colSums(sim$counts[rand.subset,,drop= FALSE])
                sim.norm.i = normalize.by.refs(sim$counts, rand.subset)
                scalingFactors = raw / geom.mean(raw)
                mseRefs = mean((logFC(sim.norm.i[trueRef.idx,], sim$conditions$condition, log.base = 2, pseudocount = 1))^2) 
                return(list('scalingFactors' = scalingFactors,
                            'percentDE' = percentDE,
                            'rand.subset' = rand.subset,
                            'n_diffs_in_scale' = n_diff,
                            'diffFraction' = n_diff/NSCALE,
                            'diffVarTotal' = sum(var(sim$counts[rand.dif,])), # The total variance of the DE genes in the random set selected for scaling
                            'cdev' =  cdev(sim.norm.i, sim.norm.ref),
                            'mseRefs' = mseRefs))
            }) 
        }) %>%
        unlist(recursive = FALSE)
    }) %>%
    unlist(recursive = FALSE)

DE tests

p_de.results = lapply(sims_p.rand.norms, function(sim.norm) {

    sim = sims_p[[as.character(sim.norm$percentDE)]]
    de.test.limma(sim$counts,
            group = sim$conditions$condition,
            norm.factors = sim.norm$scalingFactors) %>%
        limma::topTable(n = Inf, sort.by = 'none', adjust.method = 'BH')
})

Relating to normalization quality

performance.df = lapply(1:length(sims_p.rand.norms), function(i) {

    sim.norm = sims_p.rand.norms[[i]]
    sim = sims_p[[as.character(sim.norm$percentDE)]]
    de = p_de.results[[i]]
    deg.idx = which(de$adj.P.Val < 0.01)

    auc = auroc(1-de$adj.P.Val, !!sim$differential)
    TP = length(intersect(which(sim$differential == 1), deg.idx))
    FP = length(setdiff(deg.idx, which(sim$differential == 1)))
    idx.diff = sim.norm$rand.subset[which(sim$differential[sim.norm$rand.subset] == 1)]

    data.frame('cdev' = sim.norm$cdev,
               'mseRefs' = sim.norm$mseRefs,
               'percentDE' = sim.norm$percentDE,
               'diffFraction' = sim.norm$diffFraction,
               'n_diffs_in_scale' = sim.norm$n_diffs_in_scale,
               'diffVarTotal' = sim.norm$diffVarTotal,
               'AUC' = auc,
               'TPR' = TP/length(which(sim$differential == 1)),
               'FDR' = FP / length(deg.idx),
               'FPR' = FP / length(which(sim$differential == 0))
               )
}) %>%
    do.call(rbind, .)
pl_cdev_vs_df.percentDE = ggplot(performance.df) +
    geom_point(aes(x=diffFraction,y=cdev, color=percentDE, group=percentDE), alpha = 0.3) +
    theme_bw(base_size = THEME_BASE_SIZE)

pl_cdev_vs_auc.percentDE = ggplot(performance.df) +
    geom_point(aes(x=cdev,y=AUC, color=percentDE, group=percentDE), alpha = 0.3) +
    theme_bw(base_size = THEME_BASE_SIZE)
print(pl_cdev_vs_df.percentDE)
print(pl_cdev_vs_auc.percentDE)

pl_mse_vs_df.percentDE = ggplot(performance.df) +
    geom_point(aes(x=diffFraction,y=mseRefs, color=percentDE, group=percentDE), alpha = 0.3) +
    theme_bw(base_size = THEME_BASE_SIZE)

pl_mse_vs_auc.percentDE = ggplot(performance.df) +
    geom_point(aes(x=mseRefs,y=AUC, color=percentDE, group=percentDE), alpha = 0.3) +
    theme_bw(base_size = THEME_BASE_SIZE)
print(pl_mse_vs_df.percentDE)
print(pl_mse_vs_auc.percentDE)

ggplot(performance.df) +
    facet_grid(percentDE ~ .) +
    geom_point(aes(x=cdev,y=AUC, color=percentDE, group=percentDE), alpha = 0.3)

Normalizing by a random set of any size

# diffFractions = c(0:50/50)

sims_p.rand.norms =
    lapply(percentDEs, function(percentDE) {
        sim = sims_p[[as.character(percentDE)]]
        sim.norm.ref = sims_p.ref[[as.character(percentDE)]]
        trueRef.idx = which(sim$differential == 0)
        trueDif.idx = which(sim$differential == 1)

        NSCALE = min(length(trueDif.idx), length(trueRef.idx))
        mclapply(seq(1,NSCALE, floor(NSCALE/n_data_points)), mc.cores = 1, FUN = function(n_diff) {
            lapply(1:10, function(i) {
                rand.dif = sample(trueDif.idx, size = n_diff)
                rand.ref = sample(trueRef.idx, size = NSCALE - n_diff)

                rand.subset = union(rand.ref, rand.dif)
                raw = colSums(sim$counts[rand.subset,,drop= FALSE])
                sim.norm.i = normalize.by.refs(sim$counts, rand.subset)
                scalingFactors = raw / geom.mean(raw)

                return(list('scalingFactors' = scalingFactors,
                            'percentDE' = percentDE,
                            'rand.subset' = rand.subset,
                            'n_diffs_in_scale' = n_diff,
                            'diffFraction' = n_diff/NSCALE,
                            'diffVarTotal' = sum(var(sim$counts[rand.dif,])), # The total variance of the DE genes in the random set selected for scaling
                            'cdev' =  cdev(sim.norm.i, sim.norm.ref)))
            }) 
        }) %>%
        unlist(recursive = FALSE)
    }) %>%
    unlist(recursive = FALSE)

performance.df = lapply(1:length(sims_p.rand.norms), function(i) {

    sim.norm = sims_p.rand.norms[[i]]
    sim = sims_p[[as.character(sim.norm$percentDE)]]

    data.frame('cdev' = sim.norm$cdev,
               'percentDE' = sim.norm$percentDE,
               'diffFraction' = sim.norm$diffFraction,
               'n_diffs_in_scale' = sim.norm$n_diffs_in_scale,
               'diffVarTotal' = sim.norm$diffVarTotal
               )
}) %>%
    do.call(rbind, .)

ggplot(performance.df) +
    facet_grid(percentDE ~ .) +
    geom_point(aes(x=n_diffs_in_scale,y=cdev, color=percentDE, group=percentDE), alpha = 0.3)

ggplot(performance.df) +
    geom_point(aes(x=n_diffs_in_scale,y=cdev, color=percentDE, group=percentDE), alpha = 0.3)
ggplot(performance.df) +
    geom_point(aes(x=diffFraction,y=cdev, color=percentDE, group=percentDE), alpha = 0.3)
# ggplot(performance.df) +
#     geom_point(aes(x=diffVarTotal,y=cdev, color=(nGene), group=nGene), alpha = 0.3)

Heterogeneous fold-changes

Varying number of genes

NSAMPLES = 40
nGenes = c(2000,10000, 20000, 50000)
percentDE = 0.5 
sims_rg <- lapply(nGenes, function(nGene) {
    simulate.counts.varyingfc(ngenes = nGene, nlibs1 = NSAMPLES %/% 2, nlibs2= NSAMPLES - (NSAMPLES %/% 2), percentDE = percentDE)
}) %>%
    set_names(as.character(nGenes))

sims_rg.ref = lapply(nGenes, function(nGene) {
    sim = sims_rg[[as.character(nGene)]]
    normalize.by.refs(sim$counts, which(sim$differential == 0))
}) %>%
    set_names(as.character(nGenes))

sims_rg.rand.norms =
    lapply(nGenes, function(nGene) {
        sim = sims_rg[[as.character(nGene)]]
        sim.norm.ref = sims_rg.ref[[as.character(nGene)]]
        trueRef.idx = which(sim$differential == 0)
        trueDif.idx = which(sim$differential == 1)
        NSCALE = min(length(trueDif.idx), length(trueRef.idx))
        mclapply(seq(1,NSCALE, floor(NSCALE/n_data_points)), mc.cores = 1, FUN = function(n_diff) {
            lapply(1:10, function(i) {
                rand.dif = sample(trueDif.idx, size = n_diff)
                rand.ref = sample(trueRef.idx, size = NSCALE - n_diff)
                rand.subset = union(rand.ref, rand.dif)
                raw = colSums(sim$counts[rand.subset,,drop= FALSE])
                sim.norm.i = normalize.by.refs(sim$counts, rand.subset)
                scalingFactors = raw / geom.mean(raw)
                mseRefs = mean((logFC(sim.norm.i[trueRef.idx,], sim$conditions$condition, log.base = 2, pseudocount = 1))^2) 
                return(list('scalingFactors' = scalingFactors,
                            'nGene' = nGene,
                            'rand.subset' = rand.subset,
                            'diffFraction' = n_diff/NSCALE,
                            'n_diffs_in_scale' = n_diff,
                            'diffVarTotal' = sum(var(sim$counts[rand.dif,])), # The total variance of the DE genes in the random set selected for scaling
                            'cdev' =  cdev(sim.norm.i, sim.norm.ref),
                            'mseRefs' = mseRefs))
            }) 
        }) %>%
        unlist(recursive = FALSE)
    }) %>%
    unlist(recursive = FALSE)

DE tests

rg_de.results = lapply(sims_rg.rand.norms, function(sim.norm) {

    sim = sims_rg[[as.character(sim.norm$nGene)]]
    de.test.limma(sim$counts,
            group = sim$conditions$condition,
            norm.factors = sim.norm$scalingFactors) %>%
        limma::topTable(n = Inf, sort.by = 'none', adjust.method = 'BH')
})

Relating to normalization quality

performance.df = lapply(1:length(sims_rg.rand.norms), function(i) {

    sim.norm = sims_rg.rand.norms[[i]]
    sim = sims_rg[[as.character(sim.norm$nGene)]]
    de = rg_de.results[[i]]
    deg.idx = which(de$adj.P.Val < 0.01)
    auc = auroc(1-de$adj.P.Val, !!sim$differential)
    TP = length(intersect(which(sim$differential == 1), deg.idx))
    FP = length(setdiff(deg.idx, which(sim$differential == 1)))
    idx.diff = sim.norm$rand.subset[which(sim$differential[sim.norm$rand.subset] == 1)]

    data.frame('cdev' = sim.norm$cdev,
               'mseRefs' = sim.norm$mseRefs,
               'nGene' = sim.norm$nGene,
               'diffFraction' = sim.norm$diffFraction,
               'n_diffs_in_scale' = sim.norm$n_diffs_in_scale,
               'diffVarTotal' = sim.norm$diffVarTotal,
               'AUC' = auc,
               'TPR' = TP/length(which(sim$differential == 1)),
               'FDR' = FP / length(deg.idx),
               'FPR' = FP / length(which(sim$differential == 0))
               )
}) %>%
    do.call(rbind, .)

ggplot(performance.df) +
    facet_grid(. ~ nGene) +
    geom_point(aes(x=AUC,y=cdev, color=nGene, group=nGene), alpha = 0.3) +
    theme_bw(base_size = THEME_BASE_SIZE)
ggplot(performance.df) +
    facet_grid(. ~ nGene) +
    geom_point(aes(x=cdev,y=AUC, color=nGene, group=nGene), alpha = 0.3) +
    theme_bw(base_size = THEME_BASE_SIZE)

p2_cdev_vs_df.ngene = ggplot(performance.df) +
    geom_point(aes(x=diffFraction,y=cdev, color=nGene, group=nGene), alpha = 0.3) +
    theme_bw(base_size = THEME_BASE_SIZE)
print(p2_cdev_vs_df.ngene)

p2_cdev_vs_auc.ngene = ggplot(performance.df) +
    geom_point(aes(x=cdev,y=AUC, color=nGene, group=nGene), alpha = 0.3) +
    theme_bw(base_size = THEME_BASE_SIZE) +
    ylim(c(0,1))
print(p2_cdev_vs_auc.ngene)

p2_mse_vs_df.ngene = ggplot(performance.df) +
    geom_point(aes(x=diffFraction,y=mseRefs, color=nGene, group=nGene), alpha = 0.3) +
    theme_bw(base_size = THEME_BASE_SIZE)
print(p2_mse_vs_df.ngene)

p2_mse_vs_auc.ngene = ggplot(performance.df) +
    geom_point(aes(x=mseRefs,y=AUC, color=nGene, group=nGene), alpha = 0.3) +
    theme_bw(base_size = THEME_BASE_SIZE) +
    ylim(c(0,1))
print(p2_mse_vs_auc.ngene)

Varying DE percentage

NSAMPLES = 40
nGene = 10000 
percentDEs = c(1:9/10) 
sims_rp <- lapply(percentDEs, function(percentDE) {
    simulate.counts.varyingfc(ngenes = nGene, nlibs1 = NSAMPLES %/% 2, nlibs2= NSAMPLES - (NSAMPLES %/% 2), percentDE = percentDE)
}) %>%
    set_names(as.character(percentDEs))

sims_rp.ref = lapply(percentDEs, function(percentDE) {
    sim = sims_rp[[as.character(percentDE)]]
    normalize.by.refs(sim$counts, which(sim$differential == 0))
}) %>%
    set_names(as.character(percentDEs))

Normalizing by a random set of equivalent size to the true reference set

# diffFractions = c(0:50/50)

sims_rp.rand.norms =
    lapply(percentDEs, function(percentDE) {
        sim = sims_rp[[as.character(percentDE)]]
        sim.norm.ref = sims_rp.ref[[as.character(percentDE)]]
        trueRef.idx = which(sim$differential == 0)
        trueDif.idx = which(sim$differential == 1)

        NSCALE = min(length(trueDif.idx), length(trueRef.idx))
        mclapply(seq(1,NSCALE, floor(NSCALE/n_data_points)), mc.cores = 1, FUN = function(n_diff) {
            lapply(1:10, function(i) {
                rand.dif = sample(trueDif.idx, size = n_diff)
                rand.ref = sample(trueRef.idx, size = NSCALE - n_diff)

                rand.subset = union(rand.ref, rand.dif)
                raw = colSums(sim$counts[rand.subset,,drop= FALSE])
                sim.norm.i = normalize.by.refs(sim$counts, rand.subset)
                scalingFactors = raw / geom.mean(raw)
                mseRefs = mean((logFC(sim.norm.i[trueRef.idx,], sim$conditions$condition, log.base = 2, pseudocount = 1))^2) 
                return(list('scalingFactors' = scalingFactors,
                            'percentDE' = percentDE,
                            'rand.subset' = rand.subset,
                            'n_diffs_in_scale' = n_diff,
                            'diffFraction' = n_diff/NSCALE,
                            'diffVarTotal' = sum(var(sim$counts[rand.dif,])), # The total variance of the DE genes in the random set selected for scaling
                            'cdev' =  cdev(sim.norm.i, sim.norm.ref),
                            'mseRefs' = mseRefs))
            }) 
        }) %>%
        unlist(recursive = FALSE)
    }) %>%
    unlist(recursive = FALSE)

DE tests

rp_de.results = lapply(sims_rp.rand.norms, function(sim.norm) {

    sim = sims_rp[[as.character(sim.norm$percentDE)]]
    de.test.limma(sim$counts,
            group = sim$conditions$condition,
            norm.factors = sim.norm$scalingFactors) %>%
        limma::topTable(n = Inf, sort.by = 'none', adjust.method = 'BH')
})

Relating to normalization quality

performance.df = lapply(1:length(sims_rp.rand.norms), function(i) {

    sim.norm = sims_rp.rand.norms[[i]]
    sim = sims_rp[[as.character(sim.norm$percentDE)]]
    de = rp_de.results[[i]]
    deg.idx = which(de$adj.P.Val < 0.01)

    auc = auroc(1-de$adj.P.Val, !!sim$differential)
    TP = length(intersect(which(sim$differential == 1), deg.idx))
    FP = length(setdiff(deg.idx, which(sim$differential == 1)))
    idx.diff = sim.norm$rand.subset[which(sim$differential[sim.norm$rand.subset] == 1)]

    data.frame('cdev' = sim.norm$cdev,
               'mseRefs' = sim.norm$mseRefs,
               'percentDE' = sim.norm$percentDE,
               'diffFraction' = sim.norm$diffFraction,
               'n_diffs_in_scale' = sim.norm$n_diffs_in_scale,
               'diffVarTotal' = sim.norm$diffVarTotal,
               'AUC' = auc,
               'TPR' = TP/length(which(sim$differential == 1)),
               'FDR' = FP / length(deg.idx),
               'FPR' = FP / length(which(sim$differential == 0))
               )
}) %>%
    do.call(rbind, .)
p2_cdev_vs_df.percentDE = ggplot(performance.df) +
    geom_point(aes(x=diffFraction,y=cdev, color=percentDE, group=percentDE), alpha = 0.3) +
    theme_bw(base_size = THEME_BASE_SIZE)
# ggplot(performance.df) +
#     geom_point(aes(x=diffFraction,y=AUC, color=percentDE, group=percentDE), alpha = 0.3) +
#     theme_bw(base_size = THEME_BASE_SIZE)

p2_cdev_vs_auc.percentDE = ggplot(performance.df) +
    geom_point(aes(x=cdev,y=AUC, color=percentDE, group=percentDE), alpha = 0.3) +
    theme_bw(base_size = THEME_BASE_SIZE) +
    ylim(c(0,1))

p2_mse_vs_df.percentDE = ggplot(performance.df) +
    geom_point(aes(x=diffFraction,y=mseRefs, color=percentDE, group=percentDE), alpha = 0.3) +
    theme_bw(base_size = THEME_BASE_SIZE)
# ggplot(performance.df) +
#     geom_point(aes(x=diffFraction,y=AUC, color=percentDE, group=percentDE), alpha = 0.3) +
#     theme_bw(base_size = THEME_BASE_SIZE)

p2_mse_vs_auc.percentDE = ggplot(performance.df) +
    geom_point(aes(x=mseRefs,y=AUC, color=percentDE, group=percentDE), alpha = 0.3) +
    theme_bw(base_size = THEME_BASE_SIZE) +
    ylim(c(0,1))
print(p2_cdev_vs_df.percentDE)
print(p2_cdev_vs_auc.percentDE)
print(p2_mse_vs_df.percentDE)
print(p2_mse_vs_auc.percentDE)

ggplot(performance.df) +
    facet_wrap('percentDE') +
    geom_point(aes(x=cdev,y=AUC, color=percentDE, group=percentDE), alpha = 0.3) +
    theme_bw(base_size = THEME_BASE_SIZE)

ggplot(performance.df) +
    geom_point(aes(x=diffFraction,y=cdev, color=percentDE, group=percentDE), alpha = 0.3) +
    theme_bw(base_size = THEME_BASE_SIZE)

ggplot(performance.df) +
    geom_point(aes(x=diffVarTotal,y=cdev, color=percentDE, group=percentDE), alpha = 0.3)

Combining plots for publication figure

cdev vs normalization quality

p_cdev_upstream_downstream = egg::ggarrange(pl_cdev_vs_df.ngene + theme(legend.position = 'none',axis.title.x=element_blank()),pl_cdev_vs_auc.ngene + theme(axis.title.x=element_blank()),
               pl_cdev_vs_df.percentDE + theme(legend.position = 'none',axis.title.x=element_blank()),pl_cdev_vs_auc.percentDE  + theme(axis.title.x=element_blank()),
               pl_cdev_vs_df.foldchange + theme(legend.position = 'none',axis.title.x=element_blank()),pl_cdev_vs_auc.foldchange + theme(axis.title.x=element_blank()),
               p2_cdev_vs_df.ngene + theme(legend.position = 'none',axis.title.x=element_blank()),p2_cdev_vs_auc.ngene + theme(axis.title.x=element_blank()),
               p2_cdev_vs_df.percentDE + theme(legend.position = 'none'),p2_cdev_vs_auc.percentDE,
               ncol = 2)
ggsave('cdev-vs-downstream-upstream.png', plot = p_cdev_upstream_downstream)

Fold-change MSE vs normalization quality

p_mse_upstream_downstream = egg::ggarrange(pl_mse_vs_df.ngene + theme(legend.position = 'none',axis.title.x=element_blank()),pl_mse_vs_auc.ngene + theme(axis.title.x=element_blank()),
               pl_mse_vs_df.percentDE + theme(legend.position = 'none',axis.title.x=element_blank()),pl_mse_vs_auc.percentDE  + theme(axis.title.x=element_blank()),
               pl_mse_vs_df.foldchange + theme(legend.position = 'none',axis.title.x=element_blank()),pl_mse_vs_auc.foldchange + theme(axis.title.x=element_blank()),
               p2_mse_vs_df.ngene + theme(legend.position = 'none',axis.title.x=element_blank()),p2_mse_vs_auc.ngene + theme(axis.title.x=element_blank()),
               p2_mse_vs_df.percentDE + theme(legend.position = 'none'),p2_mse_vs_auc.percentDE,
               ncol = 2)
ggsave('mse-vs-downstream-upstream.png', plot = p_mse_upstream_downstream)

ttdtrang/cdev-paper documentation built on Dec. 23, 2021, 1:01 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

ttdtrang/cdev-paper
cdev: a ground-truth based measure to evaluate RNA-seq normalization methods

In ttdtrang/cdev-paper: cdev: a ground-truth based measure to evaluate RNA-seq normalization methods

Homogeneous fold-change

Varying number of genes

DE tests

Relating to normalization quality

cdev vs reference set quality

cdev vs DE performance

MSE vs reference set quality

MSE vs DE performance

Varying fold-change

DE tests

Relating to normalization quality

Varying DE percentage

DE tests

Relating to normalization quality

Heterogeneous fold-changes

Varying number of genes

DE tests

Relating to normalization quality

Varying DE percentage

DE tests

Relating to normalization quality

Combining plots for publication figure

cdev vs normalization quality

Fold-change MSE vs normalization quality

R Package Documentation

Browse R Packages

We want your feedback!

ttdtrang/cdev-paper cdev: a ground-truth based measure to evaluate RNA-seq normalization methods

In ttdtrang/cdev-paper: cdev: a ground-truth based measure to evaluate RNA-seq normalization methods

Homogeneous fold-change

Varying number of genes

DE tests

Relating to normalization quality

cdev vs reference set quality

cdev vs DE performance

MSE vs reference set quality

MSE vs DE performance

Varying fold-change

DE tests

Relating to normalization quality

Varying DE percentage

DE tests

Relating to normalization quality

Heterogeneous fold-changes

Varying number of genes

DE tests

Relating to normalization quality

Varying DE percentage

DE tests

Relating to normalization quality

Combining plots for publication figure

cdev vs normalization quality

Fold-change MSE vs normalization quality

R Package Documentation

Browse R Packages

We want your feedback!

ttdtrang/cdev-paper
cdev: a ground-truth based measure to evaluate RNA-seq normalization methods