In csoneson/countsimQC: Compare Characteristic Features of Count Data Sets

knitr::opts_chunk$set(echo = showCode, warning = FALSE, message = FALSE)
if (useRAGG) {
  knitr::opts_chunk$set(dev = "ragg_png", dpi = dpi)
}

# knitr::opts_chunk$set(dev.args = list(png = list(type = "cairo")))
knitr::opts_knit$set(progress = FALSE, verbose = FALSE)

## Get the number of data sets
nDatasets <- length(ddsList)

## Determine the number of rows and columns in facetted plots 
## (first element = columns, second element = rows)
colRow <- grDevices::n2mfrow(length(ddsList))

## Set height/width of each panel in multi-panel plots
panelSize <- 4

## Define out.width and out.height for the different plot categories
scalefactor <- 96
## barplots, 10x7in
fw1 <- 10
fh1 <- 7
if (useRAGG) {
  ow1 <- paste0(fw1 * scalefactor, "px")
  oh1 <- NULL
} else {
  ow1 <- oh1 <- NULL
}
## multipanel, panelSize * colRow[1] x panelSize * colRow[2]
fw2 <- panelSize * colRow[1]
fh2 <- panelSize * colRow[2]
if (useRAGG) {
  ow2 <- paste0(fw2 * scalefactor, "px")
  oh2 <- NULL
} else {
  ow2 <- oh2 <- NULL
}
## scatter plots etc, 7x5in
fw3 <- 7
fh3 <- 5
if (useRAGG) {
  ow3 <- paste0(fw3 * scalefactor, "px")
  oh3 <- NULL
} else {
  ow3 <- oh3 <- NULL
}

## Initialize plot list
plots <- list()

## Define plot theme
thm <- theme_bw() + 
  theme(axis.text = element_text(size = 15),
        axis.title = element_text(size = 14),
        strip.text = element_text(size = 15))

## ----------------------- Calculate dispersions ----------------------- ##
if (!quiet) {
  message("Calculating dispersions. This can take some time, please be patient.")
}
if (is.finite(maxNForDisp)) {
  msg <- paste0("If there are more than ", maxNForDisp, 
                " samples in a data set, the dispersions are calculated using ",
                maxNForDisp, " randomly selected samples.")
} else {
  msg <- ""
}
obj <- calculateDispersionsddsList(ddsList = ddsList, 
                                   maxNForDisp = maxNForDisp)

## --------------- Calculate between-sample correlations --------------- ##
if (!quiet) {
  message("Calculating sample correlations.")
}
sampleCorrDF <- calculateSampleCorrs(ddsList = obj, 
                                     maxNForCorr = maxNForCorr)

## -------------- Calculate between-feature correlations --------------- ##
if (!quiet) {
  message("Calculating feature correlations.")
}
featureCorrDF <- calculateFeatureCorrs(ddsList = obj, 
                                       maxNForCorr = maxNForCorr)

## ----------------- Summarize sample characteristics ------------------ ##
if (!quiet) {
  message("Summarizing calculated values.")
}
sampleDF <- lapply(obj, function(x) {
  data.frame(
    Libsize = colSums(x$dge$counts),
    Fraczero = colMeans(x$dge$counts == 0),
    TMM = x$dge$samples$norm.factors
  ) %>% dplyr::mutate(EffLibsize = Libsize * TMM)
})
ns <- sapply(sampleDF, nrow)
sampleDF <- do.call(rbind, sampleDF) %>% 
  dplyr::mutate(dataset = rep(names(sampleDF), ns))

## ---------------- Summarize feature characteristics ------------------ ##
featureDF <- lapply(obj, function(x) {
  data.frame(
    Tagwise = sqrt(x$dge$tagwise.dispersion),
    Common = sqrt(x$dge$common.dispersion),
    Trend = sqrt(x$dge$trended.dispersion),
    AveLogCPM = x$dge$AveLogCPM,
    AveLogCPMDisp = x$dge$AveLogCPMDisp, 
    average_log2_cpm = apply(edgeR::cpm(x$dge, prior.count = 2, log = TRUE), 1, mean), 
    variance_log2_cpm = apply(edgeR::cpm(x$dge, prior.count = 2, log = TRUE), 1, var),
    Fraczero = rowMeans(x$dge$counts == 0),
    dispGeneEst = rowData(x$dds)$dispGeneEst,
    dispFit = rowData(x$dds)$dispFit,
    dispFinal = rowData(x$dds)$dispersion,
    baseMeanDisp = rowData(x$dds)$baseMeanDisp,
    baseMean = rowData(x$dds)$baseMean
  )
})
ns <- sapply(featureDF, nrow)
featureDF <- do.call(rbind, featureDF) %>% 
  dplyr::mutate(dataset = rep(names(featureDF), ns))

## ----------------- Summarize data set characteristics ---------------- ##
datasetDF <- do.call(rbind, lapply(obj, function(x) {
  data.frame(
    prior_df = paste0("prior.df = ", round(x$dge$prior.df, 2)),
    nVars = nrow(x$dge$counts),
    nSamples = ncol(x$dge$counts)
  )
})) %>% 
  dplyr::mutate(AveLogCPMDisp = 0.8 * max(featureDF$AveLogCPMDisp)) %>%
  dplyr::mutate(Tagwise = 0.9 * max(featureDF$Tagwise)) %>%
  dplyr::mutate(dataset = names(obj))

Description

r I(description)

Design formulae

The following model formulae were used in the dispersion calculations for the different data sets. Note that if a count matrix or data frame was provided in place of a DESeqDataSet for some data set, the corresponding design formula is set to ~1, thus assuming that all samples are to be treated as replicates.

## Print design information
for (ds in names(ddsList)) {
  cat(ds, ": ", as.character(DESeq2::design(ddsList[[ds]])), "\n")
}

Data set dimensions {.tabset .tabset-pills}

if (!quiet) {
  message("Processing data set dimensions.")
}

These bar plots show the number of samples (columns) and features (rows) in each data set.

Number of samples (columns)

plots[["nSamples"]] <- 
  ggplot(datasetDF, aes(x = dataset, y = nSamples, fill = dataset)) + 
  geom_bar(stat = "identity", alpha = 0.5) + 
  xlab("") + ylab("Number of samples (columns)") + 
  thm + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
plots[["nSamples"]]

Number of features (rows)

plots[["nVariables"]] <- 
  ggplot(datasetDF, aes(x = dataset, y = nVars, fill = dataset)) + 
  geom_bar(stat = "identity", alpha = 0.5) + 
  xlab("") + ylab("Number of features (rows)") + 
  thm + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5))
plots[["nVariables"]]

Dispersion/BCV plots {.tabset .tabset-pills}

if (!quiet) {
  message("Processing dispersion estimates.")
}

Disperson/BCV plots show the association between the average abundance and the dispersion or "biological coefficient of variation" (sqrt(dispersion)), as calculated by edgeR [@Robinson2010edgeR] and DESeq2 [@Love2014DESeq2]. In the edgeR plot, the estimate of the prior degrees of freedom is indicated. r I(msg)

edgeR

The black dots represent the tagwise dispersion estimates, the red line the common dispersion and the blue curve represents the trended dispersion estimates. For further information about the dispersion estimation in edgeR, see @Chen2014Dispersion.

plots[["BCVedgeR"]] <- 
  ggplot(featureDF %>% dplyr::arrange(AveLogCPMDisp), 
         aes(x = AveLogCPMDisp, y = Tagwise)) + 
  geom_point(size = 0.25, alpha = 0.5) + 
  facet_wrap(~dataset, nrow = colRow[2]) + 
  geom_line(aes(y = Trend), color = "blue", size = 1.5) + 
  geom_line(aes(y = Common), color = "red", size = 1.5) +
  geom_text(data = datasetDF, aes(label = prior_df)) + 
  xlab("Average log CPM") + ylab("Biological coefficient of variation") + 
  thm
plots[["BCVedgeR"]]

Pairwise comparisons - edgeR

r I(defineTableDesc(calculateStatistics = calculateStatistics, subsampleSize = subsampleSize, kfrac = kfrac, kmin = kmin, obstype = "feature", aspect = "average log CPM and tagwise dispersion", minvalue = 0, maxvalue = 1, permutationPvalues = permutationPvalues, nPermutations = nPermutations, nDatasets = nDatasets)$tabledesc2d)

if (calculateStatistics) {
  makeDF(df = featureDF, column = c("AveLogCPMDisp", "Tagwise"), 
         permutationPvalues = permutationPvalues, nPermutations = nPermutations, 
         subsampleSize = subsampleSize, kmin = kmin, kfrac = kfrac)
}

DESeq2

The black dots are the gene-wise dispersion estimates, the red curve the fitted mean-dispersion relationship and the blue circles represent the final dispersion estimates.For further information about the dispersion estimation in DESeq2, see @Love2014DESeq2.

plots[["dispersionDESeq2"]] <- 
  ggplot(featureDF %>% dplyr::arrange(baseMeanDisp), 
         aes(x = baseMeanDisp, y = dispGeneEst)) + 
  geom_point(size = 0.25, alpha = 0.5) + 
  facet_wrap(~dataset, nrow = colRow[2]) + scale_x_log10() + scale_y_log10() +  
  geom_point(aes(y = dispFinal), color = "lightblue", shape = 21) + 
  geom_line(aes(y = dispFit), color = "red", size = 1.5) + 
  xlab("Base mean") + ylab("Dispersion") + 
  thm
plots[["dispersionDESeq2"]]

Pairwise comparisons - DESeq2

r I(defineTableDesc(calculateStatistics = calculateStatistics, subsampleSize = subsampleSize, kfrac = kfrac, kmin = kmin, obstype = "feature", aspect = "base mean and genewise dispersion", minvalue = 0, maxvalue = 1, permutationPvalues = permutationPvalues, nPermutations = nPermutations, nDatasets = nDatasets)$tabledesc2d)

if (calculateStatistics) {
  makeDF(df = featureDF, column = c("baseMeanDisp", "dispGeneEst"), 
         permutationPvalues = permutationPvalues, nPermutations = nPermutations, 
         subsampleSize = subsampleSize, kmin = kmin, kfrac = kfrac)
}

Mean-variance plots {.tabset .tabset-pills}

if (!quiet) {
  message("Processing mean-variance relationship.")
}

These scatter plots show the relation between the empirical mean and variance of the features. The difference between these mean-variance plots and the mean-dispersion plots above is that the plots in this section do not take the information about the experimental design and sample grouping into account, but simply display the mean and variance of log2(CPM) estimates across all samples, calculated using the cpm function from edgeR [@Robinson2010edgeR], with a prior count of 2.

Separate scatter plots

plots[["meanVarSepScatter"]] <- 
  ggplot(featureDF, aes(x = average_log2_cpm, y = variance_log2_cpm)) + 
  geom_point(size = 0.75, alpha = 0.5) + 
  facet_wrap(~dataset, nrow = colRow[2]) + 
  xlab("Mean of log2(CPM)") + ylab("Variance of log2(CPM)") + 
  thm
plots[["meanVarSepScatter"]]

Overlaid scatter plots

plots[["meanVarOverlaidScatter"]] <- 
  ggplot(featureDF, aes(x = average_log2_cpm, y = variance_log2_cpm, color = dataset)) + 
  geom_point(size = 0.75, alpha = 0.5) +
  xlab("Mean of log2(CPM)") + ylab("Variance of log2(CPM)") + 
  thm
plots[["meanVarOverlaidScatter"]]

Pairwise comparisons

r I(defineTableDesc(calculateStatistics = calculateStatistics, subsampleSize = subsampleSize, kfrac = kfrac, kmin = kmin, obstype = "feature", aspect = "average and variance of the log CPM", minvalue = 0, maxvalue = 1, permutationPvalues = permutationPvalues, nPermutations = nPermutations, nDatasets = nDatasets)$tabledesc2d)

if (calculateStatistics) {
  makeDF(df = featureDF, column = c("average_log2_cpm", "variance_log2_cpm"), 
         permutationPvalues = permutationPvalues, nPermutations = nPermutations, 
         subsampleSize = subsampleSize, kmin = kmin, kfrac = kfrac)
}

Library sizes {.tabset .tabset-pills}

if (!quiet) {
  message("Processing library sizes.")
}

These plots illustrate the distribution of the total read count per sample, i.e., the column sums of the respective data matrices.

Separate histograms

plots[["libsizeSepHist"]] <- 
  ggplot(sampleDF, aes(x = Libsize)) + geom_histogram(bins = 30) + 
  facet_wrap(~dataset, nrow = colRow[2]) +
  xlab("Library size") + thm
plots[["libsizeSepHist"]]

Overlaid histograms

plots[["libsizeOverlaidHist"]] <- 
  ggplot(sampleDF, aes(x = Libsize, group = dataset, fill = dataset)) + 
  geom_histogram(bins = 30, alpha = 0.5, position = "identity") +
  xlab("Library size") + thm
plots[["libsizeOverlaidHist"]]

Density plots

plots[["libsizeDensity"]] <- 
  ggplot(sampleDF, aes(x = Libsize, group = dataset, color = dataset)) + 
  geom_line(alpha = 0.5, size = 1.5, stat = "density") +
  xlab("Library size") + thm
plots[["libsizeDensity"]]

Density plots (filled)

plots[["libsizeDensityFilled"]] <- 
  ggplot(sampleDF, aes(x = Libsize, group = dataset, fill = dataset)) + 
  geom_density(alpha = 0.5) +
  xlab("Library size") + thm
plots[["libsizeDensityFilled"]]

Box plots

plots[["libsizeBox"]] <- 
  ggplot(sampleDF, aes(x = dataset, y = Libsize, color = dataset)) + 
  geom_boxplot(outlier.size = -1, alpha = 0.5) + 
  geom_point(position = position_jitter(width = 0.2), size = 2, alpha = 0.5) + 
  ylab("Library size") + xlab("") + thm + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["libsizeBox"]]

Violin plots

plots[["libsizeViolin"]] <- 
  ggplot(sampleDF, aes(x = dataset, y = Libsize, fill = dataset)) + 
  geom_violin(alpha = 0.5) + 
  ylab("Library size") + xlab("") + thm + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["libsizeViolin"]]

Empirical cumulative distribution function

plots[["libsizeEcdf"]] <- 
  ggplot(sampleDF, aes(x = Libsize, group = dataset, color = dataset)) + 
  stat_ecdf(size = 1.5, alpha = 0.5) + ylab("Cumulative proportion") + 
  xlab("Library size") + thm
plots[["libsizeEcdf"]]

Pairwise comparisons

r I(defineTableDesc(calculateStatistics = calculateStatistics, subsampleSize = subsampleSize, kfrac = kfrac, kmin = kmin, obstype = "sample", aspect = "library size", minvalue = min(sampleDF$Libsize, na.rm = TRUE), maxvalue = max(sampleDF$Libsize, na.rm = TRUE), permutationPvalues = permutationPvalues, nPermutations = nPermutations, nDatasets = nDatasets)$tabledesc)

if (calculateStatistics) {
  makeDF(df = sampleDF, column = "Libsize", 
         permutationPvalues = permutationPvalues, nPermutations = nPermutations, 
         subsampleSize = subsampleSize, kmin = kmin, kfrac = kfrac)
}

TMM normalization factors {.tabset .tabset-pills}

if (!quiet) {
  message("Processing TMM normalization factors.")
}

The plots below show the distribution of the TMM normalization factors [@Robinson2010TMM], intended to adjust for differences in RNA composition, as calculated by edgeR [@Robinson2010edgeR].

Separate histograms

plots[["tmmSepHist"]] <- 
  ggplot(sampleDF, aes(x = TMM)) + geom_histogram(bins = 30) + 
  facet_wrap(~dataset, nrow = colRow[2]) +
  xlab("TMM normalization factor") + thm
plots[["tmmSepHist"]]

Overlaid histograms

plots[["tmmOverlaidHist"]] <- 
  ggplot(sampleDF, aes(x = TMM, group = dataset, fill = dataset)) + 
  geom_histogram(bins = 30, alpha = 0.5, position = "identity") +
  xlab("TMM normalization factor") + thm
plots[["tmmOverlaidHist"]]

Density plots

plots[["tmmDensity"]] <- 
  ggplot(sampleDF, aes(x = TMM, group = dataset, color = dataset)) + 
  geom_line(alpha = 0.5, size = 1.5, stat = "density") +
  xlab("TMM normalization factor") + thm
plots[["tmmDensity"]]

Density plots (filled)

plots[["tmmDensityFilled"]] <- 
  ggplot(sampleDF, aes(x = TMM, group = dataset, fill = dataset)) + 
  geom_density(alpha = 0.5) +
  xlab("TMM normalization factor") + thm
plots[["tmmDensityFilled"]]

Box plots

plots[["tmmBox"]] <- 
  ggplot(sampleDF, aes(x = dataset, y = TMM, color = dataset)) + 
  geom_boxplot(outlier.size = -1, alpha = 0.5) + 
  geom_point(position = position_jitter(width = 0.2), size = 2, alpha = 0.5) + 
  ylab("TMM normalization factor") + xlab("") + thm + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["tmmBox"]]

Violin plots

plots[["tmmViolin"]] <- 
  ggplot(sampleDF, aes(x = dataset, y = TMM, fill = dataset)) + 
  geom_violin(alpha = 0.5) + 
  ylab("TMM normalization factor") + xlab("") + thm + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["tmmViolin"]]

Empirical cumulative distribution function

plots[["tmmEcdf"]] <- 
  ggplot(sampleDF, aes(x = TMM, group = dataset, color = dataset)) + 
  stat_ecdf(size = 1.5, alpha = 0.5) + ylab("Cumulative proportion") + 
  xlab("TMM normalization factor") + thm
plots[["tmmEcdf"]]

Pairwise comparisons

r I(defineTableDesc(calculateStatistics = calculateStatistics, subsampleSize = subsampleSize, kfrac = kfrac, kmin = kmin, obstype = "sample", aspect = "TMM factor", minvalue = min(sampleDF$TMM, na.rm = TRUE), maxvalue = max(sampleDF$TMM, na.rm = TRUE), permutationPvalues = permutationPvalues, nPermutations = nPermutations, nDatasets = nDatasets)$tabledesc)

if (calculateStatistics) {
  makeDF(df = sampleDF, column = "TMM", 
         permutationPvalues = permutationPvalues, nPermutations = nPermutations, 
         subsampleSize = subsampleSize, kmin = kmin, kfrac = kfrac)
}

Effective library sizes {.tabset .tabset-pills}

if (!quiet) {
  message("Processing effective library sizes.")
}

These plots show the distribution of the "effective library sizes", defined as the total count per sample multiplied by the corresponding TMM normalization factor.

Separate histograms

plots[["effLibsizeSepHist"]] <- 
  ggplot(sampleDF, aes(x = EffLibsize)) + geom_histogram(bins = 30) + 
  facet_wrap(~dataset, nrow = colRow[2]) +
  xlab("Effective library size") + thm
plots[["effLibsizeSepHist"]]

Overlaid histograms

plots[["effLibsizeOverlaidHist"]] <- 
  ggplot(sampleDF, aes(x = EffLibsize, group = dataset, fill = dataset)) + 
  geom_histogram(bins = 30, alpha = 0.5, position = "identity") +
  xlab("Effective library size") + thm
plots[["effLibsizeOverlaidHist"]]

Density plots

plots[["effLibsizeDensity"]] <- 
  ggplot(sampleDF, aes(x = EffLibsize, group = dataset, color = dataset)) + 
  geom_line(alpha = 0.5, size = 1.5, stat = "density") +
  xlab("Effective library size") + thm
plots[["effLibsizeDensity"]]

Density plots (filled)

plots[["effLibsizeDensityFilled"]] <- 
  ggplot(sampleDF, aes(x = EffLibsize, group = dataset, fill = dataset)) + 
  geom_density(alpha = 0.5) +
  xlab("Effective library size") + thm
plots[["effLibsizeDensityFilled"]]

Box plots

plots[["effLibsizeBox"]] <- 
  ggplot(sampleDF, aes(x = dataset, y = EffLibsize, color = dataset)) + 
  geom_boxplot(outlier.size = -1, alpha = 0.5) + 
  geom_point(position = position_jitter(width = 0.2), size = 2, alpha = 0.5) + 
  ylab("Effective library size") + xlab("") + thm + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["effLibsizeBox"]]

Violin plots

plots[["effLibsizeViolin"]] <- 
  ggplot(sampleDF, aes(x = dataset, y = EffLibsize, fill = dataset)) + 
  geom_violin(alpha = 0.5) + 
  ylab("Effective library size") + xlab("") + thm + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["effLibsizeViolin"]]

Empirical cumulative distribution function

plots[["effLibsizeEcdf"]] <- 
  ggplot(sampleDF, aes(x = EffLibsize, group = dataset, color = dataset)) + 
  stat_ecdf(size = 1.5, alpha = 0.5) + ylab("Cumulative proportion") + 
  xlab("Effective library size") + thm
plots[["effLibsizeEcdf"]]

Pairwise comparisons

r I(defineTableDesc(calculateStatistics = calculateStatistics, subsampleSize = subsampleSize, kfrac = kfrac, kmin = kmin, obstype = "sample", aspect = "effective library size", minvalue = min(sampleDF$EffLibsize, na.rm = TRUE), maxvalue = max(sampleDF$EffLibsize, na.rm = TRUE), permutationPvalues = permutationPvalues, nPermutations = nPermutations, nDatasets = nDatasets)$tabledesc)

if (calculateStatistics) {
  makeDF(df = sampleDF, column = "EffLibsize", 
         permutationPvalues = permutationPvalues, nPermutations = nPermutations, 
         subsampleSize = subsampleSize, kmin = kmin, kfrac = kfrac)
}

Expression distributions (average log CPM) {.tabset .tabset-pills}

if (!quiet) {
  message("Processing expression distributions.")
}

The plots in this section show the distribution of average abundance values for the features. The abundances are log CPM values calculated by edgeR.

Separate histograms

plots[["logCPMSepHist"]] <- 
  ggplot(featureDF, aes(x = AveLogCPM)) + geom_histogram(bins = 30) + 
  facet_wrap(~dataset, nrow = colRow[2]) +
  xlab("Average log CPM") + thm
plots[["logCPMSepHist"]]

Overlaid histograms

plots[["logCPMOverlaidHist"]] <- 
  ggplot(featureDF, aes(x = AveLogCPM, group = dataset, fill = dataset)) + 
  geom_histogram(bins = 30, alpha = 0.5, position = "identity") +
  xlab("Average log CPM") + thm
plots[["logCPMOverlaidHist"]]

Density plots

plots[["logCPMDensity"]] <- 
  ggplot(featureDF, aes(x = AveLogCPM, group = dataset, color = dataset)) + 
  geom_line(alpha = 0.5, size = 1.5, stat = "density") +
  xlab("Average log CPM") + thm
plots[["logCPMDensity"]]

Density plots (filled)

plots[["logCPMDensityFilled"]] <- 
  ggplot(featureDF, aes(x = AveLogCPM, group = dataset, fill = dataset)) + 
  geom_density(alpha = 0.5) +
  xlab("Average log CPM") + thm
plots[["logCPMDensityFilled"]]

Box plots

plots[["logCPMBox"]] <- 
  ggplot(featureDF, aes(x = dataset, y = AveLogCPM, color = dataset)) + 
  geom_boxplot(outlier.size = -1, alpha = 0.5) + 
  geom_point(position = position_jitter(width = 0.2), size = 2, alpha = 0.5) + 
  ylab("Average log CPM") + xlab("") + thm + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["logCPMBox"]]

Violin plots

plots[["logCPMViolin"]] <- 
  ggplot(featureDF, aes(x = dataset, y = AveLogCPM, fill = dataset)) + 
  geom_violin(alpha = 0.5) + 
  ylab("Average log CPM") + xlab("") + thm + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["logCPMViolin"]]

Empirical cumulative distribution function

plots[["logCPMEcdf"]] <- 
  ggplot(featureDF, aes(x = AveLogCPM, group = dataset, color = dataset)) + 
  stat_ecdf(size = 1.5, alpha = 0.5) + ylab("Cumulative proportion") + 
  xlab("Average log CPM") + thm
plots[["logCPMEcdf"]]

Pairwise comparisons

r I(defineTableDesc(calculateStatistics = calculateStatistics, subsampleSize = subsampleSize, kfrac = kfrac, kmin = kmin, obstype = "feature", aspect = "average log CPM", minvalue = min(featureDF$AveLogCPM, na.rm = TRUE), maxvalue = max(featureDF$AveLogCPM, na.rm = TRUE), permutationPvalues = permutationPvalues, nPermutations = nPermutations, nDatasets = nDatasets)$tabledesc)

if (calculateStatistics) {
  makeDF(df = featureDF, column = "AveLogCPM", 
         permutationPvalues = permutationPvalues, nPermutations = nPermutations, 
         subsampleSize = subsampleSize, kmin = kmin, kfrac = kfrac)
}

Fraction zeros per sample {.tabset .tabset-pills}

if (!quiet) {
  message("Processing fraction zeros per sample.")
}

These plots show the distribution of the fraction of zeros observed per sample (column) in the count matrices.

Separate histograms

plots[["fraczeroSampleSepHist"]] <- 
  ggplot(sampleDF, aes(x = Fraczero)) + geom_histogram(bins = 30) + 
  facet_wrap(~dataset, nrow = colRow[2]) +
  xlab("Fraction zeros per sample") + thm
plots[["fraczeroSampleSepHist"]]

Overlaid histograms

plots[["fraczeroSampleOverlaidHist"]] <- 
  ggplot(sampleDF, aes(x = Fraczero, group = dataset, fill = dataset)) + 
  geom_histogram(bins = 30, alpha = 0.5, position = "identity") +
  xlab("Fraction zeros per sample") + thm
plots[["fraczeroSampleOverlaidHist"]]

Density plots

plots[["fraczeroSampleDensity"]] <- 
  ggplot(sampleDF, aes(x = Fraczero, group = dataset, color = dataset)) + 
  geom_line(alpha = 0.5, size = 1.5, stat = "density") +
  xlab("Fraction zeros per sample") + thm
plots[["fraczeroSampleDensity"]]

Density plots (filled)

plots[["fraczeroSampleDensityFilled"]] <- 
  ggplot(sampleDF, aes(x = Fraczero, group = dataset, fill = dataset)) + 
  geom_density(alpha = 0.5) +
  xlab("Fraction zeros per sample") + thm
plots[["fraczeroSampleDensityFilled"]]

Box plots

plots[["fraczeroSampleBox"]] <- 
  ggplot(sampleDF, aes(x = dataset, y = Fraczero, color = dataset)) + 
  geom_boxplot(outlier.size = -1, alpha = 0.5) + 
  geom_point(position = position_jitter(width = 0.2), size = 2, alpha = 0.5) + 
  ylab("Fraction zeros per sample") + xlab("") + thm + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["fraczeroSampleBox"]]

Violin plots

plots[["fraczeroSampleViolin"]] <- 
  ggplot(sampleDF, aes(x = dataset, y = Fraczero, fill = dataset)) + 
  geom_violin(alpha = 0.5) + 
  ylab("Fraction zeros per sample") + xlab("") + thm + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["fraczeroSampleViolin"]]

Empirical cumulative distribution function

plots[["fraczeroSampleEcdf"]] <- 
  ggplot(sampleDF, aes(x = Fraczero, group = dataset, color = dataset)) + 
  stat_ecdf(size = 1.5, alpha = 0.5) + ylab("Cumulative proportion") + 
  xlab("Fraction zeros per sample") + thm
plots[["fraczeroSampleEcdf"]]

Pairwise comparisons

r I(defineTableDesc(calculateStatistics = calculateStatistics, subsampleSize = subsampleSize, kfrac = kfrac, kmin = kmin, obstype = "sample", aspect = "fraction zeros", minvalue = min(sampleDF$Fraczero, na.rm = TRUE), maxvalue = max(sampleDF$Fraczero, na.rm = TRUE), permutationPvalues = permutationPvalues, nPermutations = nPermutations, nDatasets = nDatasets)$tabledesc)

if (calculateStatistics) {
  makeDF(df = sampleDF, column = "Fraczero", 
         permutationPvalues = permutationPvalues, nPermutations = nPermutations, 
         subsampleSize = subsampleSize, kmin = kmin, kfrac = kfrac)
}

Fraction zeros per feature {.tabset .tabset-pills}

if (!quiet) {
  message("Processing fraction zeros per feature.")
}

These plots illustrate the distribution of the fraction of zeros observed per feature (row) in the count matrices.

Separate histograms

plots[["fraczeroFeatureSepHist"]] <- 
  ggplot(featureDF, aes(x = Fraczero)) + geom_histogram(bins = 30) + 
  facet_wrap(~dataset, nrow = colRow[2]) +
  xlab("Fraction zeros per feature") + thm
plots[["fraczeroFeatureSepHist"]]

Overlaid histograms

plots[["fraczeroFeatureOverlaidHist"]] <- 
  ggplot(featureDF, aes(x = Fraczero, group = dataset, fill = dataset)) + 
  geom_histogram(bins = 30, alpha = 0.5, position = "identity") +
  xlab("Fraction zeros per feature") + thm
plots[["fraczeroFeatureOverlaidHist"]]

Density plots

plots[["fraczeroFeatureDensity"]] <- 
  ggplot(featureDF, aes(x = Fraczero, group = dataset, color = dataset)) + 
  geom_line(alpha = 0.5, size = 1.5, stat = "density") +
  xlab("Fraction zeros per feature") + thm
plots[["fraczeroFeatureDensity"]]

Density plots (filled)

plots[["fraczeroFeatureDensityFilled"]] <- 
  ggplot(featureDF, aes(x = Fraczero, group = dataset, fill = dataset)) + 
  geom_density(alpha = 0.5) +
  xlab("Fraction zeros per feature") + thm
plots[["fraczeroFeatureDensityFilled"]]

Box plots

plots[["fraczeroFeatureBox"]] <- 
  ggplot(featureDF, aes(x = dataset, y = Fraczero, color = dataset)) + 
  geom_boxplot(outlier.size = -1, alpha = 0.5) + 
  geom_point(position = position_jitter(width = 0.2), size = 2, alpha = 0.5) + 
  ylab("Fraction zeros per feature") + xlab("") + thm + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["fraczeroFeatureBox"]]

Violin plots

plots[["fraczeroFeatureViolin"]] <- 
  ggplot(featureDF, aes(x = dataset, y = Fraczero, fill = dataset)) + 
  geom_violin(alpha = 0.5) + 
  ylab("Fraction zeros per feature") + xlab("") + thm + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["fraczeroFeatureViolin"]]

Empirical cumulative distribution function

plots[["fraczeroFeatureEcdf"]] <- 
  ggplot(featureDF, aes(x = Fraczero, group = dataset, color = dataset)) + 
  stat_ecdf(size = 1.5, alpha = 0.5) + ylab("Cumulative proportion") + 
  xlab("Fraction zeros per feature") + thm
plots[["fraczeroFeatureEcdf"]]

Pairwise comparisons

r I(defineTableDesc(calculateStatistics = calculateStatistics, subsampleSize = subsampleSize, kfrac = kfrac, kmin = kmin, obstype = "feature", aspect = "fraction zeros", minvalue = min(featureDF$Fraczero, na.rm = TRUE), maxvalue = max(featureDF$Fraczero, na.rm = TRUE), permutationPvalues = permutationPvalues, nPermutations = nPermutations, nDatasets = nDatasets)$tabledesc)

if (calculateStatistics) {
  makeDF(df = featureDF, column = "Fraczero", 
         permutationPvalues = permutationPvalues, nPermutations = nPermutations, 
         subsampleSize = subsampleSize, kmin = kmin, kfrac = kfrac)
}

Sample-sample correlations {.tabset .tabset-pills}

if (!quiet) {
  message("Processing sample-sample correlations.")
}

The plots below show the distribution of Spearman correlation coefficients for pairs of samples, calculated from the log(CPM) values obtained via the cpm function from edgeR, with a prior.count of 2. If there are more than r maxNForCorr samples in a data set, the pairwise correlations between r maxNForCorr randomly selected samples are shown.

Separate histograms

plots[["sampleCorrSepHist"]] <- 
  ggplot(sampleCorrDF, aes(x = Correlation)) + geom_histogram(bins = 30) + 
  facet_wrap(~dataset, nrow = colRow[2]) +
  xlab("Sample-sample correlation") + thm
plots[["sampleCorrSepHist"]]

Overlaid histograms

plots[["sampleCorrOverlaidHist"]] <- 
  ggplot(sampleCorrDF, aes(x = Correlation, group = dataset, fill = dataset)) + 
  geom_histogram(bins = 30, alpha = 0.5, position = "identity") +
  xlab("Sample-sample correlation") + thm
plots[["sampleCorrOverlaidHist"]]

Density plots

plots[["sampleCorrDensity"]] <- 
  ggplot(sampleCorrDF, aes(x = Correlation, group = dataset, color = dataset)) + 
  geom_line(alpha = 0.5, size = 1.5, stat = "density") +
  xlab("Sample-sample correlation") + thm
plots[["sampleCorrDensity"]]

Density plots (filled)

plots[["sampleCorrDensityFilled"]] <- 
  ggplot(sampleCorrDF, aes(x = Correlation, group = dataset, fill = dataset)) + 
  geom_density(alpha = 0.5) +
  xlab("Sample-sample correlation") + thm
plots[["sampleCorrDensityFilled"]]

Box plots

plots[["sampleCorrBox"]] <- 
  ggplot(sampleCorrDF, aes(x = dataset, y = Correlation, color = dataset)) + 
  geom_boxplot(outlier.size = -1, alpha = 0.5) + 
  geom_point(position = position_jitter(width = 0.2), size = 2, alpha = 0.5) + 
  ylab("Sample-sample correlation") + xlab("") + thm + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["sampleCorrBox"]]

Violin plots

plots[["sampleCorrViolin"]] <- 
  ggplot(sampleCorrDF, aes(x = dataset, y = Correlation, fill = dataset)) + 
  geom_violin(alpha = 0.5) + 
  ylab("Sample-sample correlation") + xlab("") + thm + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["sampleCorrViolin"]]

Empirical cumulative distribution function

plots[["sampleCorrEcdf"]] <- 
  ggplot(sampleCorrDF, aes(x = Correlation, group = dataset, color = dataset)) + 
  stat_ecdf(size = 1.5, alpha = 0.5) + ylab("Cumulative proportion") + 
  xlab("Sample-sample correlation") + thm
plots[["sampleCorrEcdf"]]

Pairwise comparisons

r I(defineTableDesc(calculateStatistics = calculateStatistics, subsampleSize = subsampleSize, kfrac = kfrac, kmin = kmin, obstype = "sample pair", aspect = "Spearman correlation", minvalue = min(sampleCorrDF$Correlation, na.rm = TRUE), maxvalue = max(sampleCorrDF$Correlation, na.rm = TRUE), permutationPvalues = permutationPvalues, nPermutations = nPermutations, nDatasets = nDatasets)$tabledesc)

if (calculateStatistics) {
  makeDF(df = sampleCorrDF, column = "Correlation", 
         permutationPvalues = permutationPvalues, nPermutations = nPermutations, 
         subsampleSize = subsampleSize, kmin = kmin, kfrac = kfrac)
}

Feature-feature correlations {.tabset .tabset-pills}

if (!quiet) {
  message("Processing feature-feature correlations.")
}

These plots illustrate the distribution of Spearman correlation coefficients for pairs of features, calculated from the log(CPM) values obtained via the cpm function from edgeR, with a prior.count of 2. Only non-constant features are considered, and if there are more than r maxNForCorr such features in a data set, the pairwise correlations between r maxNForCorr randomly selected features are shown.

Separate histograms

plots[["featureCorrSepHist"]] <- 
  ggplot(featureCorrDF, aes(x = Correlation)) + geom_histogram(bins = 30) + 
  facet_wrap(~dataset, nrow = colRow[2]) +
  xlab("Feature-feature correlation") + thm
plots[["featureCorrSepHist"]]

Overlaid histograms

plots[["featureCorrOverlaidHist"]] <- 
  ggplot(featureCorrDF, aes(x = Correlation, group = dataset, fill = dataset)) + 
  geom_histogram(bins = 30, alpha = 0.5, position = "identity") +
  xlab("Feature-feature correlation") + thm
plots[["featureCorrOverlaidHist"]]

Density plots

plots[["featureCorrDensity"]] <- 
  ggplot(featureCorrDF, aes(x = Correlation, group = dataset, color = dataset)) + 
  geom_line(alpha = 0.5, size = 1.5, stat = "density") +
  xlab("Feature-feature correlation") + thm
plots[["featureCorrDensity"]]

Density plots (filled)

plots[["featureCorrDensityFilled"]] <- 
  ggplot(featureCorrDF, aes(x = Correlation, group = dataset, fill = dataset)) + 
  geom_density(alpha = 0.5) +
  xlab("Feature-feature correlation") + thm
plots[["featureCorrDensityFilled"]]

Box plots

plots[["featureCorrBox"]] <- 
  ggplot(featureCorrDF, aes(x = dataset, y = Correlation, color = dataset)) + 
  geom_boxplot(outlier.size = -1, alpha = 0.5) + 
  geom_point(position = position_jitter(width = 0.2), size = 2, alpha = 0.5) + 
  ylab("Feature-feature correlation") + xlab("") + thm + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["featureCorrBox"]]

Violin plots

plots[["featureCorrViolin"]] <- 
  ggplot(featureCorrDF, aes(x = dataset, y = Correlation, fill = dataset)) + 
  geom_violin(alpha = 0.5) + 
  ylab("Feature-feature correlation") + xlab("") + thm + 
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
plots[["featureCorrViolin"]]

Empirical cumulative distribution function

plots[["featureCorrEcdf"]] <- 
  ggplot(featureCorrDF, aes(x = Correlation, group = dataset, color = dataset)) + 
  stat_ecdf(size = 1.5, alpha = 0.5) + ylab("Cumulative proportion") + 
  xlab("Feature-feature correlation") + thm
plots[["featureCorrEcdf"]]

Pairwise comparisons

r I(defineTableDesc(calculateStatistics = calculateStatistics, subsampleSize = subsampleSize, kfrac = kfrac, kmin = kmin, obstype = "feature pair", aspect = "Spearman correlation", minvalue = min(featureCorrDF$Correlation, na.rm = TRUE), maxvalue = max(featureCorrDF$Correlation, na.rm = TRUE), permutationPvalues = permutationPvalues, nPermutations = nPermutations, nDatasets = nDatasets)$tabledesc)

if (calculateStatistics) {
  makeDF(df = featureCorrDF, column = "Correlation", 
         permutationPvalues = permutationPvalues, nPermutations = nPermutations, 
         subsampleSize = subsampleSize, kmin = kmin, kfrac = kfrac)
}

Library size vs fraction zeros {.tabset .tabset-pills}

if (!quiet) {
  message("Processing library size vs fraction zeros relationship.")
}

These scatter plots show the association between the total count (column sums) and the fraction of zeros observed per sample.

Separate scatter plots

plots[["libsizeFraczeroSepScatter"]] <- 
  ggplot(sampleDF, aes(x = Libsize, y = Fraczero)) + 
  geom_point(size = 1, alpha = 0.5) + 
  facet_wrap(~dataset, nrow = colRow[2]) + 
  xlab("Library size") + ylab("Fraction zeros") + thm
plots[["libsizeFraczeroSepScatter"]]

Overlaid scatter plots

plots[["libsizeFraczeroOverlaidScatter"]] <- 
  ggplot(sampleDF, aes(x = Libsize, y = Fraczero, color = dataset)) + 
  geom_point(size = 1, alpha = 0.5) + 
  geom_line(stat = "smooth", method = "loess", size = 1, alpha = 0.5) + 
  xlab("Library size") + ylab("Fraction zeros") + thm
plots[["libsizeFraczeroOverlaidScatter"]]

Pairwise comparisons

r I(defineTableDesc(calculateStatistics = calculateStatistics, subsampleSize = subsampleSize, kfrac = kfrac, kmin = kmin, obstype = "sample", aspect = "library size and fraction of zeros", minvalue = 0, maxvalue = 1, permutationPvalues = permutationPvalues, nPermutations = nPermutations, nDatasets = nDatasets)$tabledesc2d)

if (calculateStatistics) {
  makeDF(df = sampleDF, column = c("Libsize", "Fraczero"), 
         permutationPvalues = permutationPvalues, nPermutations = nPermutations, 
         subsampleSize = subsampleSize, kmin = kmin, kfrac = kfrac)
}

Mean expression vs fraction zeros {.tabset .tabset-pills}

if (!quiet) {
  message("Processing expression vs fraction zeros relationship.")
}

These scatter plots show the association between the average abundance and the fraction of zeros observed per feature. The abundance is defined as the log(CPM) values as calculated by edgeR.

Separate scatter plots

plots[["logCPMFraczeroSepScatter"]] <- 
  ggplot(featureDF, aes(x = AveLogCPM, y = Fraczero)) + 
  geom_point(size = 0.75, alpha = 0.5) + 
  facet_wrap(~dataset, nrow = colRow[2]) + 
  xlab("Average log CPM") + ylab("Fraction zeros") + thm
plots[["logCPMFraczeroSepScatter"]]

Overlaid scatter plots

plots[["logCPMFraczeroOverlaidScatter"]] <- 
  ggplot(featureDF, aes(x = AveLogCPM, y = Fraczero, color = dataset)) + 
  geom_point(size = 0.75, alpha = 0.5) + 
  xlab("Average log CPM") + ylab("Fraction zeros") + thm
plots[["logCPMFraczeroOverlaidScatter"]]

Pairwise comparisons

r I(defineTableDesc(calculateStatistics = calculateStatistics, subsampleSize = subsampleSize, kfrac = kfrac, kmin = kmin, obstype = "feature", aspect = "average log CPM and fraction of zeros", minvalue = 0, maxvalue = 1, permutationPvalues = permutationPvalues, nPermutations = nPermutations, nDatasets = nDatasets)$tabledesc2d)

if (calculateStatistics) {
  makeDF(df = featureDF, column = c("AveLogCPM", "Fraczero"), 
         permutationPvalues = permutationPvalues, nPermutations = nPermutations, 
         subsampleSize = subsampleSize, kmin = kmin, kfrac = kfrac)
}

if (savePlots) {
  if (!quiet) {
    message("Saving plots.")
  }
  saveRDS(plots, 
          file = paste0(outputDir, "/",
                        tools::file_path_sans_ext(basename(outputFile)), "_ggplots.rds"))
}

Session info

sessionInfo()

References

csoneson/countsimQC documentation built on June 14, 2025, 12:21 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

csoneson/countsimQC Compare Characteristic Features of Count Data Sets

In csoneson/countsimQC: Compare Characteristic Features of Count Data Sets

Description

Design formulae

Data set dimensions {.tabset .tabset-pills}

Number of samples (columns)

Number of features (rows)

Dispersion/BCV plots {.tabset .tabset-pills}

edgeR

Pairwise comparisons - edgeR

DESeq2

Pairwise comparisons - DESeq2

Mean-variance plots {.tabset .tabset-pills}

Separate scatter plots

Overlaid scatter plots

Pairwise comparisons

Library sizes {.tabset .tabset-pills}

Separate histograms

Overlaid histograms

Density plots

Density plots (filled)

Box plots

Violin plots

Empirical cumulative distribution function

Pairwise comparisons

TMM normalization factors {.tabset .tabset-pills}

Separate histograms

Overlaid histograms

Density plots

Density plots (filled)

Box plots

Violin plots

Empirical cumulative distribution function

Pairwise comparisons

Effective library sizes {.tabset .tabset-pills}

Separate histograms

Overlaid histograms

Density plots

Density plots (filled)

Box plots

Violin plots

Empirical cumulative distribution function

Pairwise comparisons

Expression distributions (average log CPM) {.tabset .tabset-pills}

Separate histograms

Overlaid histograms

Density plots

Density plots (filled)

Box plots

Violin plots

Empirical cumulative distribution function

Pairwise comparisons

Fraction zeros per sample {.tabset .tabset-pills}

Separate histograms

Overlaid histograms

Density plots

Density plots (filled)

Box plots

Violin plots

Empirical cumulative distribution function

Pairwise comparisons

Fraction zeros per feature {.tabset .tabset-pills}

Separate histograms

Overlaid histograms

Density plots

Density plots (filled)

Box plots

Violin plots

Empirical cumulative distribution function

Pairwise comparisons

Sample-sample correlations {.tabset .tabset-pills}

Separate histograms

Overlaid histograms

Density plots

Density plots (filled)

Box plots

Violin plots

Empirical cumulative distribution function

Pairwise comparisons

Feature-feature correlations {.tabset .tabset-pills}

csoneson/countsimQC
Compare Characteristic Features of Count Data Sets