knitr::opts_chunk$set(echo = showCode, warning = FALSE, message = FALSE) if (useRAGG) { knitr::opts_chunk$set(dev = "ragg_png", dpi = dpi) } # knitr::opts_chunk$set(dev.args = list(png = list(type = "cairo"))) knitr::opts_knit$set(progress = FALSE, verbose = FALSE)
## Get the number of data sets nDatasets <- length(ddsList) ## Determine the number of rows and columns in facetted plots ## (first element = columns, second element = rows) colRow <- grDevices::n2mfrow(length(ddsList)) ## Set height/width of each panel in multi-panel plots panelSize <- 4 ## Define out.width and out.height for the different plot categories scalefactor <- 96 ## barplots, 10x7in fw1 <- 10 fh1 <- 7 if (useRAGG) { ow1 <- paste0(fw1 * scalefactor, "px") oh1 <- NULL } else { ow1 <- oh1 <- NULL } ## multipanel, panelSize * colRow[1] x panelSize * colRow[2] fw2 <- panelSize * colRow[1] fh2 <- panelSize * colRow[2] if (useRAGG) { ow2 <- paste0(fw2 * scalefactor, "px") oh2 <- NULL } else { ow2 <- oh2 <- NULL } ## scatter plots etc, 7x5in fw3 <- 7 fh3 <- 5 if (useRAGG) { ow3 <- paste0(fw3 * scalefactor, "px") oh3 <- NULL } else { ow3 <- oh3 <- NULL } ## Initialize plot list plots <- list() ## Define plot theme thm <- theme_bw() + theme(axis.text = element_text(size = 15), axis.title = element_text(size = 14), strip.text = element_text(size = 15)) ## ----------------------- Calculate dispersions ----------------------- ## if (!quiet) { message("Calculating dispersions. This can take some time, please be patient.") } if (is.finite(maxNForDisp)) { msg <- paste0("If there are more than ", maxNForDisp, " samples in a data set, the dispersions are calculated using ", maxNForDisp, " randomly selected samples.") } else { msg <- "" } obj <- calculateDispersionsddsList(ddsList = ddsList, maxNForDisp = maxNForDisp) ## --------------- Calculate between-sample correlations --------------- ## if (!quiet) { message("Calculating sample correlations.") } sampleCorrDF <- calculateSampleCorrs(ddsList = obj, maxNForCorr = maxNForCorr) ## -------------- Calculate between-feature correlations --------------- ## if (!quiet) { message("Calculating feature correlations.") } featureCorrDF <- calculateFeatureCorrs(ddsList = obj, maxNForCorr = maxNForCorr) ## ----------------- Summarize sample characteristics ------------------ ## if (!quiet) { message("Summarizing calculated values.") } sampleDF <- lapply(obj, function(x) { data.frame( Libsize = colSums(x$dge$counts), Fraczero = colMeans(x$dge$counts == 0), TMM = x$dge$samples$norm.factors ) %>% dplyr::mutate(EffLibsize = Libsize * TMM) }) ns <- sapply(sampleDF, nrow) sampleDF <- do.call(rbind, sampleDF) %>% dplyr::mutate(dataset = rep(names(sampleDF), ns)) ## ---------------- Summarize feature characteristics ------------------ ## featureDF <- lapply(obj, function(x) { data.frame( Tagwise = sqrt(x$dge$tagwise.dispersion), Common = sqrt(x$dge$common.dispersion), Trend = sqrt(x$dge$trended.dispersion), AveLogCPM = x$dge$AveLogCPM, AveLogCPMDisp = x$dge$AveLogCPMDisp, average_log2_cpm = apply(edgeR::cpm(x$dge, prior.count = 2, log = TRUE), 1, mean), variance_log2_cpm = apply(edgeR::cpm(x$dge, prior.count = 2, log = TRUE), 1, var), Fraczero = rowMeans(x$dge$counts == 0), dispGeneEst = rowData(x$dds)$dispGeneEst, dispFit = rowData(x$dds)$dispFit, dispFinal = rowData(x$dds)$dispersion, baseMeanDisp = rowData(x$dds)$baseMeanDisp, baseMean = rowData(x$dds)$baseMean ) }) ns <- sapply(featureDF, nrow) featureDF <- do.call(rbind, featureDF) %>% dplyr::mutate(dataset = rep(names(featureDF), ns)) ## ----------------- Summarize data set characteristics ---------------- ## datasetDF <- do.call(rbind, lapply(obj, function(x) { data.frame( prior_df = paste0("prior.df = ", round(x$dge$prior.df, 2)), nVars = nrow(x$dge$counts), nSamples = ncol(x$dge$counts) ) })) %>% dplyr::mutate(AveLogCPMDisp = 0.8 * max(featureDF$AveLogCPMDisp)) %>% dplyr::mutate(Tagwise = 0.9 * max(featureDF$Tagwise)) %>% dplyr::mutate(dataset = names(obj))
r I(description)
The following model formulae were used in the dispersion calculations for the
different data sets. Note that if a count matrix or data frame was provided in
place of a DESeqDataSet for some data set, the corresponding design formula is
set to ~1
, thus assuming that all samples are to be treated as replicates.
## Print design information for (ds in names(ddsList)) { cat(ds, ": ", as.character(DESeq2::design(ddsList[[ds]])), "\n") }
if (!quiet) { message("Processing data set dimensions.") }
These bar plots show the number of samples (columns) and features (rows) in each data set.
plots[["nSamples"]] <- ggplot(datasetDF, aes(x = dataset, y = nSamples, fill = dataset)) + geom_bar(stat = "identity", alpha = 0.5) + xlab("") + ylab("Number of samples (columns)") + thm + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) plots[["nSamples"]]
plots[["nVariables"]] <- ggplot(datasetDF, aes(x = dataset, y = nVars, fill = dataset)) + geom_bar(stat = "identity", alpha = 0.5) + xlab("") + ylab("Number of features (rows)") + thm + theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5)) plots[["nVariables"]]
if (!quiet) { message("Processing dispersion estimates.") }
Disperson/BCV plots show the association between the average abundance and the
dispersion or "biological coefficient of variation" (sqrt(dispersion)), as
calculated by
edgeR
[@Robinson2010edgeR] and
DESeq2
[@Love2014DESeq2]. In the edgeR
plot, the estimate of the prior degrees of
freedom is indicated. r I(msg)
The black dots represent the tagwise dispersion estimates, the red line the
common dispersion and the blue curve represents the trended dispersion
estimates. For further information about the dispersion estimation in edgeR
,
see @Chen2014Dispersion.
plots[["BCVedgeR"]] <- ggplot(featureDF %>% dplyr::arrange(AveLogCPMDisp), aes(x = AveLogCPMDisp, y = Tagwise)) + geom_point(size = 0.25, alpha = 0.5) + facet_wrap(~dataset, nrow = colRow[2]) + geom_line(aes(y = Trend), color = "blue", size = 1.5) + geom_line(aes(y = Common), color = "red", size = 1.5) + geom_text(data = datasetDF, aes(label = prior_df)) + xlab("Average log CPM") + ylab("Biological coefficient of variation") + thm plots[["BCVedgeR"]]
r I(defineTableDesc(calculateStatistics = calculateStatistics, subsampleSize = subsampleSize, kfrac = kfrac, kmin = kmin, obstype = "feature", aspect = "average log CPM and tagwise dispersion", minvalue = 0, maxvalue = 1, permutationPvalues = permutationPvalues, nPermutations = nPermutations, nDatasets = nDatasets)$tabledesc2d)
if (calculateStatistics) { makeDF(df = featureDF, column = c("AveLogCPMDisp", "Tagwise"), permutationPvalues = permutationPvalues, nPermutations = nPermutations, subsampleSize = subsampleSize, kmin = kmin, kfrac = kfrac) }
The black dots are the gene-wise dispersion estimates, the red curve the fitted
mean-dispersion relationship and the blue circles represent the final dispersion
estimates.For further information about the dispersion estimation in DESeq2
,
see @Love2014DESeq2.
plots[["dispersionDESeq2"]] <- ggplot(featureDF %>% dplyr::arrange(baseMeanDisp), aes(x = baseMeanDisp, y = dispGeneEst)) + geom_point(size = 0.25, alpha = 0.5) + facet_wrap(~dataset, nrow = colRow[2]) + scale_x_log10() + scale_y_log10() + geom_point(aes(y = dispFinal), color = "lightblue", shape = 21) + geom_line(aes(y = dispFit), color = "red", size = 1.5) + xlab("Base mean") + ylab("Dispersion") + thm plots[["dispersionDESeq2"]]
r I(defineTableDesc(calculateStatistics = calculateStatistics, subsampleSize = subsampleSize, kfrac = kfrac, kmin = kmin, obstype = "feature", aspect = "base mean and genewise dispersion", minvalue = 0, maxvalue = 1, permutationPvalues = permutationPvalues, nPermutations = nPermutations, nDatasets = nDatasets)$tabledesc2d)
if (calculateStatistics) { makeDF(df = featureDF, column = c("baseMeanDisp", "dispGeneEst"), permutationPvalues = permutationPvalues, nPermutations = nPermutations, subsampleSize = subsampleSize, kmin = kmin, kfrac = kfrac) }
if (!quiet) { message("Processing mean-variance relationship.") }
These scatter plots show the relation between the empirical mean and variance of
the features. The difference between these mean-variance plots and the
mean-dispersion plots above is that the plots in this section do not take the
information about the experimental design and sample grouping into account, but
simply display the mean and variance of log2(CPM) estimates across all samples,
calculated using the cpm
function from
edgeR
[@Robinson2010edgeR], with a prior count of 2.
plots[["meanVarSepScatter"]] <- ggplot(featureDF, aes(x = average_log2_cpm, y = variance_log2_cpm)) + geom_point(size = 0.75, alpha = 0.5) + facet_wrap(~dataset, nrow = colRow[2]) + xlab("Mean of log2(CPM)") + ylab("Variance of log2(CPM)") + thm plots[["meanVarSepScatter"]]
plots[["meanVarOverlaidScatter"]] <- ggplot(featureDF, aes(x = average_log2_cpm, y = variance_log2_cpm, color = dataset)) + geom_point(size = 0.75, alpha = 0.5) + xlab("Mean of log2(CPM)") + ylab("Variance of log2(CPM)") + thm plots[["meanVarOverlaidScatter"]]
r I(defineTableDesc(calculateStatistics = calculateStatistics, subsampleSize = subsampleSize, kfrac = kfrac, kmin = kmin, obstype = "feature", aspect = "average and variance of the log CPM", minvalue = 0, maxvalue = 1, permutationPvalues = permutationPvalues, nPermutations = nPermutations, nDatasets = nDatasets)$tabledesc2d)
if (calculateStatistics) { makeDF(df = featureDF, column = c("average_log2_cpm", "variance_log2_cpm"), permutationPvalues = permutationPvalues, nPermutations = nPermutations, subsampleSize = subsampleSize, kmin = kmin, kfrac = kfrac) }
if (!quiet) { message("Processing library sizes.") }
These plots illustrate the distribution of the total read count per sample, i.e., the column sums of the respective data matrices.
plots[["libsizeSepHist"]] <- ggplot(sampleDF, aes(x = Libsize)) + geom_histogram(bins = 30) + facet_wrap(~dataset, nrow = colRow[2]) + xlab("Library size") + thm plots[["libsizeSepHist"]]
plots[["libsizeOverlaidHist"]] <- ggplot(sampleDF, aes(x = Libsize, group = dataset, fill = dataset)) + geom_histogram(bins = 30, alpha = 0.5, position = "identity") + xlab("Library size") + thm plots[["libsizeOverlaidHist"]]
plots[["libsizeDensity"]] <- ggplot(sampleDF, aes(x = Libsize, group = dataset, color = dataset)) + geom_line(alpha = 0.5, size = 1.5, stat = "density") + xlab("Library size") + thm plots[["libsizeDensity"]]
plots[["libsizeDensityFilled"]] <- ggplot(sampleDF, aes(x = Libsize, group = dataset, fill = dataset)) + geom_density(alpha = 0.5) + xlab("Library size") + thm plots[["libsizeDensityFilled"]]
plots[["libsizeBox"]] <- ggplot(sampleDF, aes(x = dataset, y = Libsize, color = dataset)) + geom_boxplot(outlier.size = -1, alpha = 0.5) + geom_point(position = position_jitter(width = 0.2), size = 2, alpha = 0.5) + ylab("Library size") + xlab("") + thm + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) plots[["libsizeBox"]]
plots[["libsizeViolin"]] <- ggplot(sampleDF, aes(x = dataset, y = Libsize, fill = dataset)) + geom_violin(alpha = 0.5) + ylab("Library size") + xlab("") + thm + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) plots[["libsizeViolin"]]
plots[["libsizeEcdf"]] <- ggplot(sampleDF, aes(x = Libsize, group = dataset, color = dataset)) + stat_ecdf(size = 1.5, alpha = 0.5) + ylab("Cumulative proportion") + xlab("Library size") + thm plots[["libsizeEcdf"]]
r I(defineTableDesc(calculateStatistics = calculateStatistics, subsampleSize = subsampleSize, kfrac = kfrac, kmin = kmin, obstype = "sample", aspect = "library size", minvalue = min(sampleDF$Libsize, na.rm = TRUE), maxvalue = max(sampleDF$Libsize, na.rm = TRUE), permutationPvalues = permutationPvalues, nPermutations = nPermutations, nDatasets = nDatasets)$tabledesc)
if (calculateStatistics) { makeDF(df = sampleDF, column = "Libsize", permutationPvalues = permutationPvalues, nPermutations = nPermutations, subsampleSize = subsampleSize, kmin = kmin, kfrac = kfrac) }
if (!quiet) { message("Processing TMM normalization factors.") }
The plots below show the distribution of the TMM normalization factors
[@Robinson2010TMM], intended to adjust for differences in RNA composition, as
calculated by
edgeR
[@Robinson2010edgeR].
plots[["tmmSepHist"]] <- ggplot(sampleDF, aes(x = TMM)) + geom_histogram(bins = 30) + facet_wrap(~dataset, nrow = colRow[2]) + xlab("TMM normalization factor") + thm plots[["tmmSepHist"]]
plots[["tmmOverlaidHist"]] <- ggplot(sampleDF, aes(x = TMM, group = dataset, fill = dataset)) + geom_histogram(bins = 30, alpha = 0.5, position = "identity") + xlab("TMM normalization factor") + thm plots[["tmmOverlaidHist"]]
plots[["tmmDensity"]] <- ggplot(sampleDF, aes(x = TMM, group = dataset, color = dataset)) + geom_line(alpha = 0.5, size = 1.5, stat = "density") + xlab("TMM normalization factor") + thm plots[["tmmDensity"]]
plots[["tmmDensityFilled"]] <- ggplot(sampleDF, aes(x = TMM, group = dataset, fill = dataset)) + geom_density(alpha = 0.5) + xlab("TMM normalization factor") + thm plots[["tmmDensityFilled"]]
plots[["tmmBox"]] <- ggplot(sampleDF, aes(x = dataset, y = TMM, color = dataset)) + geom_boxplot(outlier.size = -1, alpha = 0.5) + geom_point(position = position_jitter(width = 0.2), size = 2, alpha = 0.5) + ylab("TMM normalization factor") + xlab("") + thm + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) plots[["tmmBox"]]
plots[["tmmViolin"]] <- ggplot(sampleDF, aes(x = dataset, y = TMM, fill = dataset)) + geom_violin(alpha = 0.5) + ylab("TMM normalization factor") + xlab("") + thm + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) plots[["tmmViolin"]]
plots[["tmmEcdf"]] <- ggplot(sampleDF, aes(x = TMM, group = dataset, color = dataset)) + stat_ecdf(size = 1.5, alpha = 0.5) + ylab("Cumulative proportion") + xlab("TMM normalization factor") + thm plots[["tmmEcdf"]]
r I(defineTableDesc(calculateStatistics = calculateStatistics, subsampleSize = subsampleSize, kfrac = kfrac, kmin = kmin, obstype = "sample", aspect = "TMM factor", minvalue = min(sampleDF$TMM, na.rm = TRUE), maxvalue = max(sampleDF$TMM, na.rm = TRUE), permutationPvalues = permutationPvalues, nPermutations = nPermutations, nDatasets = nDatasets)$tabledesc)
if (calculateStatistics) { makeDF(df = sampleDF, column = "TMM", permutationPvalues = permutationPvalues, nPermutations = nPermutations, subsampleSize = subsampleSize, kmin = kmin, kfrac = kfrac) }
if (!quiet) { message("Processing effective library sizes.") }
These plots show the distribution of the "effective library sizes", defined as the total count per sample multiplied by the corresponding TMM normalization factor.
plots[["effLibsizeSepHist"]] <- ggplot(sampleDF, aes(x = EffLibsize)) + geom_histogram(bins = 30) + facet_wrap(~dataset, nrow = colRow[2]) + xlab("Effective library size") + thm plots[["effLibsizeSepHist"]]
plots[["effLibsizeOverlaidHist"]] <- ggplot(sampleDF, aes(x = EffLibsize, group = dataset, fill = dataset)) + geom_histogram(bins = 30, alpha = 0.5, position = "identity") + xlab("Effective library size") + thm plots[["effLibsizeOverlaidHist"]]
plots[["effLibsizeDensity"]] <- ggplot(sampleDF, aes(x = EffLibsize, group = dataset, color = dataset)) + geom_line(alpha = 0.5, size = 1.5, stat = "density") + xlab("Effective library size") + thm plots[["effLibsizeDensity"]]
plots[["effLibsizeDensityFilled"]] <- ggplot(sampleDF, aes(x = EffLibsize, group = dataset, fill = dataset)) + geom_density(alpha = 0.5) + xlab("Effective library size") + thm plots[["effLibsizeDensityFilled"]]
plots[["effLibsizeBox"]] <- ggplot(sampleDF, aes(x = dataset, y = EffLibsize, color = dataset)) + geom_boxplot(outlier.size = -1, alpha = 0.5) + geom_point(position = position_jitter(width = 0.2), size = 2, alpha = 0.5) + ylab("Effective library size") + xlab("") + thm + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) plots[["effLibsizeBox"]]
plots[["effLibsizeViolin"]] <- ggplot(sampleDF, aes(x = dataset, y = EffLibsize, fill = dataset)) + geom_violin(alpha = 0.5) + ylab("Effective library size") + xlab("") + thm + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) plots[["effLibsizeViolin"]]
plots[["effLibsizeEcdf"]] <- ggplot(sampleDF, aes(x = EffLibsize, group = dataset, color = dataset)) + stat_ecdf(size = 1.5, alpha = 0.5) + ylab("Cumulative proportion") + xlab("Effective library size") + thm plots[["effLibsizeEcdf"]]
r I(defineTableDesc(calculateStatistics = calculateStatistics, subsampleSize = subsampleSize, kfrac = kfrac, kmin = kmin, obstype = "sample", aspect = "effective library size", minvalue = min(sampleDF$EffLibsize, na.rm = TRUE), maxvalue = max(sampleDF$EffLibsize, na.rm = TRUE), permutationPvalues = permutationPvalues, nPermutations = nPermutations, nDatasets = nDatasets)$tabledesc)
if (calculateStatistics) { makeDF(df = sampleDF, column = "EffLibsize", permutationPvalues = permutationPvalues, nPermutations = nPermutations, subsampleSize = subsampleSize, kmin = kmin, kfrac = kfrac) }
if (!quiet) { message("Processing expression distributions.") }
The plots in this section show the distribution of average abundance values for
the features. The abundances are log CPM values calculated by edgeR
.
plots[["logCPMSepHist"]] <- ggplot(featureDF, aes(x = AveLogCPM)) + geom_histogram(bins = 30) + facet_wrap(~dataset, nrow = colRow[2]) + xlab("Average log CPM") + thm plots[["logCPMSepHist"]]
plots[["logCPMOverlaidHist"]] <- ggplot(featureDF, aes(x = AveLogCPM, group = dataset, fill = dataset)) + geom_histogram(bins = 30, alpha = 0.5, position = "identity") + xlab("Average log CPM") + thm plots[["logCPMOverlaidHist"]]
plots[["logCPMDensity"]] <- ggplot(featureDF, aes(x = AveLogCPM, group = dataset, color = dataset)) + geom_line(alpha = 0.5, size = 1.5, stat = "density") + xlab("Average log CPM") + thm plots[["logCPMDensity"]]
plots[["logCPMDensityFilled"]] <- ggplot(featureDF, aes(x = AveLogCPM, group = dataset, fill = dataset)) + geom_density(alpha = 0.5) + xlab("Average log CPM") + thm plots[["logCPMDensityFilled"]]
plots[["logCPMBox"]] <- ggplot(featureDF, aes(x = dataset, y = AveLogCPM, color = dataset)) + geom_boxplot(outlier.size = -1, alpha = 0.5) + geom_point(position = position_jitter(width = 0.2), size = 2, alpha = 0.5) + ylab("Average log CPM") + xlab("") + thm + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) plots[["logCPMBox"]]
plots[["logCPMViolin"]] <- ggplot(featureDF, aes(x = dataset, y = AveLogCPM, fill = dataset)) + geom_violin(alpha = 0.5) + ylab("Average log CPM") + xlab("") + thm + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) plots[["logCPMViolin"]]
plots[["logCPMEcdf"]] <- ggplot(featureDF, aes(x = AveLogCPM, group = dataset, color = dataset)) + stat_ecdf(size = 1.5, alpha = 0.5) + ylab("Cumulative proportion") + xlab("Average log CPM") + thm plots[["logCPMEcdf"]]
r I(defineTableDesc(calculateStatistics = calculateStatistics, subsampleSize = subsampleSize, kfrac = kfrac, kmin = kmin, obstype = "feature", aspect = "average log CPM", minvalue = min(featureDF$AveLogCPM, na.rm = TRUE), maxvalue = max(featureDF$AveLogCPM, na.rm = TRUE), permutationPvalues = permutationPvalues, nPermutations = nPermutations, nDatasets = nDatasets)$tabledesc)
if (calculateStatistics) { makeDF(df = featureDF, column = "AveLogCPM", permutationPvalues = permutationPvalues, nPermutations = nPermutations, subsampleSize = subsampleSize, kmin = kmin, kfrac = kfrac) }
if (!quiet) { message("Processing fraction zeros per sample.") }
These plots show the distribution of the fraction of zeros observed per sample (column) in the count matrices.
plots[["fraczeroSampleSepHist"]] <- ggplot(sampleDF, aes(x = Fraczero)) + geom_histogram(bins = 30) + facet_wrap(~dataset, nrow = colRow[2]) + xlab("Fraction zeros per sample") + thm plots[["fraczeroSampleSepHist"]]
plots[["fraczeroSampleOverlaidHist"]] <- ggplot(sampleDF, aes(x = Fraczero, group = dataset, fill = dataset)) + geom_histogram(bins = 30, alpha = 0.5, position = "identity") + xlab("Fraction zeros per sample") + thm plots[["fraczeroSampleOverlaidHist"]]
plots[["fraczeroSampleDensity"]] <- ggplot(sampleDF, aes(x = Fraczero, group = dataset, color = dataset)) + geom_line(alpha = 0.5, size = 1.5, stat = "density") + xlab("Fraction zeros per sample") + thm plots[["fraczeroSampleDensity"]]
plots[["fraczeroSampleDensityFilled"]] <- ggplot(sampleDF, aes(x = Fraczero, group = dataset, fill = dataset)) + geom_density(alpha = 0.5) + xlab("Fraction zeros per sample") + thm plots[["fraczeroSampleDensityFilled"]]
plots[["fraczeroSampleBox"]] <- ggplot(sampleDF, aes(x = dataset, y = Fraczero, color = dataset)) + geom_boxplot(outlier.size = -1, alpha = 0.5) + geom_point(position = position_jitter(width = 0.2), size = 2, alpha = 0.5) + ylab("Fraction zeros per sample") + xlab("") + thm + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) plots[["fraczeroSampleBox"]]
plots[["fraczeroSampleViolin"]] <- ggplot(sampleDF, aes(x = dataset, y = Fraczero, fill = dataset)) + geom_violin(alpha = 0.5) + ylab("Fraction zeros per sample") + xlab("") + thm + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) plots[["fraczeroSampleViolin"]]
plots[["fraczeroSampleEcdf"]] <- ggplot(sampleDF, aes(x = Fraczero, group = dataset, color = dataset)) + stat_ecdf(size = 1.5, alpha = 0.5) + ylab("Cumulative proportion") + xlab("Fraction zeros per sample") + thm plots[["fraczeroSampleEcdf"]]
r I(defineTableDesc(calculateStatistics = calculateStatistics, subsampleSize = subsampleSize, kfrac = kfrac, kmin = kmin, obstype = "sample", aspect = "fraction zeros", minvalue = min(sampleDF$Fraczero, na.rm = TRUE), maxvalue = max(sampleDF$Fraczero, na.rm = TRUE), permutationPvalues = permutationPvalues, nPermutations = nPermutations, nDatasets = nDatasets)$tabledesc)
if (calculateStatistics) { makeDF(df = sampleDF, column = "Fraczero", permutationPvalues = permutationPvalues, nPermutations = nPermutations, subsampleSize = subsampleSize, kmin = kmin, kfrac = kfrac) }
if (!quiet) { message("Processing fraction zeros per feature.") }
These plots illustrate the distribution of the fraction of zeros observed per feature (row) in the count matrices.
plots[["fraczeroFeatureSepHist"]] <- ggplot(featureDF, aes(x = Fraczero)) + geom_histogram(bins = 30) + facet_wrap(~dataset, nrow = colRow[2]) + xlab("Fraction zeros per feature") + thm plots[["fraczeroFeatureSepHist"]]
plots[["fraczeroFeatureOverlaidHist"]] <- ggplot(featureDF, aes(x = Fraczero, group = dataset, fill = dataset)) + geom_histogram(bins = 30, alpha = 0.5, position = "identity") + xlab("Fraction zeros per feature") + thm plots[["fraczeroFeatureOverlaidHist"]]
plots[["fraczeroFeatureDensity"]] <- ggplot(featureDF, aes(x = Fraczero, group = dataset, color = dataset)) + geom_line(alpha = 0.5, size = 1.5, stat = "density") + xlab("Fraction zeros per feature") + thm plots[["fraczeroFeatureDensity"]]
plots[["fraczeroFeatureDensityFilled"]] <- ggplot(featureDF, aes(x = Fraczero, group = dataset, fill = dataset)) + geom_density(alpha = 0.5) + xlab("Fraction zeros per feature") + thm plots[["fraczeroFeatureDensityFilled"]]
plots[["fraczeroFeatureBox"]] <- ggplot(featureDF, aes(x = dataset, y = Fraczero, color = dataset)) + geom_boxplot(outlier.size = -1, alpha = 0.5) + geom_point(position = position_jitter(width = 0.2), size = 2, alpha = 0.5) + ylab("Fraction zeros per feature") + xlab("") + thm + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) plots[["fraczeroFeatureBox"]]
plots[["fraczeroFeatureViolin"]] <- ggplot(featureDF, aes(x = dataset, y = Fraczero, fill = dataset)) + geom_violin(alpha = 0.5) + ylab("Fraction zeros per feature") + xlab("") + thm + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) plots[["fraczeroFeatureViolin"]]
plots[["fraczeroFeatureEcdf"]] <- ggplot(featureDF, aes(x = Fraczero, group = dataset, color = dataset)) + stat_ecdf(size = 1.5, alpha = 0.5) + ylab("Cumulative proportion") + xlab("Fraction zeros per feature") + thm plots[["fraczeroFeatureEcdf"]]
r I(defineTableDesc(calculateStatistics = calculateStatistics, subsampleSize = subsampleSize, kfrac = kfrac, kmin = kmin, obstype = "feature", aspect = "fraction zeros", minvalue = min(featureDF$Fraczero, na.rm = TRUE), maxvalue = max(featureDF$Fraczero, na.rm = TRUE), permutationPvalues = permutationPvalues, nPermutations = nPermutations, nDatasets = nDatasets)$tabledesc)
if (calculateStatistics) { makeDF(df = featureDF, column = "Fraczero", permutationPvalues = permutationPvalues, nPermutations = nPermutations, subsampleSize = subsampleSize, kmin = kmin, kfrac = kfrac) }
if (!quiet) { message("Processing sample-sample correlations.") }
The plots below show the distribution of Spearman correlation coefficients for
pairs of samples, calculated from the log(CPM) values obtained via the cpm
function from edgeR
, with a prior.count of 2. If there are more than
r maxNForCorr
samples in a data set, the pairwise correlations between
r maxNForCorr
randomly selected samples are shown.
plots[["sampleCorrSepHist"]] <- ggplot(sampleCorrDF, aes(x = Correlation)) + geom_histogram(bins = 30) + facet_wrap(~dataset, nrow = colRow[2]) + xlab("Sample-sample correlation") + thm plots[["sampleCorrSepHist"]]
plots[["sampleCorrOverlaidHist"]] <- ggplot(sampleCorrDF, aes(x = Correlation, group = dataset, fill = dataset)) + geom_histogram(bins = 30, alpha = 0.5, position = "identity") + xlab("Sample-sample correlation") + thm plots[["sampleCorrOverlaidHist"]]
plots[["sampleCorrDensity"]] <- ggplot(sampleCorrDF, aes(x = Correlation, group = dataset, color = dataset)) + geom_line(alpha = 0.5, size = 1.5, stat = "density") + xlab("Sample-sample correlation") + thm plots[["sampleCorrDensity"]]
plots[["sampleCorrDensityFilled"]] <- ggplot(sampleCorrDF, aes(x = Correlation, group = dataset, fill = dataset)) + geom_density(alpha = 0.5) + xlab("Sample-sample correlation") + thm plots[["sampleCorrDensityFilled"]]
plots[["sampleCorrBox"]] <- ggplot(sampleCorrDF, aes(x = dataset, y = Correlation, color = dataset)) + geom_boxplot(outlier.size = -1, alpha = 0.5) + geom_point(position = position_jitter(width = 0.2), size = 2, alpha = 0.5) + ylab("Sample-sample correlation") + xlab("") + thm + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) plots[["sampleCorrBox"]]
plots[["sampleCorrViolin"]] <- ggplot(sampleCorrDF, aes(x = dataset, y = Correlation, fill = dataset)) + geom_violin(alpha = 0.5) + ylab("Sample-sample correlation") + xlab("") + thm + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) plots[["sampleCorrViolin"]]
plots[["sampleCorrEcdf"]] <- ggplot(sampleCorrDF, aes(x = Correlation, group = dataset, color = dataset)) + stat_ecdf(size = 1.5, alpha = 0.5) + ylab("Cumulative proportion") + xlab("Sample-sample correlation") + thm plots[["sampleCorrEcdf"]]
r I(defineTableDesc(calculateStatistics = calculateStatistics, subsampleSize = subsampleSize, kfrac = kfrac, kmin = kmin, obstype = "sample pair", aspect = "Spearman correlation", minvalue = min(sampleCorrDF$Correlation, na.rm = TRUE), maxvalue = max(sampleCorrDF$Correlation, na.rm = TRUE), permutationPvalues = permutationPvalues, nPermutations = nPermutations, nDatasets = nDatasets)$tabledesc)
if (calculateStatistics) { makeDF(df = sampleCorrDF, column = "Correlation", permutationPvalues = permutationPvalues, nPermutations = nPermutations, subsampleSize = subsampleSize, kmin = kmin, kfrac = kfrac) }
if (!quiet) { message("Processing feature-feature correlations.") }
These plots illustrate the distribution of Spearman correlation coefficients for
pairs of features, calculated from the log(CPM) values obtained via the cpm
function from edgeR
, with a prior.count of 2. Only non-constant features are
considered, and if there are more than r maxNForCorr
such features in a
data set, the pairwise correlations between r maxNForCorr
randomly selected
features are shown.
plots[["featureCorrSepHist"]] <- ggplot(featureCorrDF, aes(x = Correlation)) + geom_histogram(bins = 30) + facet_wrap(~dataset, nrow = colRow[2]) + xlab("Feature-feature correlation") + thm plots[["featureCorrSepHist"]]
plots[["featureCorrOverlaidHist"]] <- ggplot(featureCorrDF, aes(x = Correlation, group = dataset, fill = dataset)) + geom_histogram(bins = 30, alpha = 0.5, position = "identity") + xlab("Feature-feature correlation") + thm plots[["featureCorrOverlaidHist"]]
plots[["featureCorrDensity"]] <- ggplot(featureCorrDF, aes(x = Correlation, group = dataset, color = dataset)) + geom_line(alpha = 0.5, size = 1.5, stat = "density") + xlab("Feature-feature correlation") + thm plots[["featureCorrDensity"]]
plots[["featureCorrDensityFilled"]] <- ggplot(featureCorrDF, aes(x = Correlation, group = dataset, fill = dataset)) + geom_density(alpha = 0.5) + xlab("Feature-feature correlation") + thm plots[["featureCorrDensityFilled"]]
plots[["featureCorrBox"]] <- ggplot(featureCorrDF, aes(x = dataset, y = Correlation, color = dataset)) + geom_boxplot(outlier.size = -1, alpha = 0.5) + geom_point(position = position_jitter(width = 0.2), size = 2, alpha = 0.5) + ylab("Feature-feature correlation") + xlab("") + thm + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) plots[["featureCorrBox"]]
plots[["featureCorrViolin"]] <- ggplot(featureCorrDF, aes(x = dataset, y = Correlation, fill = dataset)) + geom_violin(alpha = 0.5) + ylab("Feature-feature correlation") + xlab("") + thm + theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) plots[["featureCorrViolin"]]
plots[["featureCorrEcdf"]] <- ggplot(featureCorrDF, aes(x = Correlation, group = dataset, color = dataset)) + stat_ecdf(size = 1.5, alpha = 0.5) + ylab("Cumulative proportion") + xlab("Feature-feature correlation") + thm plots[["featureCorrEcdf"]]
r I(defineTableDesc(calculateStatistics = calculateStatistics, subsampleSize = subsampleSize, kfrac = kfrac, kmin = kmin, obstype = "feature pair", aspect = "Spearman correlation", minvalue = min(featureCorrDF$Correlation, na.rm = TRUE), maxvalue = max(featureCorrDF$Correlation, na.rm = TRUE), permutationPvalues = permutationPvalues, nPermutations = nPermutations, nDatasets = nDatasets)$tabledesc)
if (calculateStatistics) { makeDF(df = featureCorrDF, column = "Correlation", permutationPvalues = permutationPvalues, nPermutations = nPermutations, subsampleSize = subsampleSize, kmin = kmin, kfrac = kfrac) }
if (!quiet) { message("Processing library size vs fraction zeros relationship.") }
These scatter plots show the association between the total count (column sums) and the fraction of zeros observed per sample.
plots[["libsizeFraczeroSepScatter"]] <- ggplot(sampleDF, aes(x = Libsize, y = Fraczero)) + geom_point(size = 1, alpha = 0.5) + facet_wrap(~dataset, nrow = colRow[2]) + xlab("Library size") + ylab("Fraction zeros") + thm plots[["libsizeFraczeroSepScatter"]]
plots[["libsizeFraczeroOverlaidScatter"]] <- ggplot(sampleDF, aes(x = Libsize, y = Fraczero, color = dataset)) + geom_point(size = 1, alpha = 0.5) + geom_line(stat = "smooth", method = "loess", size = 1, alpha = 0.5) + xlab("Library size") + ylab("Fraction zeros") + thm plots[["libsizeFraczeroOverlaidScatter"]]
r I(defineTableDesc(calculateStatistics = calculateStatistics, subsampleSize = subsampleSize, kfrac = kfrac, kmin = kmin, obstype = "sample", aspect = "library size and fraction of zeros", minvalue = 0, maxvalue = 1, permutationPvalues = permutationPvalues, nPermutations = nPermutations, nDatasets = nDatasets)$tabledesc2d)
if (calculateStatistics) { makeDF(df = sampleDF, column = c("Libsize", "Fraczero"), permutationPvalues = permutationPvalues, nPermutations = nPermutations, subsampleSize = subsampleSize, kmin = kmin, kfrac = kfrac) }
if (!quiet) { message("Processing expression vs fraction zeros relationship.") }
These scatter plots show the association between the average abundance and the
fraction of zeros observed per feature. The abundance is defined as the
log(CPM) values as calculated by edgeR
.
plots[["logCPMFraczeroSepScatter"]] <- ggplot(featureDF, aes(x = AveLogCPM, y = Fraczero)) + geom_point(size = 0.75, alpha = 0.5) + facet_wrap(~dataset, nrow = colRow[2]) + xlab("Average log CPM") + ylab("Fraction zeros") + thm plots[["logCPMFraczeroSepScatter"]]
plots[["logCPMFraczeroOverlaidScatter"]] <- ggplot(featureDF, aes(x = AveLogCPM, y = Fraczero, color = dataset)) + geom_point(size = 0.75, alpha = 0.5) + xlab("Average log CPM") + ylab("Fraction zeros") + thm plots[["logCPMFraczeroOverlaidScatter"]]
r I(defineTableDesc(calculateStatistics = calculateStatistics, subsampleSize = subsampleSize, kfrac = kfrac, kmin = kmin, obstype = "feature", aspect = "average log CPM and fraction of zeros", minvalue = 0, maxvalue = 1, permutationPvalues = permutationPvalues, nPermutations = nPermutations, nDatasets = nDatasets)$tabledesc2d)
if (calculateStatistics) { makeDF(df = featureDF, column = c("AveLogCPM", "Fraczero"), permutationPvalues = permutationPvalues, nPermutations = nPermutations, subsampleSize = subsampleSize, kmin = kmin, kfrac = kfrac) }
if (savePlots) { if (!quiet) { message("Saving plots.") } saveRDS(plots, file = paste0(outputDir, "/", tools::file_path_sans_ext(basename(outputFile)), "_ggplots.rds")) }
sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.