#' Summary of productivity
#'
#' @description Shows the percentage of 1. productivity, 2. non-functional +
#' reason for being unproductive,
#' i.e. "Stop Codon" or "Out of frame" or "Stop & Out"
#'
#' @import ggplot2
#'
#' @param dataframes list type. List of sample dataframes
#' @param sampleNames vector type. 1-1 with dataframes
#'
#' @return ggplot2 object
.productivityPlot <- function(dataframes, sampleNames) {
nsamples <- length(dataframes)
if (length(sampleNames) != nsamples) {
stop("Expected equal number of sample names and dataframes, got ",
length(sampleNames),
" samples and ", nsamples, " dataframes.")
}
message("Plotting productivity summary plot for ",
paste(sampleNames, collapse = ", "))
# ---- clean & pre-processing ----
unusedCols <- c("X")
# label each row with sample name and drop unused columns
for (i in seq_len(nsamples)) {
df <- dataframes[[i]]
df$sample <- sampleNames[i]
df[df == "Both"] <- "Stop & Out"
dataframes[[i]] <- df[, !(names(df) %in% unusedCols)]
}
# ---- done: clean & pre-processing ----
# merge all samples
df.union <- do.call("rbind", dataframes)
# plot!
g <- ggplot(df.union, aes(sample, Percentage)) +
geom_bar(stat = "identity", aes(fill = Reason), width = 0.5) +
facet_wrap(~Productivity) +
labs(
title = "Productivity proportions",
subtitle = "Percentage of unproductive reads due to stop codons and frameshifts",
y = "Percentage (%)"
) +
coord_cartesian(ylim = c(0, 100)) +
geom_text(position = position_stack(vjust = 0.5),
aes(label = sprintf("%0.2f%%", Percentage),
group = Reason)) +
theme(axis.text.x = element_text(angle = 55, hjust = 1),
legend.title = element_blank(), axis.title.x = element_blank())
return(g)
}
#' Plots a distribution plot for different productivity analysis files
#'
#' @description A wrapper for plotDist
#'
#' @include util.R
#' @include distributions.R
#' @import ggplot2
#'
#' @param productivityDirectories vector type.
#' directories where all productivity csv
#' files lives (usually <samplename>/productivity/)
#' @param sampleNames vector type.
#' @param title string type.
#' @param reg string type. Regular expression to find the right files
#' for this particular distribution plot
#' @param outputFileName string type.
#' Vector of file names to save in the order of \code{regions}
#' @param region string type. Most of the dist plots are
#' regional based. use "" if no regions are involved
#' @param .save logical type. Save Rdata?
#'
#' @return None
.prodDistPlot <-
function(productivityDirectories,
sampleNames, title, reg, outputFileName,
region, .save = TRUE) {
fs <- .listFilesInOrder(path = productivityDirectories,
pattern = paste0(".*", region, reg))
if (length(fs)) {
dataframes <- lapply(fs, read.csv,
stringsAsFactors = FALSE, skip = 1)
plotTitle <- paste(title, toupper(region), "in",
paste(sampleNames, collapse = ", "))
subtitle <-
paste("Total is" , paste(lapply(fs, function(x) {
as.integer(.getTotal(x))
}), collapse = ", "))
g <- .plotDist(dataframes,
sampleNames,
plotTitle,
.checkVert(fs[[1]]),
subs = subtitle)
ggsave(outputFileName, plot = g, width = V_WIDTH, height = V_HEIGHT)
.saveAs(.save, outputFileName, g)
}
}
#' Plots the error distribution for each region: CDRs, FRs, IGV, IGD, and IGJ
#'
#' @description Plots the distribution of indels (gaps), indels in
#' out-of-frame sequences, and the distribution of mismatches for CDRs,
#' FRs, IGV, IGD, and IGJ.
#'
#' @include util.R
#' @include distributions.R
#' @import ggplot2
#'
#' @param productivityDirectories list type. List of directories
#' @param prodOut string type. Output directory
#' @param sampleNames vector type. 1-1 with productivity directories
#' @param combinedNames string type. Title friendly "combined" sample names
#' @param mashedNames string type. File friendly "mashed-up" sample names
#' @param .save logical type. Save Rdata?
#'
#' @return None
.plotErrorDist <- function(productivityDirectories, prodOut,
sampleNames, combinedNames,
mashedNames, .save = TRUE) {
regions <-
c("cdr1", "cdr2", "cdr3", "fr1", "fr2", "fr3", "igv", "igd", "igj")
# gaps_dist plots only ---------------------------------------------------
message("Plotting indel distributions for ", combinedNames)
lapply(regions, function(region) {
.prodDistPlot(
productivityDirectories,
sampleNames,
"Gaps in",
"_gaps_dist\\.csv(\\.gz)?$",
file.path(prodOut, paste0(mashedNames, "_",
region,
"_gaps_dist.png")),
region, .save = .save)
})
# gaps_out_of_frame plots only (no igv, igd, ihj plots for this) ---------
subregions <- head(regions, n = 6)
message("Plotting indel out of frame distributions for ", combinedNames)
lapply(subregions, function(region) {
.prodDistPlot(
productivityDirectories,
sampleNames,
"Gaps in",
"_gaps_dist_out_of_frame\\.csv(\\.gz)?$",
file.path(prodOut, paste0(mashedNames, "_", region,
"_gaps_dist_out_of_frame.png")),
region, .save = .save)
})
# mismatch dist only -----------------------------------------------------
message("Plotting mismatches distributions for ", combinedNames)
lapply(regions, function(region) {
.prodDistPlot(
productivityDirectories,
sampleNames,
"Mismatches in",
"_mismatches_dist\\.csv(\\.gz)?$",
file.path(prodOut, paste0(mashedNames,
"_", region, "_mismatches_dist.png")),
region, .save = .save)
})
}
#' Plots the error distribution for IGV germlines
#'
#' @description Plots the distribution of in-frame unproductive,
#' out-of-frame unproductive, and productive IGV germlines.
#'
#' @include util.R
#' @include distributions.R
#' @import ggplot2
#'
#' @param productivityDirectories list type. List of directories
#' @param prodOut string type. Output directory
#' @param sampleNames vector type. 1-1 with productivity directories
#' @param combinedNames string type. Title friendly "combined" sample names
#' @param mashedNames string type. File friendly "mashed-up" sample names
#' @param .save logical type, save Rdata?
#'
#' @return None
.plotIGVErrors <- function(productivityDirectories, prodOut, sampleNames,
combinedNames, mashedNames, .save = TRUE) {
# in frame unprod ----------
message("Plotting inframe unproductive distributions for ", combinedNames)
.prodDistPlot(
productivityDirectories,
sampleNames,
"Abundance of In-frame Unproductive Clones in",
"_dist_inframe_unproductive\\.csv(\\.gz)?$",
file.path(prodOut, paste0(mashedNames,
"_igv_dist_inframe_unproductive.png")),
"igv", .save = .save)
# out of frame unprod --------
message("Plotting out of frame clones distributions for ", combinedNames)
.prodDistPlot(
productivityDirectories,
sampleNames,
"Abundance of Out-Of-Frame Clones in",
"_dist_out_of_frame\\.csv(\\.gz)?$",
file.path(prodOut, paste0(mashedNames, "_igv_dist_out_of_frame.png")),
"igv", .save = .save)
# productive ----------------
message("Plotting out IGV productive distributions for ", combinedNames)
.prodDistPlot(
productivityDirectories,
sampleNames,
"Abundance of Productive Clones in",
"_dist_productive\\.csv(\\.gz)?$",
file.path(prodOut, paste0(mashedNames, "_igv_dist_productive.png")),
"igv", .save = .save)
}
#' Conducts productivty analysis
#'
#' @include util.R
#' @include distributions.R
#' @import ggplot2
#'
#' @param productivityDirectories list type. List of directories
#' @param prodOut string type. Output directory
#' @param sampleNames vector type. 1-1 with productivity directories
#' @param combinedNames string type. Title friendly "combined" sample names
#' @param mashedNames string type. File friendly "mashed-up" sample names
#' @param .save logical type. Save Rdata
#'
#' @return None
.productivityAnalysis <- function(productivityDirectories,
prodOut,
sampleNames,
combinedNames,
mashedNames,
.save = TRUE) {
message("Commencing productivity analysis")
# summary productivity file
prodFiles <- .listFilesInOrder(path = productivityDirectories,
pattern = ".*_productivity\\.csv(\\.gz)?$")
if (length(prodFiles)) {
g <- .productivityPlot(lapply(prodFiles, read.csv,
stringsAsFactors = FALSE),
sampleNames)
saveName <- file.path(prodOut, paste0(mashedNames, "_productivity.png"))
ggsave(saveName, plot = g, width = V_WIDTH, height = V_HEIGHT)
.saveAs(.save, saveName, g)
}
# plot the distribution of indels and mismatches -------------------------
.plotErrorDist(productivityDirectories, prodOut,
sampleNames, combinedNames, mashedNames, .save = .save)
# stop codon dist plot ---------------------------------------------------
message("Plotting stop codon in-frame distributions for ", combinedNames)
.prodDistPlot(
productivityDirectories,
sampleNames,
"Stop codon in In-frame Clones",
"_stopcodon_dist_in_frame\\.csv(\\.gz)?$",
file.path(prodOut, paste0(mashedNames, "_stopcodon_dist_in_frame.png")),
"", .save = .save) # no regions
# vjframe plot -----------------------------------------------------------
message("Plotting vjframe distributions for ", combinedNames)
.prodDistPlot(
productivityDirectories,
sampleNames,
"V-D-J Rearrangement",
"_vjframe_dist\\.csv(\\.gz)?$",
file.path(prodOut, paste0(mashedNames, "_vjframe_dist.png")),
"", .save = .save) # no regions
# 3 special cases for IGV region -----------------------------------------
# igv - inframe_unproductive, out of frame, productive
.plotIGVErrors(productivityDirectories, prodOut,
sampleNames, combinedNames, mashedNames, .save = .save)
# stop codon in FR/CDR region proportion plot ----------------------------
message("Plotting stop codon region distributions for ", combinedNames)
lapply(c("inframe", "outframe"), function(framestat) {
stopCodonRegionFiles <-
.listFilesInOrder(
path = productivityDirectories,
pattern = paste0(".*_stopcodon_region_",
framestat, "\\.csv(\\.gz)?$")
)
if (length(stopCodonRegionFiles)) {
vert = .checkVert(stopCodonRegionFiles[[1]])
subtitle <- paste("Total is",
paste(
lapply(stopCodonRegionFiles, function(x) {
as.integer(.getTotal(x))
}),
collapse = ", "
))
stopcodonRegion <- .plotDist(
lapply(stopCodonRegionFiles, read.csv, skip = 1),
sampleNames,
paste("Stop codon in FRs and CDRs of",
if (framestat == "inframe") "In-frame" else "Out-of-frame",
"sequences in",
combinedNames),
vert,
subs = subtitle,
sorted = FALSE)
saveName <- file.path(prodOut, paste0(mashedNames,
"_stopcodon_region_",
framestat, ".png"))
if (vert) {
ggsave(saveName, plot = stopcodonRegion,
width = V_WIDTH, height = V_HEIGHT)
} else {
ggsave(saveName, plot = stopcodonRegion,
width = H_WIDTH, height = H_HEIGHT)
}
.saveAs(.save, saveName, stopcodonRegion)
} else {
warning("Could not find stop codon region distribution plot ",
"for samples ", combinedNames)
}
})
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.