#' @title CumulativePlots
#'
#' @description Summarizes and plots the values for counts around positions of interest in the genome.
#'
#' @details This function plots summarized (default = mean) values of pre-calculated read counts around the positions of interest in the genome, for example summits of ChIP peaks.
#'
#' @param counts A list of counts across peak summits generated by SummitHeatmap.
#' @param bamNames Character vector containing the names to describe the \code{bamFiles} you are using (for example: "H3K9me3_reads").
#' If no names are supplied, the names of the \code{counts} list are used.
#' @param span The distance from the peak summit to the left and right that you want your heatmap to extand to. Default is 2025.
#' @param step The window size in which reads are counted/plotted in the heatmap. Default is 50.
#' @param summarizing The function to summarize the heatmaps across peaks to achieve a cumulative line. Default is "mean".
#' @param overlapNames A character vector of the row names of the counts that overlap with a given annotation, for example, transcription start sites.
#' @param confInterval The confidence interval around the mean that will be calculated and plotted. Default is .95. Only used if summarizing = "mean".
#' @param plot If TRUE (default), returns the cumulative plots, otherwise returns the underlying data table.
#' @param plotcols Two different colors for the lines that contain the values for the peaks defined in overlapNames, and the rest of the peaks.
#' @param overlapLabels Two different labels for the lines that contain the values for the peaks defined in overlapNames, and the rest of the peaks.
#'
#' @return Returns a matrix with mean values of the log2 of supplied counts (with pseudocount of 1 added) in each bin for each sample, split by overlap_names.
#' Plots of mean values for read counts around position of interest in the genome.
#' One plot for each count matrix supplied. All plots are arranged on one page and saved as a png.
#'
#' @importFrom reshape2 melt
#' @importFrom ggplot2 ggplot
#' @importFrom ggplot2 geom_line
#' @importFrom ggplot2 geom_smooth
#' @importFrom ggplot2 facet_wrap
#' @importFrom ggplot2 scale_fill_manual
#' @importFrom ggplot2 scale_color_manual
#' @importFrom ggplot2 theme_classic
#' @importFrom ggplot2 aes vars ylab
#' @importFrom tidyr pivot_longer
#' @importFrom tidyr %>%
#' @importFrom tidyselect contains
#' @importFrom stats sd
#' @importFrom stats qt
#' @importFrom rlang .data
#'
#' @examples
#' counts <- list(matrix(rnorm(21000,2,1),ncol=81,nrow=100,dimnames=list(1:100,-40:40)),
#' matrix(rnorm(21000,2,1),ncol=81,nrow=100,dimnames=list(1:100,-40:40)))
#' bamNames <- c("counts1","counts2")
#' names(counts) <- bamNames
#' CumulativePlots(counts,bamNames=bamNames,overlapNames="NA")
#'
#' @export
CumulativePlots <- function(counts, bamNames=names(counts), span=2025, step=50,
summarizing="mean", overlapNames="NA", plot=TRUE,
confInterval=.95, plotcols=c("violet","darkgrey"),
overlapLabels=c("overlap", "no overlap")){
# Input validation
if (!is.list(counts)) {
stop("'counts' must be a list of matrices")
}
if (length(counts) == 0) {
stop("'counts' list cannot be empty")
}
# Check that each element in counts is a matrix
for (i in seq_along(counts)) {
if (!is.matrix(counts[[i]])) {
stop(paste0("Element ", i, " in 'counts' list is not a matrix"))
}
}
# Ensure all matrices have the same dimensions
first_ncol <- ncol(counts[[1]])
first_nrow <- nrow(counts[[1]])
for (i in seq_along(counts)[-1]) {
if (ncol(counts[[i]]) != first_ncol) {
stop(paste0("Matrix ", i, " has ", ncol(counts[[i]]), " columns, but matrix 1 has ", first_ncol, " columns. All matrices should have the same number of columns."))
}
if (nrow(counts[[i]]) != first_nrow) {
stop(paste0("Matrix ", i, " has ", nrow(counts[[i]]), " rows, but matrix 1 has ", first_nrow, " rows. All matrices should have the same number of rows."))
}
}
# Check bamNames
if (length(bamNames) != length(counts)) {
stop(paste0("Length of 'bamNames' (", length(bamNames), ") must match length of 'counts' (", length(counts), ")"))
}
# Check span and step
if (!is.numeric(span) || span <= 0) {
stop("'span' must be a positive number")
}
if (!is.numeric(step) || step <= 0) {
stop("'step' must be a positive number")
}
if (step > span) {
warning("'step' is larger than 'span', which may lead to unexpected results")
}
# Check summarizing function
valid_summarizing <- c("mean", "median", "sum", "min", "max")
if (!(summarizing %in% valid_summarizing) && !is.function(summarizing)) {
stop(paste0("'summarizing' must be one of: ", paste(valid_summarizing, collapse=", "), " or a custom function"))
}
# Check confInterval
if (!is.numeric(confInterval) || confInterval < 0 || confInterval > 1) {
stop("'confInterval' must be a number between 0 and 1")
}
# Check plot
if (!is.logical(plot)) {
stop("'plot' must be logical (TRUE or FALSE)")
}
# Check plotcols
if (length(plotcols) != 2) {
stop("'plotcols' must be a vector of exactly 2 colors")
}
# Check overlapLabels
if (length(overlapLabels) != 2) {
stop("'overlapLabels' must be a vector of exactly 2 labels")
}
# Check for required packages
required_packages <- c("reshape2", "ggplot2", "tidyr", "tidyselect", "stats", "rlang")
missing_packages <- required_packages[!sapply(required_packages, requireNamespace, quietly = TRUE)]
if (length(missing_packages) > 0) {
stop(paste0("The following required packages are not installed: ",
paste(missing_packages, collapse=", "),
". Please install them before using this function."))
}
# Continue with function logic
nwindows <- ceiling((span*2)/step)
if(nwindows %% 2 == 0){
nwindows <- nwindows + 1
}
regionwidth <- step * nwindows
windows <- seq(from=0, to=regionwidth-step, by=step)
binmids <- windows - regionwidth/2 + step/2
avg.counts <- lapply(1:length(bamNames), matrix, nrow=0, ncol=0)
if (summarizing == "mean"){
for (bam.sample in seq_along(bamNames)){
# Check for NA or Inf values in the counts matrix
if (any(is.na(counts[[bam.sample]]))) {
warning(paste0("NA values found in counts matrix for sample '", bamNames[bam.sample], "'. These will be propagated through calculations."))
}
if (any(is.infinite(counts[[bam.sample]]))) {
warning(paste0("Infinite values found in counts matrix for sample '", bamNames[bam.sample], "'. These may lead to unexpected results."))
}
mycounts <- log2(counts[[bam.sample]]+1)
mycounts1 <- mycounts[row.names(mycounts) %in% overlapNames,]
mycounts2 <- mycounts[row.names(mycounts) %in% overlapNames == FALSE,]
# Check if any matrices are empty after filtering
if (nrow(mycounts1) == 0) {
warning(paste0("No rows match the overlapNames criteria for sample '", bamNames[bam.sample], "'. Using default values for overlap calculations."))
# Set default values to avoid errors
mycounts1 <- matrix(0, nrow=1, ncol=ncol(mycounts))
}
if (nrow(mycounts2) == 0) {
warning(paste0("All rows match the overlapNames criteria for sample '", bamNames[bam.sample], "'. Using default values for non-overlap calculations."))
# Set default values to avoid errors
mycounts2 <- matrix(0, nrow=1, ncol=ncol(mycounts))
}
#calculate the mean, and the standard error for each binmids position across all peaks
avg.cts <- data.frame(mean_overlap1=apply(mycounts1, 2, mean),
mean_overlap2=apply(mycounts2, 2, mean),
se_overlap1=apply(mycounts1, 2, sd)/sqrt(nrow(mycounts1)),
se_overlap2=apply(mycounts2, 2, sd)/sqrt(nrow(mycounts2)),
position=binmids, name=bamNames[bam.sample])
#calculate the confidence intervals for each binmids position across all peaks
#reference: http://www.cookbook-r.com/Manipulating_data/Summarizing_data/
# Confidence interval multiplier for standard error
# Calculate t-statistic for confidence interval:
# e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
# Handle single-row edge cases for t-distribution
if (nrow(mycounts1) > 1) {
ciMult1 <- qt(confInterval/2 + .5, nrow(mycounts1)-1)
} else {
ciMult1 <- 0
warning(paste0("Only one row in overlap group for sample '", bamNames[bam.sample], "'. Confidence intervals will be equal to the mean."))
}
if (nrow(mycounts2) > 1) {
ciMult2 <- qt(confInterval/2 + .5, nrow(mycounts2)-1)
} else {
ciMult2 <- 0
warning(paste0("Only one row in non-overlap group for sample '", bamNames[bam.sample], "'. Confidence intervals will be equal to the mean."))
}
avg.cts$ci.upper_overlap1 <- avg.cts$mean_overlap1 + avg.cts$se_overlap1 * ciMult1
avg.cts$ci.lower_overlap1 <- avg.cts$mean_overlap1 - avg.cts$se_overlap1 * ciMult1
avg.cts$ci.upper_overlap2 <- avg.cts$mean_overlap2 + avg.cts$se_overlap2 * ciMult2
avg.cts$ci.lower_overlap2 <- avg.cts$mean_overlap2 - avg.cts$se_overlap2 * ciMult2
#add the data frame into the list of data frames for each bam sample
avg.counts[[bam.sample]] <- avg.cts
}
#collapse the list into one long data frame with a column indication the bam sample name
avg.counts <- do.call("rbind", avg.counts)
if(plot==TRUE){
# Check if tidyr is available for pivot_longer
if (!requireNamespace("tidyr", quietly = TRUE)) {
stop("Package 'tidyr' is required for plotting. Please install it.")
}
tryCatch({
mean.plots.long <- avg.counts %>% tidyr::pivot_longer(tidyselect::contains("overlap"),
names_to = c(".value", "overlap"),
names_sep="_")
p <- ggplot2::ggplot(mean.plots.long, ggplot2::aes(x=.data$position, y=.data$mean)) +
ggplot2::geom_smooth(ggplot2::aes(ymin=.data$ci.lower, ymax=.data$ci.upper,
fill=.data$overlap, color=.data$overlap), stat="identity")
p <- p + ggplot2::facet_wrap(ggplot2::vars(.data$name), scales="free") +
ggplot2::ylab("log2(cpm)") + ggplot2::theme_classic()
p <- p + ggplot2::scale_fill_manual(values=plotcols, labels=overlapLabels)
p <- p + ggplot2::scale_color_manual(values=plotcols, labels=overlapLabels)
return(p)
}, error = function(e) {
stop(paste0("Error in creating plot: ", e$message))
})
} else {
return(avg.counts)
}
} else {
# For other summarizing functions
for (bam.sample in seq_along(bamNames)){
mycounts <- log2(counts[[bam.sample]]+1)
mycounts1 <- mycounts[row.names(mycounts) %in% overlapNames,]
mycounts2 <- mycounts[row.names(mycounts) %in% overlapNames == FALSE,]
# Check if any matrices are empty after filtering
if (nrow(mycounts1) == 0) {
warning(paste0("No rows match the overlapNames criteria for sample '", bamNames[bam.sample], "'. Using default values for overlap calculations."))
# Set default values to avoid errors
mycounts1 <- matrix(0, nrow=1, ncol=ncol(mycounts))
}
if (nrow(mycounts2) == 0) {
warning(paste0("All rows match the overlapNames criteria for sample '", bamNames[bam.sample], "'. Using default values for non-overlap calculations."))
# Set default values to avoid errors
mycounts2 <- matrix(0, nrow=1, ncol=ncol(mycounts))
}
# Handle custom function or named function
if (is.function(summarizing)) {
tryCatch({
overlap_result <- apply(mycounts1, 2, summarizing)
no_overlap_result <- apply(mycounts2, 2, summarizing)
}, error = function(e) {
stop(paste0("Error applying custom summarizing function: ", e$message))
})
} else {
# For named functions like "median", "sum", etc.
summarizing_fn <- match.fun(summarizing)
tryCatch({
overlap_result <- apply(mycounts1, 2, summarizing_fn)
no_overlap_result <- apply(mycounts2, 2, summarizing_fn)
}, error = function(e) {
stop(paste0("Error applying summarizing function '", summarizing, "': ", e$message))
})
}
avg.cts <- data.frame(overlap.mean = overlap_result,
no.overlap.mean = no_overlap_result,
position = binmids,
name = bamNames[bam.sample])
# Add the data frame into the list of data frames for each bam sample
avg.counts[[bam.sample]] <- avg.cts
}
# Collapse the list into one long data frame with a column indicating the bam sample name
avg.counts <- do.call("rbind", avg.counts)
if (plot == TRUE) {
# Check if reshape2 is available for melt
if (!requireNamespace("reshape2", quietly = TRUE)) {
stop("Package 'reshape2' is required for plotting. Please install it.")
}
tryCatch({
avg.counts.long <- reshape2::melt(avg.counts, id.vars=c("name", "position"))
p <- ggplot2::ggplot(avg.counts.long, ggplot2::aes(x = .data$position, y = .data$value)) +
ggplot2::geom_line(ggplot2::aes(color = .data$variable)) +
ggplot2::facet_wrap(ggplot2::vars(.data$name), scales = "free")
p <- p + ggplot2::theme_classic() + ggplot2::ylab("log2(cpm)") +
ggplot2::scale_color_manual(values = plotcols, labels = overlapLabels)
return(p)
}, error = function(e) {
stop(paste0("Error in creating plot: ", e$message))
})
} else {
return(avg.counts)
}
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.