#' @title SummitHeatmap
#'
#' @description Generate heatmaps of read counts around positions of interest in a genome.
#'
#' @details This function generates heatmaps of different experiments (read counts) around positions of interest in the genome,
#' for example the summits of ChIP peaks, or transcription start sites. The center of your GRanges object defines the coordinates
#' of these positions of interest (eg peak summits, TSSs). Features that lie on the minus strand will be reversed in the final output.
#'
#' @param peaks A GRanges object containing your regions of interest. Must include seqnames (chromosomes), start, end, strand, and name.
#' @param bamFiles Character vector containing the filenames (including the full path) of read alignment files in bam format.
#' @param bamNames Character vector containing the names to describe the \code{bamFiles} you are using (for example: "H3K9me3_reads").
#' If no names are supplied, the full \code{bamFiles} names are used.
#' @param span Integer scalar specifyig the distance from the peak center to the left and right that you want your heatmap to extend to. Default is 2025.
#' @param step Integer scalar specifyig the window size in which reads are counted. Default is 50.
#' @param useCPM Logical scalar indicationg wether to normalize the number of reads per window to the total number of reads in the sample (bam file). Default is TRUE.
#' @param PairedEnd Logical scalar, indicating wether reads in bam files were generated with paired-end or single-end sequencing. Default is FALSE (=single-end).
#' @param minMQS Integer scalar, specifying the minimum mapping quality that a read must have to be included. Default is 255, which eliminates multimapping
#' reads in case the STAR aligner was used to generate the \code{bamFiles}.
#' @param strand Integer vector indicating if strand-specific read counting should be performed. Length of the vector should be either 1
#' (meaning that the value is applied to all input files), or equal to the total number of input files provided. Each
#' vector element should have one of the following three values: 0 (unstranded), 1 (stranded) and 2 (reversely stranded).
#' Default value of this parameter is 0 (ie. unstranded read counting is performed for all input files).
#' @param splitOnly Logical scalar indicating whether only split alignments (their CIGAR strings contain letter 'N') should be included. FALSE by default.
#' @param nonSplitOnly Logical scalar indicating whether only non-split alignments (their CIGAR strings do not contain letter 'N') should be included. FALSE by default.
#' @param minOverlap Integer scalar giving the minimum number of overlapping bases required for assigning a read to a heatmap window.
#' For assignment of read pairs, number of overlapping bases from each read in the same pair will be summed.
#' If a negative value is provided, then a gap of up to specified size will be allowed between read and the feature that the read is assigned to. 1 by default.
#' @param readExtension3 Integer scalar giving the number of bases extended downstream from 3' end of each read. 0 by default. Negative value is not allowed.
#' @param readShiftSize Integer scalar specifying the number of bases the reads will be shifted downstream by. 0 by default. Negative value is not allowed.
#' In case of mode = "Q" and PairedEnd = TRUE, it should be set to 'halfInsert' to shift each read by half the fragment size.
#' @param requireBothEndsMapped Logical scalar indicating if both ends from the same fragment are required to be successfully aligned before the fragment can be assigned to a feature or meta-feature.
#' This parameter is only appliable when PairedEnd is TRUE.
#' @param read2pos Specifying whether each read should be reduced to its 5' most base or 3' most base. It has three possible values: NULL, 5 (denoting 5' most base) and 3 (denoting 3' most base).
#' Default value is 5, ie. only the 5' end of the read will be counted. If a read is reduced to a single base, only that base will be considered for the read assignment.
#' Read reduction is performed after read shifting and extension.
#' @param mode Specify if you want to use QuasR (mode = "Q") or FeatureCounts (mode = "F", default) to count the number of rads per window.
#' @param genome The reference genome, either a string specifying a BSgenome object or the name of a genome fasta file. Default is "BSgenome.Mmusculus.UCSC.mm10".
#'
#' @return A list of matrices of the same length as the number of samples supplied in \code{bamFiles}.
#' Each matrix contains the number of reads (or cpm) in each window (column headers indicate the middle of the window)
#' around the center of each region provided in \code{peaks}.
#'
#' @examples
#' peaks <- SimulatePeaks(1000,rep(100,1000),chromosomeSizes=
#' system.file("extdata", "chrNameLength_mm10_chr11.txt", package = "MiniChip"))
#' #peaks <- GenomicRanges::GRanges(
#' #seqnames = Rle(c("chr1", "chr2", "chr1", "chr3"), c(1, 3, 2, 4)),
#' #ranges = IRanges(50101:50110, end = 51111:51120),
#' #strand = Rle(strand(c("-", "+", "*", "+", "-")), c(1, 2, 2, 3, 2)),
#' #summit = 1:10, name = head(letters, 10))
#' bamFiles <- list.files(system.file("extdata", package = "MiniChip"),
#' full.names=TRUE,pattern="*bam$")
#' SummitHeatmap(peaks=peaks,bamFiles=bamFiles)
#'
#' @importFrom GenomicRanges start
#' @importFrom GenomicRanges end
#' @importFrom GenomicRanges resize
#' @importFrom GenomicRanges width
#' @importFrom GenomicRanges slidingWindows
#' @importFrom GenomicRanges strand
#' @importFrom GenomicRanges strand<-
#' @importFrom GenomicRanges elementMetadata
#' @importFrom GenomicRanges seqnames
#' @importFrom GenomicRanges GRanges
#' @importFrom Rsubread featureCounts
#' @importFrom parallel makeCluster
#' @importFrom QuasR qAlign
#' @importFrom QuasR qProfile
#' @importFrom QuasR alignmentStats
#'
#' @export
SummitHeatmap <- function(peaks, bamFiles, bamNames="myreads", span=2025, step=50, minOverlap = 1,
useCPM=TRUE, PairedEnd=FALSE, minMQS=255, strand=0, splitOnly=FALSE, nonSplitOnly=FALSE,
readExtension3=0, readShiftSize=0, requireBothEndsMapped=FALSE, read2pos=5, mode="F",
genome="BSgenome.Mmusculus.UCSC.mm10"){
# Input validation
# Check if peaks is a GRanges object
if (!inherits(peaks, "GRanges")) {
stop("'peaks' must be a GRanges object")
}
# Check if bamFiles is provided and is a character vector
if (missing(bamFiles)) {
stop("'bamFiles' parameter is required")
}
if (!is.character(bamFiles)) {
stop("'bamFiles' must be a character vector containing file paths")
}
# Check if all BAM files exist
non_existing_files <- bamFiles[!file.exists(bamFiles)]
if (length(non_existing_files) > 0) {
stop(paste0("The following BAM files don't exist: ",
paste(non_existing_files, collapse=", ")))
}
# Check if bamNames has the same length as bamFiles
if (length(bamNames) != 1 && length(bamNames) != length(bamFiles)) {
stop(paste0("'bamNames' must have the same length as 'bamFiles' or be a single value. ",
"Length of bamFiles: ", length(bamFiles),
", length of bamNames: ", length(bamNames)))
}
# If bamNames is a single value and bamFiles has multiple files, replicate bamNames
if (length(bamNames) == 1 && length(bamFiles) > 1) {
bamNames <- rep(bamNames, length(bamFiles))
}
# Check numeric parameters
if (!is.numeric(span) || span <= 0) {
stop("'span' must be a positive numeric value")
}
if (!is.numeric(step) || step <= 0) {
stop("'step' must be a positive numeric value")
}
if (!is.numeric(minOverlap)) {
stop("'minOverlap' must be a numeric value")
}
if (!is.numeric(minMQS) || minMQS < 0) {
stop("'minMQS' must be a non-negative numeric value")
}
if (!is.numeric(readExtension3) || readExtension3 < 0) {
stop("'readExtension3' must be a non-negative numeric value")
}
if (!is.numeric(readShiftSize) || readShiftSize < 0) {
if (!(mode == "Q" && PairedEnd && readShiftSize == "halfInsert")) {
stop("'readShiftSize' must be a non-negative numeric value or 'halfInsert' (only when mode='Q' and PairedEnd=TRUE)")
}
}
# Check boolean parameters
if (!is.logical(useCPM)) {
stop("'useCPM' must be a logical value (TRUE or FALSE)")
}
if (!is.logical(PairedEnd)) {
stop("'PairedEnd' must be a logical value (TRUE or FALSE)")
}
if (!is.logical(splitOnly)) {
stop("'splitOnly' must be a logical value (TRUE or FALSE)")
}
if (!is.logical(nonSplitOnly)) {
stop("'nonSplitOnly' must be a logical value (TRUE or FALSE)")
}
if (!is.logical(requireBothEndsMapped)) {
stop("'requireBothEndsMapped' must be a logical value (TRUE or FALSE)")
}
# Check if splitOnly and nonSplitOnly are not both TRUE
if (splitOnly && nonSplitOnly) {
stop("'splitOnly' and 'nonSplitOnly' cannot both be TRUE")
}
# Check requireBothEndsMapped only applicable when PairedEnd is TRUE
if (requireBothEndsMapped && !PairedEnd) {
warning("'requireBothEndsMapped' is only applicable when PairedEnd=TRUE. Setting will be ignored.")
}
# Check strand parameter
if (!is.numeric(strand) || !all(strand %in% c(0, 1, 2))) {
stop("'strand' must be one of the following values: 0 (unstranded), 1 (stranded), or 2 (reversely stranded)")
}
if (length(strand) != 1 && length(strand) != length(bamFiles)) {
stop(paste0("Length of 'strand' must be either 1 or equal to the number of BAM files. ",
"Current length: ", length(strand), ", number of BAM files: ", length(bamFiles)))
}
# Check read2pos parameter
if (!is.null(read2pos) && !(read2pos %in% c(5, 3))) {
stop("'read2pos' must be NULL, 5, or 3")
}
# Check mode parameter
if (!(mode %in% c("F", "Q"))) {
stop("'mode' must be either 'F' (FeatureCounts) or 'Q' (QuasR)")
}
# Check if QuasR is installed if mode is "Q"
if (mode == "Q" && !requireNamespace("QuasR", quietly = TRUE)) {
stop("'QuasR' package is required when mode='Q'. Please install it with BiocManager::install('QuasR')")
}
# Check if Rsubread is installed if mode is "F"
if (mode == "F" && !requireNamespace("Rsubread", quietly = TRUE)) {
stop("'Rsubread' package is required when mode='F'. Please install it with BiocManager::install('Rsubread')")
}
# Now proceed with the original function logic
if (is.null(names(peaks)) && !is.null(peaks$name)) {
names(peaks) <- peaks$name
}
if (is.null(strand(peaks)) && !is.null(peaks$strand)) {
strand(peaks) <- peaks$strand
}
# Check if names are available for peaks
if (is.null(names(peaks))) {
warning("No names found for peaks. Using automatically generated names.")
names(peaks) <- paste0("peak_", seq_along(peaks))
}
# Check if strand information is available
if (all(is.na(strand(peaks)))) {
warning("No strand information found for peaks. Setting all strands to '*'.")
strand(peaks) <- "*"
}
if (mode == "Q") { # use QUASR version
#write a table to read in samples for QUASR
write.table(data.frame(FileName=bamFiles, SampleName=bamNames), file="QUASR.txt", sep="\t", col.names=TRUE, row.names=FALSE, append=FALSE, quote=FALSE)
#translate options
cl <- tryCatch({
makeCluster(20)
}, error = function(e) {
warning("Could not create cluster with 20 cores. Using 1 core instead.")
makeCluster(1)
})
Qpaired <- ifelse(PairedEnd == TRUE, "fr", "no")
selectReadPosition <- ifelse(read2pos == 3, "end", "start")
orientation <- ifelse(strand == 1, "same", ifelse(strand == 2, "opposite", "any"))
#generate project
tryCatch({
proj <- qAlign("QUASR.txt", genome, paired = Qpaired, clObj = cl)
}, error = function(e) {
stop(paste0("Error in qAlign: ", conditionMessage(e),
"\nMake sure 'genome' parameter is valid and QuasR is properly configured."))
})
#generate counts matrices
tryCatch({
prf <- qProfile(proj, resize(peaks, width = 1L, fix = "center"), upstream=span, binSize = step,
selectReadPosition= selectReadPosition, orientation = orientation, shift = readShiftSize,
useRead="any", clObj = cl, mapqMin = minMQS)[-1]
}, error = function(e) {
stop(paste0("Error in qProfile: ", conditionMessage(e)))
})
if (useCPM == TRUE) {
#normalize to million reads (cpm)
normfactor <- alignmentStats(proj)[,"mapped"]/1000000
prfcpm <- prf
for (i in seq_along(prf)) {
prfcpm[[i]] <- prf[[i]]/normfactor[i]
}
unlink("QUASR.txt")
return(prfcpm)
} else {
unlink("QUASR.txt")
return(prf)
}
}
if (mode == "F") { #use Feature Counts version
#take the middle of the GRanges region, then define whole heatmap region
nwindows <- ceiling((span*2)/step)
if (nwindows %% 2 == 0) {
nwindows <- nwindows + 1
}
regionwidth <- step * nwindows
# Try to resize peaks and catch potential errors
tryCatch({
peaks <- resize(peaks, width=regionwidth, fix="center")
}, error = function(e) {
stop(paste0("Error resizing peaks: ", conditionMessage(e),
"\nMake sure your peaks GRanges object is properly constructed."))
})
#remove peaks with negative start values
original_peak_count <- length(peaks)
peaks <- peaks[start(peaks) >= 0 & width(peaks) == regionwidth]
if (length(peaks) < original_peak_count) {
warning(paste0(original_peak_count - length(peaks),
" peaks were removed because they had negative start positions or incorrect width."))
}
if (length(peaks) == 0) {
stop("No valid peaks left after filtering. Please check your input peaks.")
}
#generate window starts and ends across span
windows <- seq(from=0, to=regionwidth-step, by=step)
binmids <- windows - regionwidth/2 + step/2
#generate sliding windows across peaks
tryCatch({
peaks2plus <- unlist(GenomicRanges::slidingWindows(peaks[strand(peaks) != "-"], width = step, step = step), use.names=FALSE)
peaks2minus <- rev(unlist(GenomicRanges::slidingWindows(peaks[strand(peaks) == "-"], width = step, step = step), use.names=FALSE))
peaks2 <- c(peaks2plus, peaks2minus)
}, error = function(e) {
stop(paste0("Error generating sliding windows: ", conditionMessage(e)))
})
#generate saf format data frame
saf <- data.frame(GeneID = names(peaks2), Chr = seqnames(peaks2),
Start = start(peaks2), End = end(peaks2), Strand = strand(peaks2))
#calculate the number of reads in each window for all bam files specified
tryCatch({
f_counts <- featureCounts(bamFiles, annot.ext=saf, useMetaFeatures=FALSE, allowMultiOverlap=TRUE, read2pos=read2pos,
minOverlap=minOverlap, readExtension3=readExtension3, countMultiMappingReads=FALSE, fraction=FALSE,
minMQS=minMQS, strandSpecific=strand, nthreads=20, verbose=FALSE, isPairedEnd=PairedEnd,
splitOnly=splitOnly, nonSplitOnly=nonSplitOnly, readShiftType="downstream", readShiftSize=readShiftSize,
requireBothEndsMapped=requireBothEndsMapped)
}, error = function(e) {
stop(paste0("Error in featureCounts: ", conditionMessage(e),
"\nMake sure your BAM files are properly formatted and accessible."))
})
#extract gene annotation (for peak names)
GeneID <- matrix(names(peaks2), nrow=length(peaks), ncol=length(windows), byrow=TRUE)[,1]
#extract number of mapped reads for all bam samples
if (length(bamNames) > 1) {
mapped.reads <- apply(f_counts$stat[c(1,11,12), -1], 2, sum)
} else {
mapped.reads <- sum(f_counts$stat[c(1,11,12), -1])
}
#extract counts for all samples, name columns and rows, calculate log CPM
all.counts <- vector("list", length(bamNames))
for (bam.sample in seq_along(bamNames)) {
counts <- matrix(f_counts$counts[, bam.sample], nrow=length(peaks), ncol=length(windows), byrow=TRUE)
colnames(counts) <- binmids
rownames(counts) <- GeneID
#prepare counts or cpm for saving as list of matrices
if (useCPM == TRUE) {
all.counts[[bam.sample]] <- ((counts)/mapped.reads[bam.sample])*1000000
} else {
all.counts[[bam.sample]] <- counts
}
# Check if there are names to sort by
if (!is.null(names(peaks)) && length(names(peaks)) == nrow(counts)) {
all.counts[[bam.sample]] <- all.counts[[bam.sample]][names(peaks), ]
}
}
# give warning if all counts are 0
if (max(all.counts[[1]]) == 0) {
warning("All windows have 0 counts, make sure the chromosome names (seqnames) match between your GRanges object and bam files!")
}
#return the results
names(all.counts) <- bamNames
return(all.counts)
} else {
stop("mode must be one of Q (QuasR) or F (FeatureCounts)!")
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.