# This file contains the functions used in performing the peak annotation
# process
# Author: Daniel Fusca
#' Map peaks to closest features
#'
#' Annotate a set of genomic coordinates by finding the closest genomic features
#' supplied by the user. For mapping purposes, peaks and features without strand
#' information (i.e strand of ".") are assumed to be on the forward strand.
#' Peaks are considered to be a distance of 0 away from any features they
#' overlap with. A single peak will map to multiple features if these features
#' are the same distance away. In this case, the same peak will appear in the
#' output multiple times.
#'
#' Aside from overlap percentages, the structure of the returned dataframe is
#' based on the annotation output produced by ChIPseeker (see References),
#' although no code here is actually taken from ChIPseeker. Use of dplyr's
#' bind_rows function to combine a list of dataframes into 1 dataframe is based
#' off of StackOverflow post by Joe Klieg
#' (see References).
#'
#' @param peakFrame A dataframe returned by the importBED function, where each
#' row is a different peak to be annotated.
#' @param featureFrame A dataframe returned by the importBED function, where
#' each row is a different feature to be used for peak annotation.
#' @param verbose If True, mapPeaks will print to the console every time it
#' begins processing a new peak (this is useful for monitoring function calls
#' on large datasets that may take a while to finish). Will also print to the
#' console peaks that do not map onto any features (e.g. due to differing
#' chromosomes).
#'
#' @return A dataframe giving the nearest feature(s) in featureFrame to each
#' peak in peakFrame. Every row in the returned dataframe gives a peak, a
#' feature of minimal distance, and the distance and position of the peak
#' relative to the gene, where position is some combination of upstream,
#' downstream, and overlapping (see documentation of getDistanceToPeak for
#' further details on position). The final column gives the percentage of
#' the feature that is overlapped by the mapped peak. The same peak may appear
#' in multiple consecutive rows if the peak maps onto multiple different
#' genes (e.g. in the case of multiple overlaps). Peaks are not included
#' in the output if there are no features on the same chromosome. Returns
#' NULL if none of the peaks map onto any features (due to being on
#' different chromosomes).
#'
#' @examples
#' \dontrun{
#' pathToPeaks <- system.file("extdata",
#' "H3K27me3Peaks.bed", package = "PeakMapper")
#' pathToGenes <- system.file("extdata",
#' "WS263Genes.bed", package = "PeakMapper")
#' H3K27me3Peaks <- importBED(pathToPeaks)
#' WS263Genes <- importBED(pathToGenes)
#' mappingResults <- mapPeaks(H3K27me3Peaks, WS263Genes)
#' mappingResults$Peak_Position
#' mappingResults$Peak_Distance
#' }
#'
#' @references
#' Guangchuang Yu, Li-Gen Wang, and Qing-Yu He. ChIPseeker: an R/Bioconductor
#' package for ChIP peak annotation, comparison and visualization.
#' Bioinformatics 2015, 31(14):2382-2383
#'
#' Joe Klieg. "Convert a list of data frames into one data frame". 27 February
#' 2018. Accessed 25 September 2019. https://stackoverflow.com/a/49017065
#'
#' @export
#' @import dplyr
mapPeaks <- function(peakFrame, featureFrame, verbose = F) {
# Check that the dataframes of peak and feature coordinates match the
# expected BED format, as returned by importBED. This checking is done by a
# helper function in checkInput.R, and raises an error if issues are detected
# with these dataframes. To simplify error messages, any additional
# warnings produced by R related to these errors are suppressed.
suppressWarnings({
checkBEDInput(peakFrame)
checkBEDInput(featureFrame)
})
# Call getFeaturesForPeak on every peak in peakFrame
listOfResults <- apply(peakFrame, 1, getFeaturesForPeak, featureFrame,
verbose)
# Use of dplyr's bind_rows function to combine a list of dataframes into 1
# dataframe based off of StackOverflow post by Joe Klieg
# (https://stackoverflow.com/a/49017065)
result <- dplyr::bind_rows(listOfResults)
# If none of the peaks in peakFrame map to any of the features in featureFrame,
# return NULL
if (dim(result)[1] == 0) {
return(invisible(NULL))
} else {
# Calculate the percent of each feature that is overlapped by each mapped peak
overlapPercents <- apply(result, 1, getOverlapPercent)
result$Percent_Overlap <- overlapPercents
# Aside from overlap percentages, structure of the result dataframe (peak
# information, feature information, and distance and position of peak relative
# to feature) is based on the annotation output produced by ChIPseeker
# (Guangchuang Yu, Li-Gen Wang, and Qing-Yu He. ChIPseeker: an R/Bioconductor
# package for ChIP peak annotation, comparison and visualization.
# Bioinformatics 2015, 31(14):2382-2383)
return(result)
}
}
#' Return fraction of feature overlapped by peak
#'
#' Given a mapping between a peak and a feature as determined by the mapPeaks
#' function, returns the proportion of the feature that is overlapped by the
#' peak (i.e. the fraction of bases within the feature that are also within the
#' peak). Output is 0 if peak does not overlap feature. Note that this is a helper
#' function intended for use by the mapPeaks function; the user should NOT call
#' this function themselves.
#'
#' @param resultRow A mapping between a peak and a feature, as returned by
#' mapPeaks (except for the final column giving overlap proportions, which are
#' calculated here). The first 6 values give peak information (chromosome,
#' start, end name, score, and strand), the next 6 values give feature
#' information, and the final 2 values are peak position and distance relative
#' to feature.
#'
#' @return The proportion of the given feature that is overlapped by the given
#' peak. 0 if the feature does not overlap the peak.
#'
#' @examples
#' \dontrun{
#' peakAndFeature <- c("chr2", 1250, 5000, "Peak_5", 0, ".", "chr2", 1000, 2000,
#' "Gene_1", 0, "-", "Overlap (upstream)", 0)
#' overlapPercent <- getOverlapPercent(peakAndFeature)
#' floor(overlapPercent) == 75
#' }
#'
getOverlapPercent <- function(resultRow) {
peakDistance <- as.numeric(resultRow[14])
if (peakDistance != 0) {
# This peak does not overlap this feature
overlapPercent = 0
} else {
# This peak overlaps this feature
featureLeftEnd <- as.numeric(resultRow[8])
featureRightEnd <- as.numeric(resultRow[9])
peakLeftEnd <- as.numeric(resultRow[2])
peakRightEnd <- as.numeric(resultRow[3])
featureLength <- featureRightEnd - featureLeftEnd + 1
overlapLength <- min(peakRightEnd, featureRightEnd) -
max(peakLeftEnd, featureLeftEnd) + 1
overlapPercent <- 100 * overlapLength / featureLength
}
return(overlapPercent)
}
#' Find closest features to a single peak
#'
#' Given a single peak and a set of features, this function returns the features
#' that are closest to this peak. Features overlapping the peak are considered
#' to have a distance of 0. Multiple features will be returned if they all have
#' the same minimal distance from the given peak. Nothing is returned if no
#' features are on the same chromosome as the peak. Note that this is a helper
#' function intended for use by the mapPeaks function; the user should NOT call
#' this function themselves.
#'
#' Code to rename column names is based off of a StackOverflow post by Joshua
#' Ulrich (see References). Code to repeat rows of a
#' dataframe using do.call is based on a StackOverflow post by mdsummer
#' (see References).
#'
#' @param currentPeak A single peak, given as a vector of 6 fields:
#' chromosome, start position, end position, name, score, and strand (i.e. a
#' single row from a dataframe returned by importBED)
#' @param featureFrame A dataframe returned by the importBED function, where
#' each row is a different feature to be used for peak annotation.
#' @param verbose If True, getFeaturesForPeak will print to the console the peak
#' that it is currently processing, and will also print peaks that do not map
#' onto any features (e.g. due to differing chromosomes)
#'
#' @return A dataframe containing the feature(s) closest to currentPeak. Each
#' row contains a different feature of minimal distance, preceded by
#' currentPeak. Peaks and features are represented by the 6 fields of:
#' chromosome, start position, end position, name, score, and strand. Each row
#' additionally contains the distance and position of currentPeak relative to
#' each closest feature, where position is some combination of upstream,
#' downstream, and overlapping.
#'
#' @examples
#' \dontrun{
#' pathToPeaks <- system.file("extdata",
#' "H3K27me3Peaks.bed", package = "PeakMapper")
#' pathToGenes <- system.file("extdata",
#' "WS263Genes.bed", package = "PeakMapper")
#' H3K27me3Peaks <- importBED(pathToPeaks)
#' WS263Genes <- importBED(pathToGenes)
#' singlePeak <- as.character(H3K27me3Peaks[1, ])
#' closestGenes <- getFeaturesForPeak(singlePeak, WS263Genes)
#' closestGenes$Peak_Position
#' closestGenes$Peak_Distance
#' }
#'
#' @references
#' Joshua Ulrich. "How to rename a single column in a data.frame?". 23 September
#' 2011. Accessed 25 September 2019. https://stackoverflow.com/a/7532464
#'
#' mdsummer. "Repeat rows of a data.frame N times". 6 January 2012. Accessed
#' 25 September 2019. https://stackoverflow.com/a/8753732
#'
getFeaturesForPeak <- function(currentPeak, featureFrame, verbose = F) {
if (verbose) {
message(paste("Currently processing peak", currentPeak[4]))
}
# Get only the features on the same chromosome as currentPeak
sameChromFeatures <- featureFrame[which(featureFrame[1] == currentPeak[1]), ]
# Get the relative distance and position of each of these features using
# getDistanceToPeak
positionAndDistance <- apply(sameChromFeatures, 1, getDistanceToPeak,
currentPeak)
if (is.null(dim(positionAndDistance))) {
# No features are on the same chromosome as currentPeak
if (verbose) {
print(paste(currentPeak[4], "did not map to any features."))
}
return(invisible(NULL))
} else {
# First row of positionAndDistance gives the positions for each feature, and
# second row gives the distances
minDistance <- min(as.numeric(positionAndDistance[2, ]))
# Get all of the features with the minimum distance to currentPeak
closestFeatures <- sameChromFeatures[which(positionAndDistance[2, ] ==
minDistance), ]
# Add peak orientation and distance information to results
peakPosition <- positionAndDistance[1, which(positionAndDistance[2, ] ==
minDistance)]
closestFeatures$Peak_Position <- peakPosition
closestFeatures$Peak_Distance <- minDistance
# Update column names to distinguish peak data from feature data.
# Code to rename columns based off of StackOverflow post by Joshua Ulrich
# (https://stackoverflow.com/a/7532464)
colnames(closestFeatures)[1:6] <- paste("Feature",
colnames(closestFeatures)[1:6],
sep = "_")
# If currentPeak maps to multiple features, we want the output to repeat
# currentPeak's information for each of these features
# numClosestFeatures gives how many features each peak mapped to
numClosestFeatures <- dim(closestFeatures)[1]
# Add in currentPeak's information to each row of the results.
# Code to repeat rows of a dataframe using do.call is based on a
# StackOverflow post by mdsummer (https://stackoverflow.com/a/8753732).
currentPeakRepeated <- do.call("rbind", replicate(numClosestFeatures,
currentPeak, simplify = F))
mappingResult <- cbind(currentPeakRepeated, closestFeatures,
stringsAsFactors = F)
# Again, update column names to distinguish peak data from feature data.
# Code to rename columns based off of StackOverflow post by Joshua Ulrich
# (https://stackoverflow.com/a/7532464)
colnames(mappingResult)[1:6] <- paste("Peak", colnames(mappingResult)[1:6],
sep = "_")
return(mappingResult)
}
}
#' Calculate distance between peak and feature
#'
#' Given a peak and a feature, returns the position and distance of the peak
#' relative to the feature. If currentFeature and currentPeak overlap, the
#' distance between them is 0. Otherwise, the distance between them gives how
#' many bases away an endpoint of currentFeature is from an endpoint of
#' currentPeak, using the endpoints closest to each other. Position of the peak
#' relative to the feature is some combination of upstream, downstream, and
#' overlapping. Peaks and features without strand information are assumed to be
#' on the forward strand. Note that this is a helper function intended for use
#' by the mapPeaks function; the user should NOT call this function themselves.
#'
#' @param currentFeature A single feature, given as the following 6 fields:
#' chromosome, start position, end position, name, score, and strand (i.e. a
#' single row from a dataframe returned by importBED)
#' @param currentPeak A single peak, given as the following 6 fields:
#' chromosome, start position, end position, name, score, and strand (i.e. a
#' single row from a dataframe returned by importBED)
#'
#' @return A vector containing 2 items. The first item describes the position of
#' the peak relative to the feature. This is one of:
#' \itemize{
#' \item "Downstream (no overlap)" - the peak is downstream of the
#' feature but does not overlap it
#' \item "Overlap (downstream)" - the peak overlaps with the feature
#' and also extends downstream of the feature, but not upstream
#' \item "Overlap (upstream and downstream)" - the feature is contained
#' entirely within the peak
#' \item "Overlap (upstream)" - the peak overlaps with the feature and
#' also extends upstream of the feature, but not downstream
#' \item "Overlap (within feature only)" - the peak is contained
#' entirely with the feature
#' \item "Upstream (no overlap)" - the peak is upstream of the feature
#' but does not overlap it
#' }
#' The second item in the vector gives the distance between the peak and the
#' feature. If the two overlap, this distance is 0. Otherwise, this distance
#' gives how many bases away an endpoint of currentPeak is from an endpoint of
#' currentFeature, using the endpoints that are closest together.
#'
#' @examples
#' \dontrun{
#' pathToPeaks <- system.file("extdata",
#' "H3K27me3Peaks.bed", package = "PeakMapper")
#' pathToGenes <- system.file("extdata",
#' "WS263Genes.bed", package = "PeakMapper")
#' H3K27me3Peaks <- importBED(pathToPeaks)
#' WS263Genes <- importBED(pathToGenes)
#' singlePeak <- H3K27me3Peaks[1, ]
#' singleGene <- WS263Genes[1, ]
#' positionAndDistance <- getDistanceToPeak(singleGene, singlePeak)
#' position <- positionAndDistance[1]
#' distance <- positionAndDistance[2]
#' }
#'
getDistanceToPeak <- function(currentFeature, currentPeak) {
peakLeftPoint <- as.numeric(currentPeak[2])
peakRightPoint <- as.numeric(currentPeak[3])
featureLeftPoint <- as.numeric(currentFeature[2])
featureRightPoint <- as.numeric(currentFeature[3])
if (currentFeature[6] == "+" || currentFeature[6] == "-") {
featureStrand <- currentFeature[6]
} else {
# For upstream/downstream classification, features without strand
# information are assumed to be on the forward strand
featureStrand <- "+"
}
if (peakLeftPoint <= featureRightPoint && peakRightPoint >= featureLeftPoint) {
# currentFeature overlaps with currentPeak
distance <- 0
if (peakLeftPoint <= featureLeftPoint && peakRightPoint >= featureRightPoint) {
# currentFeature is contained entirely within currentPeak
position <- "Overlap (upstream and downstream)"
} else if ((peakLeftPoint <= featureLeftPoint && featureStrand == "+") ||
(peakRightPoint >= featureRightPoint && featureStrand == "-")) {
# currentPeak is upstream of currentFeature and overlaps it
position <- "Overlap (upstream)"
} else if ((peakLeftPoint <= featureLeftPoint && featureStrand == "-") ||
(peakRightPoint >= featureRightPoint && featureStrand == "+")) {
# currentPeak is downstream of currentFeature and overlaps it
position <- "Overlap (downstream)"
} else {
# currentPeak is contained entirely within currentFeature
position <- "Overlap (within feature only)"
}
} else if ((peakRightPoint < featureLeftPoint && featureStrand == "+") ||
(peakLeftPoint > featureRightPoint && featureStrand == "-")) {
# currentPeak is upstream of currentFeature but does not overlap it
position <- "Upstream (no overlap)"
if (featureStrand == "+") {
distance <- featureLeftPoint - peakRightPoint
} else {
distance <- peakLeftPoint - featureRightPoint
}
} else {
# Either peakRightPoint < featureLeftPoint and featureStrand == "-", or
# peakLeftPoint > featureRightPoint and featureStrand == "+", meaning
# currentPeak is downstream of currentFeature but does not overlap it
position <- "Downstream (no overlap)"
if (featureStrand == "+") {
distance <- peakLeftPoint - featureRightPoint
} else {
distance <- featureLeftPoint - peakRightPoint
}
}
positionAndDistance <- c(position, distance)
return(positionAndDistance)
}
# [END]
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.