R/peakMapping.R

Defines functions mapPeaks getOverlapPercent getFeaturesForPeak getDistanceToPeak

Documented in getDistanceToPeak getFeaturesForPeak getOverlapPercent mapPeaks

# This file contains the functions used in performing the peak annotation
# process
# Author: Daniel Fusca


#' Map peaks to closest features
#'
#' Annotate a set of genomic coordinates by finding the closest genomic features
#' supplied by the user. For mapping purposes, peaks and features without strand
#' information (i.e strand of ".") are assumed to be on the forward strand.
#' Peaks are considered to be a distance of 0 away from any features they
#' overlap with. A single peak will map to multiple features if these features
#' are the same distance away. In this case, the same peak will appear in the
#' output multiple times.
#'
#' Aside from overlap percentages, the structure of the returned dataframe is
#' based on the annotation output produced by ChIPseeker (see References),
#' although no code here is actually taken from ChIPseeker. Use of dplyr's
#' bind_rows function to combine a list of dataframes into 1 dataframe is based
#' off of StackOverflow post by Joe Klieg
#' (see References).
#'
#' @param peakFrame A dataframe returned by the importBED function, where each
#'   row is a different peak to be annotated.
#' @param featureFrame A dataframe returned by the importBED function, where
#'   each row is a different feature to be used for peak annotation.
#' @param verbose If True, mapPeaks will print to the console every time it
#'   begins processing a new peak (this is useful for monitoring function calls
#'   on large datasets that may take a while to finish). Will also print to the
#'   console peaks that do not map onto any features (e.g. due to differing
#'   chromosomes).
#'
#' @return A dataframe giving the nearest feature(s) in featureFrame to each
#'   peak in peakFrame. Every row in the returned dataframe gives a peak, a
#'   feature of minimal distance, and the distance and position of the peak
#'   relative to the gene, where position is some combination of upstream,
#'   downstream, and overlapping (see documentation of getDistanceToPeak for
#'   further details on position). The final column gives the percentage of
#'   the feature that is overlapped by the mapped peak. The same peak may appear
#'   in multiple consecutive rows if the peak maps onto multiple different
#'   genes (e.g. in the case of multiple overlaps). Peaks are not included
#'   in the output if there are no features on the same chromosome. Returns
#'   NULL if none of the peaks map onto any features (due to being on
#'   different chromosomes).
#'
#' @examples
#' \dontrun{
#'   pathToPeaks <- system.file("extdata",
#'      "H3K27me3Peaks.bed", package = "PeakMapper")
#'   pathToGenes <- system.file("extdata",
#'      "WS263Genes.bed", package = "PeakMapper")
#'   H3K27me3Peaks <- importBED(pathToPeaks)
#'   WS263Genes <- importBED(pathToGenes)
#'   mappingResults <- mapPeaks(H3K27me3Peaks, WS263Genes)
#'   mappingResults$Peak_Position
#'   mappingResults$Peak_Distance
#' }
#'
#' @references
#' Guangchuang Yu, Li-Gen Wang, and Qing-Yu He. ChIPseeker: an R/Bioconductor
#' package for ChIP peak annotation, comparison and visualization.
#' Bioinformatics 2015, 31(14):2382-2383
#'
#' Joe Klieg. "Convert a list of data frames into one data frame". 27 February
#' 2018. Accessed 25 September 2019.  https://stackoverflow.com/a/49017065
#'
#' @export
#' @import dplyr
mapPeaks <- function(peakFrame, featureFrame, verbose = F) {

  # Check that the dataframes of peak and feature coordinates match the
  # expected BED format, as returned by importBED. This checking is done by a
  # helper function in checkInput.R, and raises an error if issues are detected
  # with these dataframes. To simplify error messages, any additional
  # warnings produced by R related to these errors are suppressed.
  suppressWarnings({
    checkBEDInput(peakFrame)
    checkBEDInput(featureFrame)
  })

  # Call getFeaturesForPeak on every peak in  peakFrame
  listOfResults <- apply(peakFrame, 1, getFeaturesForPeak, featureFrame,
                         verbose)

  # Use of dplyr's bind_rows function to combine a list of dataframes into 1
  # dataframe based off of StackOverflow post by Joe Klieg
  # (https://stackoverflow.com/a/49017065)
  result <- dplyr::bind_rows(listOfResults)


  # If none of the peaks in peakFrame map to any of the features in featureFrame,
  # return NULL
  if (dim(result)[1] == 0) {
    return(invisible(NULL))
  } else {
    # Calculate the percent of each feature that is overlapped by each mapped peak
    overlapPercents <- apply(result, 1, getOverlapPercent)
    result$Percent_Overlap <- overlapPercents

    # Aside from overlap percentages, structure of the result dataframe (peak
    # information, feature information, and distance and position of peak relative
    # to feature) is based on the annotation output produced by ChIPseeker
    # (Guangchuang Yu, Li-Gen Wang, and Qing-Yu He. ChIPseeker: an R/Bioconductor
    # package for ChIP peak annotation, comparison and visualization.
    # Bioinformatics 2015, 31(14):2382-2383)
    return(result)
  }
}



#' Return fraction of feature overlapped by peak
#'
#' Given a mapping between a peak and a feature as determined by the mapPeaks
#' function, returns the proportion of the feature that is overlapped by the
#' peak (i.e. the fraction of bases within the feature that are also within the
#' peak). Output is 0 if peak does not overlap feature. Note that this is a helper
#' function intended for use by the mapPeaks function; the user should NOT call
#' this function themselves.
#'
#' @param resultRow A mapping between a peak and a feature, as returned by
#'   mapPeaks (except for the final column giving overlap proportions, which are
#'   calculated here). The first 6 values give peak information (chromosome,
#'   start, end name, score, and strand), the next 6 values give feature
#'   information, and the final 2 values are peak position and distance relative
#'   to feature.
#'
#' @return The proportion of the given feature that is overlapped by the given
#'   peak. 0 if the feature does not overlap the peak.
#'
#' @examples
#' \dontrun{
#'   peakAndFeature <- c("chr2", 1250, 5000, "Peak_5", 0, ".", "chr2", 1000, 2000,
#'   "Gene_1", 0, "-", "Overlap (upstream)", 0)
#'   overlapPercent <- getOverlapPercent(peakAndFeature)
#'   floor(overlapPercent) == 75
#' }
#'
getOverlapPercent <- function(resultRow) {
  peakDistance <- as.numeric(resultRow[14])

  if (peakDistance != 0) {
    # This peak does not overlap this feature
    overlapPercent = 0
  } else {
    # This peak overlaps this feature
    featureLeftEnd <- as.numeric(resultRow[8])
    featureRightEnd <- as.numeric(resultRow[9])
    peakLeftEnd <- as.numeric(resultRow[2])
    peakRightEnd <- as.numeric(resultRow[3])

    featureLength <-  featureRightEnd - featureLeftEnd + 1
    overlapLength <- min(peakRightEnd, featureRightEnd) -
      max(peakLeftEnd, featureLeftEnd) + 1

    overlapPercent <- 100 * overlapLength / featureLength
  }

  return(overlapPercent)
}


#' Find closest features to a single peak
#'
#' Given a single peak and a set of features, this function returns the features
#' that are closest to this peak. Features overlapping the peak are considered
#' to have a distance of 0. Multiple features will be returned if they all have
#' the same minimal distance from the given peak. Nothing is returned if no
#' features are on the same chromosome as the peak. Note that this is a helper
#' function intended for use by the mapPeaks function; the user should NOT call
#' this function themselves.
#'
#' Code to rename column names is based off of a StackOverflow post by Joshua
#' Ulrich (see References). Code to repeat rows of a
#' dataframe using do.call is based on a StackOverflow post by mdsummer
#' (see References).
#'
#' @param currentPeak A single peak, given as a vector of 6 fields:
#'   chromosome, start position, end position, name, score, and strand (i.e. a
#'   single row from a dataframe returned by importBED)
#' @param featureFrame A dataframe returned by the importBED function, where
#'   each row is a different feature to be used for peak annotation.
#' @param verbose If True, getFeaturesForPeak will print to the console the peak
#'   that it is currently processing, and will also print peaks that do not map
#'   onto any features (e.g. due to differing chromosomes)
#'
#' @return A dataframe containing the feature(s) closest to currentPeak. Each
#'   row contains a different feature of minimal distance, preceded by
#'   currentPeak. Peaks and features are represented by the 6 fields of:
#'   chromosome, start position, end position, name, score, and strand. Each row
#'   additionally contains the distance and position of currentPeak relative to
#'   each closest feature, where position is some combination of upstream,
#'   downstream, and overlapping.
#'
#' @examples
#' \dontrun{
#'   pathToPeaks <- system.file("extdata",
#'      "H3K27me3Peaks.bed", package = "PeakMapper")
#'   pathToGenes <- system.file("extdata",
#'      "WS263Genes.bed", package = "PeakMapper")
#'   H3K27me3Peaks <- importBED(pathToPeaks)
#'   WS263Genes <- importBED(pathToGenes)
#'   singlePeak <- as.character(H3K27me3Peaks[1, ])
#'   closestGenes <- getFeaturesForPeak(singlePeak, WS263Genes)
#'   closestGenes$Peak_Position
#'   closestGenes$Peak_Distance
#' }
#'
#' @references
#' Joshua Ulrich. "How to rename a single column in a data.frame?". 23 September
#' 2011. Accessed 25 September 2019. https://stackoverflow.com/a/7532464
#'
#' mdsummer. "Repeat rows of a data.frame N times". 6 January 2012. Accessed
#' 25 September 2019. https://stackoverflow.com/a/8753732
#'
getFeaturesForPeak <- function(currentPeak, featureFrame, verbose = F) {
  if (verbose) {
    message(paste("Currently processing peak", currentPeak[4]))
  }

  # Get only the features on the same chromosome as currentPeak
  sameChromFeatures <- featureFrame[which(featureFrame[1] == currentPeak[1]), ]

  # Get the relative distance and position of each of these features using
  # getDistanceToPeak
  positionAndDistance <- apply(sameChromFeatures, 1, getDistanceToPeak,
                               currentPeak)

  if (is.null(dim(positionAndDistance))) {
    # No features are on the same chromosome as currentPeak
    if (verbose) {
      print(paste(currentPeak[4], "did not map to any features."))
    }
    return(invisible(NULL))

  } else {
    # First row of positionAndDistance gives the positions for each feature, and
    # second row gives the distances
    minDistance <- min(as.numeric(positionAndDistance[2, ]))

    # Get all of the features with the minimum distance to currentPeak
    closestFeatures <- sameChromFeatures[which(positionAndDistance[2, ] ==
                                                 minDistance), ]

    # Add peak orientation and distance information to results
    peakPosition <- positionAndDistance[1, which(positionAndDistance[2, ] ==
                                                   minDistance)]
    closestFeatures$Peak_Position <- peakPosition
    closestFeatures$Peak_Distance <- minDistance

    # Update column names to distinguish peak data from feature data.
    # Code to rename columns based off of StackOverflow post by Joshua Ulrich
    # (https://stackoverflow.com/a/7532464)
    colnames(closestFeatures)[1:6] <- paste("Feature",
                                            colnames(closestFeatures)[1:6],
                                            sep = "_")

    # If currentPeak maps to multiple features, we want the output to repeat
    # currentPeak's information for each of these features

    # numClosestFeatures gives how many features each peak mapped to
    numClosestFeatures <- dim(closestFeatures)[1]

    # Add in currentPeak's information to each row of the results.
    # Code to repeat rows of a dataframe using do.call is based on a
    # StackOverflow post by mdsummer (https://stackoverflow.com/a/8753732).
    currentPeakRepeated <- do.call("rbind", replicate(numClosestFeatures,
                                                      currentPeak, simplify = F))
    mappingResult <- cbind(currentPeakRepeated, closestFeatures,
                           stringsAsFactors = F)

    # Again, update column names to distinguish peak data from feature data.
    # Code to rename columns based off of StackOverflow post by Joshua Ulrich
    # (https://stackoverflow.com/a/7532464)
    colnames(mappingResult)[1:6] <- paste("Peak", colnames(mappingResult)[1:6],
                                          sep = "_")

    return(mappingResult)
  }
}


#' Calculate distance between peak and feature
#'
#' Given a peak and a feature, returns the position and distance of the peak
#' relative to the feature. If currentFeature and currentPeak overlap, the
#' distance between them is 0. Otherwise, the distance between them gives how
#' many bases away an endpoint of currentFeature is from an endpoint of
#' currentPeak, using the endpoints closest to each other. Position of the peak
#' relative to the feature is some combination of upstream, downstream, and
#' overlapping. Peaks and features without strand information are assumed to be
#' on the forward strand. Note that this is a helper function intended for use
#' by the mapPeaks function; the user should NOT call this function themselves.
#'
#' @param currentFeature A single feature, given as the following 6 fields:
#'   chromosome, start position, end position, name, score, and strand (i.e. a
#'   single row from a dataframe returned by importBED)
#' @param currentPeak A single peak, given as the following 6 fields:
#'   chromosome, start position, end position, name, score, and strand (i.e. a
#'   single row from a dataframe returned by importBED)
#'
#' @return A vector containing 2 items. The first item describes the position of
#'   the peak relative to the feature. This is one of:
#'   \itemize{
#'      \item "Downstream (no overlap)" - the peak is downstream of the
#'          feature but does not overlap it
#'      \item "Overlap (downstream)" - the peak overlaps with the feature
#'          and also extends downstream of the feature, but not upstream
#'      \item "Overlap (upstream and downstream)" - the feature is contained
#'          entirely within the peak
#'      \item "Overlap (upstream)" - the peak overlaps with the feature and
#'          also extends upstream of the feature, but not downstream
#'      \item "Overlap (within feature only)" - the peak is contained
#'          entirely with the feature
#'      \item "Upstream (no overlap)" - the peak is upstream of the feature
#'          but does not overlap it
#'   }
#'   The second item in the vector gives the distance between the peak and the
#'   feature. If the two overlap, this distance is 0. Otherwise, this distance
#'   gives how many bases away an endpoint of currentPeak is from an endpoint of
#'   currentFeature, using the endpoints that are closest together.
#'
#' @examples
#' \dontrun{
#'   pathToPeaks <- system.file("extdata",
#'      "H3K27me3Peaks.bed", package = "PeakMapper")
#'   pathToGenes <- system.file("extdata",
#'      "WS263Genes.bed", package = "PeakMapper")
#'   H3K27me3Peaks <- importBED(pathToPeaks)
#'   WS263Genes <- importBED(pathToGenes)
#'   singlePeak <- H3K27me3Peaks[1, ]
#'   singleGene <- WS263Genes[1, ]
#'   positionAndDistance <- getDistanceToPeak(singleGene, singlePeak)
#'   position <- positionAndDistance[1]
#'   distance <- positionAndDistance[2]
#' }
#'
getDistanceToPeak <- function(currentFeature, currentPeak) {
  peakLeftPoint <- as.numeric(currentPeak[2])
  peakRightPoint <- as.numeric(currentPeak[3])
  featureLeftPoint <- as.numeric(currentFeature[2])
  featureRightPoint <- as.numeric(currentFeature[3])

  if (currentFeature[6] == "+" || currentFeature[6] == "-") {
    featureStrand <- currentFeature[6]
  } else {
    # For upstream/downstream classification, features without strand
    # information are assumed to be on the forward strand
    featureStrand <- "+"
  }


  if (peakLeftPoint <= featureRightPoint && peakRightPoint >= featureLeftPoint) {
    # currentFeature overlaps with currentPeak
    distance <- 0

    if (peakLeftPoint <= featureLeftPoint && peakRightPoint >= featureRightPoint) {
      # currentFeature is contained entirely within currentPeak
      position <- "Overlap (upstream and downstream)"
    } else if ((peakLeftPoint <= featureLeftPoint && featureStrand == "+") ||
               (peakRightPoint >= featureRightPoint && featureStrand == "-")) {
      # currentPeak is upstream of currentFeature and overlaps it
      position <- "Overlap (upstream)"
    } else if ((peakLeftPoint <= featureLeftPoint && featureStrand == "-") ||
               (peakRightPoint >= featureRightPoint && featureStrand == "+")) {
      # currentPeak is downstream of currentFeature and overlaps it
      position <- "Overlap (downstream)"
    } else {
      # currentPeak is contained entirely within currentFeature
      position <- "Overlap (within feature only)"
    }


  } else if ((peakRightPoint < featureLeftPoint && featureStrand == "+") ||
             (peakLeftPoint > featureRightPoint && featureStrand == "-")) {

    # currentPeak is upstream of currentFeature but does not overlap it
    position <- "Upstream (no overlap)"

    if (featureStrand == "+") {
      distance <- featureLeftPoint - peakRightPoint
    } else {
      distance <- peakLeftPoint - featureRightPoint
    }

  } else {
    # Either peakRightPoint < featureLeftPoint and featureStrand == "-", or
    # peakLeftPoint > featureRightPoint and featureStrand == "+", meaning
    # currentPeak is downstream of currentFeature but does not overlap it
    position <- "Downstream (no overlap)"

    if (featureStrand == "+") {
      distance <- peakLeftPoint - featureRightPoint
    } else {
      distance <- featureLeftPoint - peakRightPoint
    }

  }
  positionAndDistance <- c(position, distance)
  return(positionAndDistance)
}

# [END]
fuscada2/PeakMapper documentation built on Dec. 8, 2019, 12:35 p.m.