R/p03_get_narrowpeak_summit_seq.R

Defines functions get_peak_summit_seq

Documented in get_peak_summit_seq

#' Get DNA sequence from peak summit region
#'
#' This function extract the DNA sequence around peak summit region from narrowPeak file
#'
#' @param file peak file generated by macs2
#' @param peakFormat Peak file format. One of \code{c("narrowPeak", "broadPeak", "bed")}
#' Default: narrowPeak
#' @param sampleId sample ID
#' @param genome BSgenome object
#' @param length Length of the region. Default: 200
#' @param column_suffix Whether to modify the column names to include sampleId.
#' If TRUE, the new column names are of format: colName.sampleId
#'
#' @return A dataframe with three columns: name, summitRegion, summitSeq
#' @export
#'
#' @examples
#' summitSeq = get_peak_summit_seq("test.narropeak", "sampleId", BSgenome.Afumigatus.AspGD.Af293, 200)
#'
get_peak_summit_seq = function(file, peakFormat = "narrowPeak", sampleId = NULL, genome,
                               length = 200, column_suffix = TRUE){

  peakFormat <- match.arg(arg = peakFormat, choices = c("narrowPeak", "broadPeak", "bed"))

  seqAroundSummit <- round(length / 2)

  np <- rtracklayer::import(con = file, format = peakFormat)

  if(length(np) == 0){
    return(NULL)
  }

  if(is.null(mcols(np)$peak)){
    mcols(np)$peak <- as.integer(width(np) / 2)
  }

  ## get the region around peak summit
  np <- GenomicRanges::resize(
    x = GenomicRanges::shift(x = np, shift = np$peak - seqAroundSummit),
    width = seqAroundSummit*2, fix = "start"
  )

  ## check if there are any out of bound ranges based on seqlengths and trim
  suppressWarnings(seqlengths(np) <- seqlengths(genome)[seqlevels(np)])
  np <- trim(np)

  ## get the DNA sequence for region
  np$summitSeq <- BSgenome::getSeq(x = genome, names = np, as.character = TRUE)

  motifSeq <- as.data.frame(np) %>%
    dplyr::mutate(summitSeqRegion = paste(seqnames, ":", start, "-", end, sep = "")) %>%
    dplyr::select(peakId = name, summitSeq, summitSeqRegion)

  if(!is.null(sampleId) && column_suffix){
    peakIdCol <- paste("peakId.", sampleId, sep = "")
    regionCol <- paste("summitSeqRegion.", sampleId, sep = "")
    seqCol <- paste("summitSeq.", sampleId, sep = "")

    motifSeq <- dplyr::rename(
      motifSeq,
      !! peakIdCol := peakId,
      !! regionCol := summitSeqRegion,
      !! seqCol := summitSeq
    )
  }

  return(motifSeq)
}
lakhanp1/chipmine documentation built on March 6, 2021, 9:06 a.m.