R/summarizePatternInPeaks.R
In ChIPpeakAnno: Batch annotation of the peaks identified from either ChIP-seq, ChIP-chip experiments or any experiments resulted in large number of chromosome ranges

#' Output a summary of the occurrence of each pattern in the sequences.
#' 
#' Output a summary of the occurrence of each pattern in the sequences.
#' 
#' 
#' @param patternFilePath A character vector containing the path to the file to
#' read the patterns from.
#' @param format Either "fasta" (the default) or "fastq"
#' @param skip Single non-negative integer. The number of records of the
#' pattern file to skip before beginning to read in records.
#' @param BSgenomeName BSgenome object. Please refer to available.genomes in
#' BSgenome package for details
#' @param peaks \link[GenomicRanges:GRanges-class]{GRanges} containing the
#' peaks
#' @param outfile A character vector containing the path to the file to write
#' the summary output.
#' @param append TRUE or FALSE, default FALSE
#' @return A data frame with 3 columns as n.peaksWithPattern (number of peaks
#' with the pattern), n.totalPeaks (total number of peaks in the input) and
#' Pattern (the corresponding pattern). The summary will consider both strand
#' (including reverse complement).
#' @author Lihua Julie Zhu
#' @keywords misc
#' @export
#' @importFrom Biostrings readDNAStringSet reverseComplement
#' @importFrom utils write.table
#' @examples
#' 
#'     peaks = GRanges(seqnames=c("NC_008253", "NC_010468"), 
#'                     IRanges(start=c(100, 500), end=c(300, 600), 
#'                                names=c("peak1", "peak2")))
#'     filepath =system.file("extdata", "examplePattern.fa", 
#'                           package="ChIPpeakAnno")
#'     library(BSgenome.Ecoli.NCBI.20080805)
#'     summarizePatternInPeaks(patternFilePath=filepath, format="fasta", 
#'                             skip=0L, BSgenomeName=Ecoli, peaks=peaks)
#' 
summarizePatternInPeaks <-
  function (patternFilePath, format = "fasta", skip = 0L, BSgenomeName,
            peaks, outfile, append = FALSE)
  {
    if (missing(patternFilePath)) {
      stop("missing required parameter patternFilePath!")
    }
    if (!file.exists(patternFilePath)) {
      stop("patternFilePath specified as ", patternFilePath,
           " does not exsist!")
    }
    if (format != "fasta" && format != "fastq") {
      stop("format needs to be either fasta or fastq!")
    }
    if (missing(BSgenomeName) || !is(BSgenomeName, "BSgenome")) {
      stop("BSgenomeName is required as BSgenome object!")
    }
    if (missing(peaks) || (!is(peaks, "RangedData") && !is(peaks,
                                                           "GRanges"))) {
      stop("No valid peaks passed in. 
           It needs to be RangedData or GRanges object.")
    }
    if (is(peaks, "RangedData"))
      peaks <- toGRanges(peaks, format = "RangedData")
    if (!missing(outfile) && file.exists(outfile) && !append) {
      stop("outfile specified as ", outfile, 
           " already exists! Please rename the \n  
           outfile or set append = TRUE!")
    }
    seq = getAllPeakSequence(peaks, upstream = 0, downstream = 0,
                             genome = BSgenomeName)
    dict = readDNAStringSet(patternFilePath, format, use.names = TRUE)
    temp <- do.call(rbind, lapply(seq_along(dict), function(i) {
      getPosInSeqs(thispattern = dict[i], seq = seq, seqGRanges = peaks)
    }))
    if (!missing(outfile)) {
      write.table(temp, outfile, append = append, sep = "\t",
                  row.names = FALSE)
    }
    temp
  }


getPosInSeqs <-
  function(thispattern, seq, seqGRanges)
  {
    if (missing(thispattern) || !is(thispattern, "DNAStringSet")) {
      stop("thispattern is required as a DNAStringSet object!")
    }
    if (missing(seq)) {
      stop("No valid sequences passed in!")
    }
    patternName <- names(thispattern)
    revcomp.pattern = reverseComplement(thispattern)
    thispattern.t = as.character(thispattern)[[1]]
    thispattern.t = translatePattern(thispattern.t)
    revcomp.pattern.t = as.character(revcomp.pattern)[[1]]
    revcomp.pattern.t = translatePattern(revcomp.pattern.t)
    sequences <- seq$sequence
    seqInfo <- as.data.frame(seqGRanges)
    
    total <- do.call(rbind, lapply(seq_along(sequences), function(i) {
      pos.plus = gregexpr(thispattern.t, sequences[i], perl = TRUE)[[1]]
      pos.minus = gregexpr(revcomp.pattern.t, sequences[i], perl = TRUE)[[1]]
      if (pos.plus[1] > 0) {
        pattern.start <- as.numeric(pos.plus)
        pattern.end <- pattern.start + attr(pos.plus, "match.length") - 1
        patternSeq.found.plus <- 
          do.call(rbind, lapply(1:length(pattern.start), function(j){
          c(patternName, as.character(thispattern), 
            pattern.start[j], pattern.end[j], 
            substr(sequences[i], pattern.start[j], 
                   pattern.end[j]), "+", seqInfo[i,])
        }))
      }
      if (pos.minus[1] > 0) {
        pattern.start <- as.numeric(pos.minus)
        pattern.end <- pattern.start + attr(pos.minus, "match.length") - 1
        patternSeq.found.minus <- 
          do.call(rbind, lapply(1:length(pattern.start), function(j){
          c(patternName, as.character(thispattern), 
            pattern.start[j], pattern.end[j], 
            substr(sequences[i], pattern.start[j],
                   pattern.end[j]), "-",  seqInfo[i,])
        }))
        if (pos.plus[1] > 0)
          rbind(patternSeq.found.plus, patternSeq.found.minus)
        else
          patternSeq.found.minus
      }
      else if (pos.plus[1] > 0)
        patternSeq.found.plus
    }))
    if (length(total) == 0)
      cat( c(patternName, as.character(thispattern), 
             "not found in the input sequences!\n\n"))
    else
    {
      total <- as.data.frame(total)
      colnames(total)[1:6] <- c("motif name", "motif", "motifStartOffset",
                                "motifEndOffset", "motif found", 
                                "motifFoundStrand")
      chr <- as.character(total$seqnames)
      motifstart <- as.numeric(total$start) + as.numeric(total[,3]) -1
      motifend <- as.numeric(total$start) + as.numeric(total[,4]) -1
      total <- cbind(chr = chr, motifStart = motifstart, 
                     motifEnd = motifend, total)
      for (i in 1:dim(total)[2])
        total[,i] <- unlist(total[,i])
    }
    total
  }

Any scripts or data that you put into this service are public.

ChIPpeakAnno documentation built on April 1, 2021, 6:01 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

ChIPpeakAnno
Batch annotation of the peaks identified from either ChIP-seq, ChIP-chip experiments or any experiments resulted in large number of chromosome ranges

R/summarizePatternInPeaks.R
In ChIPpeakAnno: Batch annotation of the peaks identified from either ChIP-seq, ChIP-chip experiments or any experiments resulted in large number of chromosome ranges

Defines functions getPosInSeqs

Try the ChIPpeakAnno package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ChIPpeakAnno Batch annotation of the peaks identified from either ChIP-seq, ChIP-chip experiments or any experiments resulted in large number of chromosome ranges

R/summarizePatternInPeaks.R In ChIPpeakAnno: Batch annotation of the peaks identified from either ChIP-seq, ChIP-chip experiments or any experiments resulted in large number of chromosome ranges

Defines functions getPosInSeqs

Try the ChIPpeakAnno package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

ChIPpeakAnno
Batch annotation of the peaks identified from either ChIP-seq, ChIP-chip experiments or any experiments resulted in large number of chromosome ranges

R/summarizePatternInPeaks.R
In ChIPpeakAnno: Batch annotation of the peaks identified from either ChIP-seq, ChIP-chip experiments or any experiments resulted in large number of chromosome ranges