R/src_FindSequenceOccurrence.R

Defines functions FindSequenceOccurrence

Documented in FindSequenceOccurrence

#' Find occurrences of a DNA sequence in a BSgenome
#' 
#' Function takes a DNA sequence and then returns a GRanges object with the position
#' of that sequence (separated by strand) in a given BSgenome. Optionally one can
#' set a number of allowed mismatches. IUPAC ambiguity characters are supported. 
#' 
#' @param Sequence A DNA sequence
#' @param BSGenome a BSgenome object
#' @param StrictAmbiguity logical, how to deal with ambiguity characters, see Details.
#' @param MaxMismatches number of allowed mismatches
#' @param Cores number of cores for chromosome-wise search parallelization via mclapply
#' 
#' @details 
#' StrictAmbiguity => if TRUE (so strict) then an amb. character in the query, e.g. N is only counted as a match
#' if the reference contains the same character, so N in query must be N in reference. Else, character in reference 
#' can be any of the encoded characters, so N can be A/T/C/G in the reference.
#' 
#' @examples 
#' library(BSgenome.Ecoli.NCBI.20080805)
#' FindSequenceOccurrence(Sequence = "AGCTAAATTTGG", 
#'                        MaxMismatches = 1,
#'                        BSGenome = BSgenome.Ecoli.NCBI.20080805)
#'  
#' @author Alexander Toenges
#' 
#' @export
FindSequenceOccurrence <- function(Sequence, 
                                   BSGenome, 
                                   StrictAmbiguity = FALSE,
                                   MaxMismatches = 0,
                                   Cores=detectCores()){
  
  require(Biostrings)
  require(GenomicRanges)
  options(scipen=999)
  
  ## remove whitespaces from input DNA sequence
  Sequence <- DNAString( gsub(" ", "", as.character(Sequence)) )
  
  ## function to match sequence to genome:
  SearchGeneric <- function(dnastring) {
    
    ## parallel over all chromosomes using mclapply
    all.matches <- mclapply(1:length(seqnames(BSGenome)), function(x) {
      
      ## match string to chromosome using matchPattern
      tmp.match <- matchPattern(dnastring, BSGenome[[x]], 
                                max.mismatch = MaxMismatches, fixed = StrictAmbiguity)
      
      ## summarize match coordinates into GRanges format
      if (length(tmp.match) > 0) {
        tmp.gr    <- GRanges(seqnames = seqnames(BSGenome)[x],
                             ranges = ranges(tmp.match))
        elementMetadata(tmp.gr) <- data.frame(DNAStringSet(tmp.match))[,1]
        return(tmp.gr)
      }}, mc.cores = Cores)
    if(!is.null(unlist(all.matches))) return(all.matches)
    if(is.null(unlist(all.matches))) return(NULL)
  }
  
  message("Searching top strand")
  topstr <- SearchGeneric(Sequence)
  message("Searching bottom strand")
  botstr <- SearchGeneric(reverseComplement(Sequence))
  
  ## if no matches were found
  if(is.null(topstr) && is.null(botstr)) {
    message("No matches found on either strand")
    return(NULL)
  }
  
  ## combine output for all chromosomes
  if(!is.null(topstr)){
    tmp.top <- suppressWarnings(do.call("c", unlist(topstr)))
    strand(tmp.top) <- "+"
  } else tmp.top <- GRanges()
  if(!is.null(botstr)){
    tmp.bot <- suppressWarnings(do.call("c", unlist(botstr)))
    strand(tmp.bot) <- "-"
  } else tmp.bot <- GRanges()
  
  return(suppressWarnings(sort(c(tmp.top, tmp.bot))))
  
}
ATpoint/myRfunctions documentation built on April 24, 2020, 9:49 p.m.