#' Find occurrences of a DNA sequence in a BSgenome
#'
#' Function takes a DNA sequence and then returns a GRanges object with the position
#' of that sequence (separated by strand) in a given BSgenome. Optionally one can
#' set a number of allowed mismatches. IUPAC ambiguity characters are supported.
#'
#' @param Sequence A DNA sequence
#' @param BSGenome a BSgenome object
#' @param StrictAmbiguity logical, how to deal with ambiguity characters, see Details.
#' @param MaxMismatches number of allowed mismatches
#' @param Cores number of cores for chromosome-wise search parallelization via mclapply
#'
#' @details
#' StrictAmbiguity => if TRUE (so strict) then an amb. character in the query, e.g. N is only counted as a match
#' if the reference contains the same character, so N in query must be N in reference. Else, character in reference
#' can be any of the encoded characters, so N can be A/T/C/G in the reference.
#'
#' @examples
#' library(BSgenome.Ecoli.NCBI.20080805)
#' FindSequenceOccurrence(Sequence = "AGCTAAATTTGG",
#' MaxMismatches = 1,
#' BSGenome = BSgenome.Ecoli.NCBI.20080805)
#'
#' @author Alexander Toenges
#'
#' @export
FindSequenceOccurrence <- function(Sequence,
BSGenome,
StrictAmbiguity = FALSE,
MaxMismatches = 0,
Cores=detectCores()){
require(Biostrings)
require(GenomicRanges)
options(scipen=999)
## remove whitespaces from input DNA sequence
Sequence <- DNAString( gsub(" ", "", as.character(Sequence)) )
## function to match sequence to genome:
SearchGeneric <- function(dnastring) {
## parallel over all chromosomes using mclapply
all.matches <- mclapply(1:length(seqnames(BSGenome)), function(x) {
## match string to chromosome using matchPattern
tmp.match <- matchPattern(dnastring, BSGenome[[x]],
max.mismatch = MaxMismatches, fixed = StrictAmbiguity)
## summarize match coordinates into GRanges format
if (length(tmp.match) > 0) {
tmp.gr <- GRanges(seqnames = seqnames(BSGenome)[x],
ranges = ranges(tmp.match))
elementMetadata(tmp.gr) <- data.frame(DNAStringSet(tmp.match))[,1]
return(tmp.gr)
}}, mc.cores = Cores)
if(!is.null(unlist(all.matches))) return(all.matches)
if(is.null(unlist(all.matches))) return(NULL)
}
message("Searching top strand")
topstr <- SearchGeneric(Sequence)
message("Searching bottom strand")
botstr <- SearchGeneric(reverseComplement(Sequence))
## if no matches were found
if(is.null(topstr) && is.null(botstr)) {
message("No matches found on either strand")
return(NULL)
}
## combine output for all chromosomes
if(!is.null(topstr)){
tmp.top <- suppressWarnings(do.call("c", unlist(topstr)))
strand(tmp.top) <- "+"
} else tmp.top <- GRanges()
if(!is.null(botstr)){
tmp.bot <- suppressWarnings(do.call("c", unlist(botstr)))
strand(tmp.bot) <- "-"
} else tmp.bot <- GRanges()
return(suppressWarnings(sort(c(tmp.top, tmp.bot))))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.