#' Find best matches for a pattern among a set of sequences
#'
#' This function is a convenience wrapper for \code{\link{vmatchpattern}}.
#' It is slow but returns a lot of information.
#'
#' @param pattern a vector of patterns to look in seq.stringset for.
#' @param seq.stringset a biostrings stringSet object.
#' @param max.mismatch the maximum number of mismatches allowed in seq.stringset matches to pattern.
#@ @param ret.match logical, should the match be returned?
#' @keywords transcript Ensembl
#' @export
#' @import Biostrings
find.best.matches <- function(pattern, seq.stringset, max.mismatch=2, ret.match=T) {
## basic alignment of input
seq.stringset <- DNAStringSet(seq.stringset)
fbm.single <- function(pattern) {
## set up output object
out.df <- c(NA,NA,NA,NA)
## loop over the number of mismatches allowed in target sites
for (mm.int in 0:max.mismatch) {
## do the actual matching
match.pattern <- vmatchPattern(pattern=pattern, subject=seq.stringset, max.mismatch=mm.int)
## store the matches, if any, in out.df
for (i in 1:length(match.pattern)) {
l <- length(start(match.pattern[[i]]))
if (l>0) {
for (j in 1:l) {
out.df <- rbind(out.df,c(names(seq.stringset)[i],start(match.pattern[[i]])[j],end(match.pattern[[i]])[j],mm.int))
}
}
}
}
out.df <- na.omit(out.df)
## if any matches were found, format out.df
if (!is.null(nrow(out.df))) {
rownames(out.df) <- 1:nrow(out.df)
colnames(out.df) <- c("ID","Start","End","Mismatch")
out.df <- as.data.frame(out.df)
for (i in 2:ncol(out.df)) {
out.df[,i] <- as.numeric(as.character(out.df[,i]))
}
## remove multiple entries (a 0mm site will appear also as a 1mm site, etc...)
lt <- apply(out.df[,1:2],1,paste,collapse=":")
mults <- names(table(lt)[table(lt)>1])
while (length(mults)>0) {
pos <- c(1:length(lt))[lt==mults[1]]
pos <- pos[-which.min(out.df$Mismatch[pos])]
out.df <- out.df[-pos,]
lt <- apply(out.df[,1:2],1,paste,collapse=":")
mults <- names(table(lt)[table(lt)>1])
}
## reorder after gene/transcript ID
out.df <- out.df[order(out.df$ID),]
## locate mismatch position
if (ret.match) {
mm.pat <- NA
for (i in 1:nrow(out.df)) {
seq.sub <- substr(seq.stringset[names(seq.stringset)%in%out.df$ID[i]][[1]],
out.df$Start[i],out.df$End[i])
mm.pat <- c(mm.pat,as.character(seq.sub))
}
mm.pat <- na.omit(mm.pat)
} else {
mm.pat <- rep(NA,nrow(out.df))
}
out.df <- cbind(out.df, Match=mm.pat,Pattern=pattern)
} else {
out.df <- matrix(c(NA,NA,NA,NA,NA,pattern),ncol=6)
rownames(out.df) <- 1:nrow(out.df)
colnames(out.df) <- c("ID","Start","End","Mismatch","Match","Pattern")
}
return(out.df)
}
if (length(pattern)==1) {
out.df <- fbm.single(pattern=pattern)
} else {
out.df.list <- lapply(pattern,fbm.single)
out.df <- out.df.list[[1]]
for (i in 2:length(out.df.list)) {
out.df <- rbind(out.df,out.df.list[[i]])
}
}
return(out.df)
}
#' Count (mis)matches for a single pattern among a set of sequences
#'
#' This function is a convenience wrapper for \code{\link{vcountPattern}}.
#' It is fast but returns only counts.
#'
#' @param pattern a pattern to look in seq.stringset for.
#' @param seq.stringset a biostrings stringSet object.
#' @param max.mismatch the maximum number of mismatches allowed in seq.stringset matches to pattern.
#' @keywords count patterns
#' @export
#' @import Biostrings
count.singlepatt.multiseq <- function(pattern, seq.stringset, max.mismatch=2) {
m <- vector("list",max.mismatch+1)
names(m) <- max.mismatch:0
m[[as.character(max.mismatch)]] <- vcountPattern(pattern = pattern, subject = seq.stringset, max.mismatch = max.mismatch)
if (max.mismatch>0) {
for (mm in (max.mismatch-1):0) {
m[[as.character(mm)]] <- m[[as.character(mm+1)]]
tf <- m[[as.character(mm)]]>0
m[[as.character(mm)]][tf] <- vcountPattern(pattern = pattern, subject = seq.stringset[tf], max.mismatch = mm)
}
for (mm in max.mismatch:1) {
m[[as.character(mm)]][m[[as.character(mm-1)]]>0] <- m[[as.character(mm)]][m[[as.character(mm-1)]]>0] - m[[as.character(mm-1)]][m[[as.character(mm-1)]]>0]
}
}
m <- simplify2array(m)
m <- m[,ncol(m):1]
rownames(m) <- names(seq.stringset)
return(m)
}
#' Count (mis)matches for multiple patterns in a single sequence
#'
#' This function is a convenience wrapper for \code{\link{matchPDict}}.
#' It is fast (do parallel computation) but returns only counts.
#'
#' @param patterns a vector of patterns to look in seq.in for.
#' @param seq.in a biostrings string object.
#' @param max.mismatch the maximum number of mismatches allowed in seq.stringset matches to pattern.
#' @param parallel either True (default) or False, should counting be parallelized?
#' @keywords count patterns
#' @export
#' @import Biostrings
#' @import parallel
count.multipatt.singleseq <- function(patterns, seq.in, max.mismatch=2, parallel=T) {
suppressWarnings(dict <- PDict(patterns, max.mismatch=max.mismatch)) # create a dictionary of the patterns
md <- matchPDict(dict,subject=seq.in, max.mismatch=max.mismatch) # match this against the premrna.seq.in (fast)
seq.in.char <- as.character(seq.in) # it is faster to work on character vectors
count.mm.all <- function(i, patt.in) { # this function parses the output to md
md.in <- md[[i]]
if (length(md.in)>0) { # is there a hit?
xs <- substring(seq.in.char, start(md.in), end(md.in)) # slice out the hit
if (max(end(md.in))>nchar(seq.in.char)) { # check that it is within the sequence, otherwise add x'ses
ldiff <- end(md.in)-nchar(seq.in.char)
xs <- paste(xs,paste(rep("x",ldiff),collapse=""), sep="")
}
count.mm <- function(x,pattern) { #compare two sequences of equal length and return # mismatches
tf <- strsplit(as.character(x),"")[[1]] != strsplit(as.character(pattern),"")[[1]]
sum(tf)
}
mp <- sapply(xs, count.mm, pattern=patt.in)
mp.out <- table(factor(mp, levels=0:max.mismatch))
} else {
mp.out <- c("0"=0, "1"=0, "2"=0)
}
return(mp.out)
}
if (parallel) {
nc <- detectCores() # number of cores on computer
a <- t(mcmapply(count.mm.all, i=1:length(md), patt.in=patterns, mc.cores=nc))
} else {
a <- t(mapply(count.mm.all, i=1:length(md), patt.in=patterns))
}
rownames(a) <- patterns
return(a)
}
#' Cut a sequence into all possible (oligo-) target sites in a given range interval
#'
#' The output of this function is used as a starting point for TargetSurveys.
#'
#' @param seq a DNAString
#' @param max the maximal length
#' @param min the minimal length
#' @return character vector with all the target sites
enumerate.target.sites = function(seq, max=16, min=8){
d = data.frame()
for(i in min:max){
d = rbind(d,data.frame(
start = (i:width(seq)) - i+1,
end = i:width(seq),
length = i
)
)
}
d$target_seq <- substring(seq,d$start,d$end)
d$oligo_seq <- as.character(reverseComplement(DNAStringSet(d$target_seq)))
d$target_seq <- as.character(d$target_seq)
return(d)
}
#### Functions that takes a vector of target sites (char) and returns vector or data.frame of results in the same order
#' Find the occurrences of each of a character vector of sequences (e.g. target sites) in a DNAStringSet (e.g the transcriptome).
#'
#' This function is used by TargetSurveyor
#' This function wraps Biostring functions to find the occurrence of targets sites in a sequence database
#' @param target.sites a character vector of target sites, e.g. from the appropriate column of a target survey
#' @param seqdb a DNAStringSet. If ensembl=TRUE, the sequence names needs to be formatted in a special way for the summarize function to work
#' @param seqdb.name a character string to be used for generating column headers in the output
#' @param summarize if TRUE (default) the results are processed to count the unique transcripts, genes and symbols that are hit
#' @param the exact number of mismatches in each occurrence (defaults to 0)
#' @export
#' @import Biostrings
#' @return a data.frame with the same number of rows and ordering as target.sites
occurrences = function(target.sites, seqdb, seqdb.name="unnamed_seqdb", summarize=TRUE,
no.mismatch=0, ...){
vcountPDict2 <- function(dict, seqdb, no.mismatch) {
tst <- vwhichPDict(pdict=dict, subject=seqdb, min.mismatch=no.mismatch, max.mismatch=no.mismatch)
hits2 <- matrix(0,ncol=length(seqdb),nrow=length(sites.len))
for (i in 1:length(tst)) {
hits2[tst[[i]],i] <- 1
}
return(hits2)
}
lengths = nchar(target.sites)
uni.l <- sort(unique(lengths))
d=data.frame(length=lengths)
t=data.frame()
result=NULL
for(l in uni.l){
sites.len = target.sites[lengths==l]
dict = PDict(sites.len, max.mismatch=no.mismatch)
message(paste("Checking ", l, "-mers", sep=""))
hits = vcountPDict2(dict, seqdb, no.mismatch)
if(summarize){ # only the total number of hits in the whole seqdb is reported
message("Summarizing")
d[[paste(seqdb.name, "_sum", sep='')]][lengths==l]=apply(hits, 1, sum, ...)
}else{ # a column is added with hits for each individual sequence in seqdb. Note that the
#names of the seqs are not returned, as these usually needs to be parsed.
#Add them using colnames
t=rbind(t, as.data.frame(hits))
}
}
#RETURNS
if(summarize){
return(as.data.frame(d)[,c(2:ncol(d))])
}else{
#warning("This method assumes that the target sites are ordered by length")
return(t)
}
}
#' Counts the number of occurrences of a list of (short target site) sequences in a SINGLE other (transcript) sequence, allowing for mismatches
#'
#' @param target.sites character vector or DNAStringSet
#' @param seq.in bla
#' @param max.mismatch default=2
#' @import Biostrings
#' @import parallel
#' @export
occurrences.singleseq <- function(target.sites, seq.in, max.mismatch=2) {
nc <- detectCores() # number of cores on computer
patterns <- as.character(target.sites) # get the patterns to be matched # detect how many cores are available and use that
suppressWarnings(dict <- PDict(patterns, max.mismatch=max.mismatch)) # create a dictionary of the patterns
md <- matchPDict(dict,subject=seq.in, max.mismatch=max.mismatch) # match this against the premrna.seq.in (fast)
seq.in.char <- as.character(seq.in) # it is faster to work on character vectors
count.mm.all <- function(i, patt.in) { # this function parses the output to md
md.in <- md[[i]]
if (length(md.in)>0) { # is there a hit?
xs <- substring(seq.in.char, start(md.in), end(md.in)) # slice out the hit
if (max(end(md.in))>nchar(seq.in.char)) { # check that it is within the sequence, otherwise add x'ses
ldiff <- end(md.in)-nchar(seq.in.char)
xs <- paste(xs,paste(rep("x",ldiff),collapse=""), sep="")
}
count.mm <- function(x,pattern) { #compare two sequences of equal length and return # mismatches
tf <- strsplit(as.character(x),"")[[1]] != strsplit(as.character(pattern),"")[[1]]
sum(tf)
}
mp <- sapply(xs, count.mm, pattern=patt.in)
mp.out <- table(factor(mp, levels=0:max.mismatch))
} else {
mp.out <- c("0"=0, "1"=0, "2"=0)
}
return(mp.out)
}
a <- t(mcmapply(count.mm.all, i=1:length(md), patt.in=patterns, mc.cores=nc))
rownames(a) <- patterns
return(a)
}
#' Detects simple sequence patterns with regexp
#' @import Biostrings
#' @export
simple.patterns = function(oligos){
d = data.frame (CGs = vcountPattern("CG",oligos),
CGs_not_in_3bp_flanks = vcountPattern("CG", substr(oligos,4, width(oligos)-3)),
four_in_a_row = vcountPattern("AAAA", oligos) + vcountPattern("TTTT", oligos) + vcountPattern("GGGG", oligos) + vcountPattern("CCCC", oligos),
CTGT_count = vcountPattern("CTGT", oligos),
Five_prime_GGG = regexpr("^GGG",oligos)!=-1,
Five_prime_GG = regexpr("^GG",oligos)!=-1
)
return(d)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.