R/seq_match_count.r

#' Find best matches for a pattern among a set of sequences
#'
#' This function is a convenience wrapper for \code{\link{vmatchpattern}}.
#' It is slow but returns a lot of information.
#'
#' @param pattern a vector of patterns to look in seq.stringset for.
#' @param seq.stringset a biostrings stringSet object.
#' @param max.mismatch the maximum number of mismatches allowed in seq.stringset matches to pattern.
#@ @param ret.match logical, should the match be returned?
#' @keywords transcript Ensembl
#' @export
#' @import Biostrings
find.best.matches <- function(pattern, seq.stringset, max.mismatch=2, ret.match=T) {
  ## basic alignment of input
  seq.stringset <- DNAStringSet(seq.stringset)
  
  fbm.single <- function(pattern) {
    ## set up output object
    out.df <- c(NA,NA,NA,NA)
    
    ## loop over the number of mismatches allowed in target sites
    for (mm.int in 0:max.mismatch) {
      ## do the actual matching
      match.pattern <- vmatchPattern(pattern=pattern, subject=seq.stringset, max.mismatch=mm.int)
      
      ## store the matches, if any, in out.df
      for (i in 1:length(match.pattern)) {
        l <- length(start(match.pattern[[i]]))
        if (l>0) {
          for (j in 1:l) {
            out.df <- rbind(out.df,c(names(seq.stringset)[i],start(match.pattern[[i]])[j],end(match.pattern[[i]])[j],mm.int))
          }
        }
      }
    }
    out.df <- na.omit(out.df)
    
    ## if any matches were found, format out.df
    if (!is.null(nrow(out.df))) {
      rownames(out.df) <- 1:nrow(out.df)
      colnames(out.df) <- c("ID","Start","End","Mismatch")
      out.df <- as.data.frame(out.df)
      for (i in 2:ncol(out.df)) {
        out.df[,i] <- as.numeric(as.character(out.df[,i]))
      }
      ## remove multiple entries (a 0mm site will appear also as a 1mm site, etc...)
      lt <- apply(out.df[,1:2],1,paste,collapse=":")
      mults <- names(table(lt)[table(lt)>1])
      while (length(mults)>0) {
        pos <- c(1:length(lt))[lt==mults[1]]
        pos <- pos[-which.min(out.df$Mismatch[pos])]
        out.df <- out.df[-pos,]
        lt <- apply(out.df[,1:2],1,paste,collapse=":")
        mults <- names(table(lt)[table(lt)>1])
      }
      ## reorder after gene/transcript ID
      out.df <- out.df[order(out.df$ID),]
      
      ## locate mismatch position
      if (ret.match) {
        mm.pat <- NA
        for (i in 1:nrow(out.df)) {
          seq.sub <- substr(seq.stringset[names(seq.stringset)%in%out.df$ID[i]][[1]],
                            out.df$Start[i],out.df$End[i])
          mm.pat <- c(mm.pat,as.character(seq.sub))
        }
        mm.pat <- na.omit(mm.pat)
      } else {
        mm.pat <- rep(NA,nrow(out.df))
      }
      out.df <- cbind(out.df, Match=mm.pat,Pattern=pattern)
    } else {
      out.df <- matrix(c(NA,NA,NA,NA,NA,pattern),ncol=6)
      rownames(out.df) <- 1:nrow(out.df)
      colnames(out.df) <- c("ID","Start","End","Mismatch","Match","Pattern")
    }
    
    return(out.df)
  }
  
  if (length(pattern)==1) {
    out.df <- fbm.single(pattern=pattern)
  } else {
    out.df.list <- lapply(pattern,fbm.single)
    out.df <- out.df.list[[1]] 
    for (i in 2:length(out.df.list)) {
      out.df <- rbind(out.df,out.df.list[[i]])
    }
  }
  return(out.df)
}

#' Count (mis)matches for a single pattern among a set of sequences
#'
#' This function is a convenience wrapper for \code{\link{vcountPattern}}.
#' It is fast but returns only counts.
#'
#' @param pattern a pattern to look in seq.stringset for.
#' @param seq.stringset a biostrings stringSet object.
#' @param max.mismatch the maximum number of mismatches allowed in seq.stringset matches to pattern.
#' @keywords count patterns
#' @export
#' @import Biostrings
count.singlepatt.multiseq <- function(pattern, seq.stringset, max.mismatch=2) {
  m <- vector("list",max.mismatch+1)
  names(m) <- max.mismatch:0
  m[[as.character(max.mismatch)]] <- vcountPattern(pattern = pattern, subject = seq.stringset, max.mismatch = max.mismatch)
  if (max.mismatch>0) {
    for (mm in (max.mismatch-1):0) {
      m[[as.character(mm)]] <- m[[as.character(mm+1)]] 
      tf <- m[[as.character(mm)]]>0
      m[[as.character(mm)]][tf] <- vcountPattern(pattern = pattern, subject = seq.stringset[tf], max.mismatch = mm)
    }
    for (mm in max.mismatch:1) {
      m[[as.character(mm)]][m[[as.character(mm-1)]]>0] <- m[[as.character(mm)]][m[[as.character(mm-1)]]>0] - m[[as.character(mm-1)]][m[[as.character(mm-1)]]>0]
    }
  }
  m <- simplify2array(m)
  m <- m[,ncol(m):1]
  rownames(m) <- names(seq.stringset)
  return(m)
}

#' Count (mis)matches for multiple patterns in a single sequence
#'
#' This function is a convenience wrapper for \code{\link{matchPDict}}.
#' It is fast (do parallel computation) but returns only counts.
#'
#' @param patterns a vector of patterns to look in seq.in for.
#' @param seq.in a biostrings string object.
#' @param max.mismatch the maximum number of mismatches allowed in seq.stringset matches to pattern.
#' @param parallel either True (default) or False, should counting be parallelized?
#' @keywords count patterns
#' @export
#' @import Biostrings
#' @import parallel
count.multipatt.singleseq <- function(patterns, seq.in, max.mismatch=2, parallel=T) {
  suppressWarnings(dict <- PDict(patterns, max.mismatch=max.mismatch))			# create a dictionary of the patterns
  md <- matchPDict(dict,subject=seq.in, max.mismatch=max.mismatch)				# match this against the premrna.seq.in (fast)
  seq.in.char <- as.character(seq.in)												# it is faster to work on character vectors
  count.mm.all <- function(i, patt.in) {											# this function parses the output to md
    md.in <- md[[i]]
    if (length(md.in)>0) {														# is there a hit?
      xs <- substring(seq.in.char, start(md.in), end(md.in))					# slice out the hit
      if (max(end(md.in))>nchar(seq.in.char)) {								# check that it is within the sequence, otherwise add x'ses
        ldiff <- end(md.in)-nchar(seq.in.char)
        xs <- paste(xs,paste(rep("x",ldiff),collapse=""), sep="")
      }
      count.mm <- function(x,pattern) {										#compare two sequences of equal length and return # mismatches
        tf <- strsplit(as.character(x),"")[[1]] != strsplit(as.character(pattern),"")[[1]]
        sum(tf)
      }
      mp <- sapply(xs, count.mm, pattern=patt.in)
      mp.out <- table(factor(mp, levels=0:max.mismatch))
    } else {
      mp.out <- c("0"=0, "1"=0, "2"=0)
    }
    return(mp.out)
  }
  if (parallel) {
    nc <- detectCores()																# number of cores on computer
    a <- t(mcmapply(count.mm.all, i=1:length(md), patt.in=patterns, mc.cores=nc))		
  } else {
    a <- t(mapply(count.mm.all, i=1:length(md), patt.in=patterns))
  }
  rownames(a) <- patterns
  return(a)
}


#' Cut a sequence into all possible (oligo-) target sites in a given range interval
#' 
#' The output of this function is used as a starting point for TargetSurveys.
#' 
#' @param seq a DNAString
#' @param max the maximal length
#' @param min the minimal length
#' @return character vector with all the target sites
enumerate.target.sites = function(seq, max=16, min=8){
  d = data.frame()
  for(i in min:max){
    d = rbind(d,data.frame(
      start = (i:width(seq)) - i+1,
      end = i:width(seq),
      length = i
    )
    )
  }
  d$target_seq <- substring(seq,d$start,d$end)
  d$oligo_seq <- as.character(reverseComplement(DNAStringSet(d$target_seq)))
  d$target_seq <- as.character(d$target_seq)
  return(d)    
}

#### Functions that takes a vector of target sites (char) and returns vector or data.frame of results in the same order
#' Find the occurrences of each of a character vector of sequences (e.g. target sites) in a DNAStringSet (e.g the transcriptome).
#' 
#' This function is used by TargetSurveyor
#' This function wraps Biostring functions to find the occurrence of targets sites in a sequence database
#' @param target.sites a character vector of target sites, e.g. from the appropriate column of a target survey
#' @param seqdb a DNAStringSet. If ensembl=TRUE, the sequence names needs to be formatted in a special way for the summarize function to work
#' @param seqdb.name a character string to be used for generating column headers in the output
#' @param summarize if TRUE (default) the results are processed to count the unique transcripts, genes and symbols that are hit
#' @param the exact number of mismatches in each occurrence (defaults to 0)
#' @export
#' @import Biostrings
#' @return a data.frame with the same number of rows and ordering as target.sites
occurrences = function(target.sites, seqdb, seqdb.name="unnamed_seqdb", summarize=TRUE, 
                       no.mismatch=0, ...){
  vcountPDict2 <- function(dict, seqdb, no.mismatch) {
		tst <- vwhichPDict(pdict=dict, subject=seqdb, min.mismatch=no.mismatch, max.mismatch=no.mismatch)
		hits2 <- matrix(0,ncol=length(seqdb),nrow=length(sites.len))
		for (i in 1:length(tst)) {
			hits2[tst[[i]],i] <- 1
		}
		return(hits2)
  }

  lengths = nchar(target.sites)
  uni.l <- sort(unique(lengths))
  d=data.frame(length=lengths)
  t=data.frame()
  result=NULL
  for(l in uni.l){
    sites.len = target.sites[lengths==l]
    
    dict = PDict(sites.len, max.mismatch=no.mismatch)
    message(paste("Checking ", l, "-mers", sep=""))
    hits = vcountPDict2(dict, seqdb, no.mismatch)  
    
    if(summarize){ # only the total number of hits in the whole seqdb is reported
      message("Summarizing")
      d[[paste(seqdb.name, "_sum", sep='')]][lengths==l]=apply(hits, 1, sum, ...) 
    }else{ # a column is added with hits for each individual sequence in seqdb. Note that the 
           #names of the seqs are not returned, as these usually needs to be parsed. 
           #Add them using colnames
      t=rbind(t, as.data.frame(hits))
    }
  } 
  #RETURNS
  if(summarize){
      return(as.data.frame(d)[,c(2:ncol(d))])
  }else{
    #warning("This method assumes that the target sites are ordered by length")
    return(t)
  }
}



#' Counts the number of occurrences of a list of (short target site) sequences in a SINGLE other (transcript) sequence, allowing for mismatches
#'
#' @param target.sites character vector or DNAStringSet
#' @param seq.in bla
#' @param max.mismatch default=2
#' @import Biostrings
#' @import parallel
#' @export
occurrences.singleseq <- function(target.sites, seq.in, max.mismatch=2) {
  nc <- detectCores()																# number of cores on computer	
  patterns <- as.character(target.sites)	# get the patterns to be matched															# detect how many cores are available and use that
  suppressWarnings(dict <- PDict(patterns, max.mismatch=max.mismatch))			# create a dictionary of the patterns
  md <- matchPDict(dict,subject=seq.in, max.mismatch=max.mismatch)				# match this against the premrna.seq.in (fast)
  seq.in.char <- as.character(seq.in)												# it is faster to work on character vectors
  count.mm.all <- function(i, patt.in) {											# this function parses the output to md
    md.in <- md[[i]]
    if (length(md.in)>0) {														# is there a hit?
      xs <- substring(seq.in.char, start(md.in), end(md.in))					# slice out the hit
      if (max(end(md.in))>nchar(seq.in.char)) {								# check that it is within the sequence, otherwise add x'ses
        ldiff <- end(md.in)-nchar(seq.in.char)
        xs <- paste(xs,paste(rep("x",ldiff),collapse=""), sep="")
      }
      count.mm <- function(x,pattern) {										#compare two sequences of equal length and return # mismatches
        tf <- strsplit(as.character(x),"")[[1]] != strsplit(as.character(pattern),"")[[1]]
        sum(tf)
      }
      mp <- sapply(xs, count.mm, pattern=patt.in)
      mp.out <- table(factor(mp, levels=0:max.mismatch))
    } else {
      mp.out <- c("0"=0, "1"=0, "2"=0)
    }
    return(mp.out)
  }
  a <- t(mcmapply(count.mm.all, i=1:length(md), patt.in=patterns, mc.cores=nc))		
  rownames(a) <- patterns
  return(a)
}


#' Detects simple sequence patterns with regexp
#' @import Biostrings
#' @export
simple.patterns = function(oligos){
  d = data.frame (CGs = vcountPattern("CG",oligos),
                  CGs_not_in_3bp_flanks = vcountPattern("CG", substr(oligos,4, width(oligos)-3)),
                  four_in_a_row = vcountPattern("AAAA", oligos) + vcountPattern("TTTT", oligos) + vcountPattern("GGGG", oligos) + vcountPattern("CCCC", oligos),
                  CTGT_count = vcountPattern("CTGT", oligos),
                  Five_prime_GGG = regexpr("^GGG",oligos)!=-1,
                  Five_prime_GG = regexpr("^GG",oligos)!=-1
  )
  return(d)
}
Santaris/seqtools documentation built on May 9, 2019, 12:44 p.m.