R/ampCounter.R

#' Functions to predict the amplification generated by a strand displacement amplification
#'
#' Given a list of forward and reverse primer binding sites predict the amplification that will result from a strand displacement reaction
#'
#' The main functions are:
#'      \describe{
#'        \item{\code{\link{countAmplifications}}:}{to predict the number of amplifications for a single region}
#'        \item{\code{\link{predictAmplifications}}:}{to predict the number of amplifications across a whole genome}
#'        \item{\code{\link{enumerateAmplifications}}:}{to generate the predicted fragments for small sets of primers}
#'      }
##'
#' @docType package
#' @name ampcountr
#' @references \url{https://en.wikipedia.org/wiki/Multiple_displacement_amplification}
#' @author Scott Sherrill-Mix, \email{shescott@@upenn.edu}
#' @examples
#' forwards<-ampcountr:::generateRandomPrimers(100000,1000)
#' reverses<-ampcountr:::generateRandomPrimers(100000,1000)
#' amps<-predictAmplifications(forwards,reverses,maxLength=10000)
#' plot(1,1,type='n',xlim=c(1,100000),ylim=c(1,max(amps)),
#'      xlab='Position',ylab='Amplifications',log='y')
#' segments(amps$start,amps$amplification,amps$end,amps$amplification)
#' segments(amps$end[-nrow(amps)],amps$amplification[-nrow(amps)],
#'          amps$start[-1],amps$amplification[-1])
#' abline(v=forwards,col='#FF000044',lty=2)
#' abline(v=reverses,col='#0000FF44',lty=2)
#'
#' frags<-enumerateAmplifications(c(10,20,30),c(15,25,35),maxLength=40)
#' plotFrags(frags)
NULL

#' Enumerate the multiple strand displacement fragments generated given a set of primers
#' 
#' Take two vectors of forward and reverse primer binding locations
#' generate a list of amplification products predicted to be
#' generated by multiple strand displacement amplification.
#'
#' @param forwards Sorted positions of the start of the primers landing sites on the forward strand of the target sequence (5'-most base in the genome plus strand)
#' @param reverses Sorted positions of the end of the primers landing sites on the reverse strand of the target sequence (3'-most base in the genome plus strand)
#' @param strand "+" or "-" for indicating what strand the target sequence originates from. Note if starting with double stranded fragments then the function should be run once with "+" and once with "-" and the results concatenated.
#' @param maxLength The expected max length fragment the polymerase can generate in one run.
#' @param minLength Discard fragments shorter than minLength (can help with run time if many short fragments are generated)
#' @param maxTotalLength Discard fragments for which the sum of all the ancestors bases are longer than maxTotalLength. If the polymerase has some probability of falling off each base amplified then it becomes less and less likely to reach a deeply amplified product. Using maxTotalLength to end the recursion once a certain total length is reach can help with run time.
#' @param vocal TRUE or FALSE Output progress?
#' @param fragmentStart The 5' end of the current fragment (for use in recursion more than manually)
#' @param fragmentEnd The 3' end of the current fragment (for use in recursion more than manually)
#' @param baseName Prefix for tracking fragment ancestors (for use in recursion more than manually)
#' @param previousLength How many bases need to be amplified to reach this fragment (for use in recursion more than manually)
#' @param fullGenome If TRUE then do not add the number of bases necessary to reach the primers to the previousLength (for use in recursion more than manually)
#'
#' @return A data frame with columns start, end, strand, name, previousLength and length. Start and end and strand are the positions of 5' and 3' end on the appropriate strand of the target sequence. Name is a concatenation of the ancestors of that fragment. Previous length is the sum of the fragment lengths of the ancestors of this fragment. Length is the width of this fragment. Returns NULL if no amplifications.
#'
#' @concept multiple strand displacement amplification fragment prediction recursive recursion #'
#' @references \url{https://en.wikipedia.org/wiki/Multiple_displacement_amplification}
#' @seealso \code{\link{countAmplifications}}
#'
#' @export
#' 
#' @examples
#' enumerateAmplifications(c(10,20,30),c(40,50,60),maxLength=45)
enumerateAmplifications<-function(forwards,reverses,strand='+',maxLength=50000,minLength=1,maxTotalLength=Inf,fragmentStart=1,fragmentEnd=Inf,baseName='',vocal=FALSE,previousLength=0,fullGenome=TRUE){
  if(vocal&&runif(1)<.001)cat('.')
  if(vocal&&runif(1)<.0001)cat(sprintf(' %d ',fragmentStart))
  if(any(diff(forwards)<1))forwards<-sort(forwards)
  if(any(diff(reverses)<1))reverses<-sort(reverses)
  if(strand=='+'){
    thisStarts<-forwards[forwards>=fragmentStart&forwards<=fragmentEnd] 
    if(length(thisStarts)==0)return(NULL)
    thisEnds<-thisStarts+maxLength-1
    thisEnds[thisEnds>=fragmentEnd]<-fragmentEnd
    isTerminal<-diff(c(-Inf,thisStarts))+1>maxLength
    thisRequiredBases<-fragmentEnd-thisStarts+1
  }else{
    thisEnds<-reverses[reverses>=fragmentStart&reverses<=fragmentEnd] 
    if(length(thisEnds)==0)return(NULL)
    thisStarts<-thisEnds-maxLength+1
    thisStarts[thisStarts<=fragmentStart]<-fragmentStart
    isTerminal<-diff(c(thisEnds,Inf))+1> maxLength
    thisRequiredBases<-thisEnds-fragmentStart+1
  }
  if(fullGenome)thisRequiredBases<-0
  
  nFrags<-length(thisStarts)
  nDigits<-ceiling(log10(nFrags+1))
  sprintfPattern<-sprintf('%%s%s%%0%dd',ifelse(baseName=='','','_'),nDigits)
  out<-data.frame(
    'start'=thisStarts,
    'end'=thisEnds,
    'strand'=strand,
    'name'=sprintf(sprintfPattern,baseName,1:nFrags),
    'previousLength'=previousLength+thisRequiredBases,
    'length'=thisEnds-thisStarts+1,
    stringsAsFactors=FALSE
  )
  #filter out shorts
  out<-out[out$length>=minLength,]
  #terminate fragments with too many amplifications to have much weight
  isTerminal<-isTerminal|out$previousLength+out$length>=maxTotalLength
  if(any(!isTerminal)){
    daughters<-do.call(rbind,mapply(function(start,end,name,previousLength){
      enumerateAmplifications(forwards, reverses, strand=ifelse(strand=='+','-','+'), start, end, maxLength=maxLength, baseName=name,vocal=vocal,previousLength=previousLength,maxTotalLength=maxTotalLength,minLength=minLength,fullGenome=FALSE)
    },
    out[!isTerminal,'start'],out[!isTerminal,'end'],out[!isTerminal,'name'],out[!isTerminal,'previousLength'],SIMPLIFY=FALSE))
    out<-rbind(out,daughters)
  }
  return(out)
}

#' Plot fragments generated from multiple strand displacement
#' 
#' Take a data.frame with columns start, end, strand  (e.g. the output of \code{\link{enumerateAmplifications}}) and plot
#' an arrow representation of the fragments to the current device.
#'
#' @param frags A data.frame with columns start, end, strand  (and name if label is TRUE) e.g. the output of \code{\link{enumerateAmplifications}}
#' @param label TRUE or FALSE if fragments should be labeled with name from the name column of frags
#'
#' @return NULL. Draws plot to current device
#'
#' @concept multiple strand displacement amplification fragment plot arrows 
#'
#' @seealso \code{\link{enumerateAmplifications}}
#'
#' @export
#' 
#' @examples
#' frags<-enumerateAmplifications(c(10,20,30),c(40,50,60))
#' plotFrags(frags)
plotFrags<-function(frags,label=TRUE){
  nFrags<-ifelse(is.null(frags),0,nrow(frags))
  xlim<-c(min(frags$start,Inf),max(frags$end,-Inf))
  if(xlim[1]==Inf)xlim[1]<-1
  if(xlim[2]==-Inf)xlim[2]<-xlim[1]+1
  plot(1,1,type='n',xlim=xlim,ylim=c(1,nFrags)+c(-.5,.5),xlab='Genome position (nt)',ylab='Fragment',yaxs='i')
  if(nFrags>0)arrows(ifelse(frags$strand=='+',frags$start,frags$end),1:nFrags,ifelse(frags$strand=='+',frags$end,frags$start),1:nFrags,length=.02)
  if(label&nFrags>0)text(apply(frags[,c('start','end')],1,mean),1:nFrags,frags$name,col='#00000077',adj=c(0.5,0),cex=.6)
  return(NULL)
}

#' Generate random primers for testing
#' 
#' Generate random position for primers. Mostly for internal testing and examples
#'
#' @param genomeSize Max position for primers
#' @param frequency About how much space between primers. Function will generate approximately genomeSize/frequency primers
#'
#' @return sorted list of positions
#'
#' @keywords internal
#'
#' @examples
#' ampcountr:::generateRandomPrimers(10000,100)
generateRandomPrimers<-function(genomeSize,frequency){
  nPrimers<-round(genomeSize/frequency)
  out<-unique(sort(ceiling(runif(nPrimers,1,genomeSize))))
  return(out)
}

#' Fast calculation of expected multiple strand displacement amplification
#' 
#' Uses dynamic programing to build a matrix of the expected amplifications for positions with a given a number 
#' of forward primers upstream and reverse primers downstream and within range of the polymerase.
#'
#' @param nForwards Calculate a matrix from zero to this many forward primers
#' @param nReverses Calculate a matrix from zero to this many reverse primers
#'
#' @return A nForwards+1 row, nReverses+1 column matrix of the expected amplifications for each combination of numbers
#' of forward and reverse primers. Row 1 is for 0 forward primers within range, row 2 is for 1 forward primer,
#' column 1 is for 0 reverse primers, column 2 is for 1 reverse primers ...
#'
#' @concept multiple strand displacement amplification fragment prediction dynamic programming
#'
#' @seealso \code{\link{enumerateAmplifications}}
#'
#' @export
#' 
#' @examples
#' generateAmplificationTable(20,20)
generateAmplificationTable<-function(nForwards=10,nReverses=10){
  out<-matrix(NA,nrow=nForwards+1,ncol=nReverses+1)
  rownames(out)<-0:nForwards
  colnames(out)<-0:nReverses
  out['0',]<-0
  out['1',]<-1
  out[,'0']<-0:nForwards
  out[1+1:nForwards,'1']<-out[1+1:nForwards,'0']*2-1
  for(ii in 1+2:nForwards){
    for(jj in 1+2:nReverses){
      out[ii,jj]<-out[ii-1,jj]+out[ii,jj-1]+1
    }
  }
  return(out)
}

#' Calculate expected multiple strand displacement amplification using lookup table
#' 
#' Uses the amplificationLookup table stored in the package data to predict amplification based on the number of forward and reverse primers spanning a region. To keep the lookup table small, I limited this to 1000 forward and reverse primers but it could be extended easily. 
#'
#' @param nForwards Calculate the expected amplifcations for a region with nForwards primers 5' on the correct strand and within range
#' @param nReverses Calculate the expected amplifcations for a region with nReverses primers 3' on the correct strand and within range
#' @param isTerminal If TRUE then there are no upstream forward primers to release the 5'-most forward primer in this set. If FALSE then the 5'-most forward primer is assumed to be released by an unknown upstream primer.
#' @param onlyFirstPrimer If TRUE then count only fragments whose initial fragment initiating from the 5'-most forward primer (mostly for internal usage to allow individual counting of sets of primers where only subsets overlap)
#'
#' @return The number of expected amplifications
#'
#' @concept multiple strand displacement amplification fragment prediction dynamic programming
#'
#' @export
#' 
#' @examples
#' countAmplifications(20,20)
#' countAmplifications(0,20)
#' countAmplifications(20,0)
countAmplifications<-function(nForwards,nReverses,isTerminal=TRUE,onlyFirstPrimer=FALSE){
  if(nForwards>300)stop(simpleError(sprintf('To avoid a large lookup table countAmplifications limited to less than 300 forward primers. Maybe use generateAmplificationTable(%d,%d) directly',nForwards,nReverses)))
  if(nReverses>300)stop(simpleError(sprintf('To avoid a large lookup table countAmplifications limited to less than 300 reverse primers. Maybe use generateAmplificationTable(%d,%d) directly',nForwards,nReverses)))
  if(onlyFirstPrimer) return(countAmplifications(nForwards,nReverses,isTerminal)-countAmplifications(max(0,nForwards-1),nReverses,FALSE))
  if(isTerminal) return(ampcountr::amplificationLookup[nForwards+1,nReverses+1])
  else return(ampcountr::amplificationLookup[nForwards+1+1,nReverses+1]-1)
}

#' Calculate expected multiple strand displacement for a series of forwards and reverse primers
#' 
#' Calculate the expected amplifications across a genome for a set of forward and reverse primer binding sites used in multiple strand displacement amplification
#'
#' @param forwards Leftmost positions of primer binding sites landing on the 5' strand (1-based 5'-most base in the genome plus strand)
#' @param reverses Rightmost positions of primer binding sites landing on the 3' strand (1-based 3'-most base in the genome plus strand)
#' @param maxLength Maximum length fragment generated by polymerase
#' @param genomeSize Maximum position possible for amplification
#'
#' @return A data frame with columns: 
#'      \describe{
#'        \item{start:}{1-based start coordinate of region}
#'        \item{end:}{1-based end coordinate of region}
#'        \item{forwards:}{number of forward primers spanning the region (for predictAmplificationsSingleStrand)}
#'        \item{reverses:}{number of reverse primers spanning the region (for predictAmplificationsSingleStrand)}
#'        \item{amplifications:}{expected amplifcations for the region}
#'      }
#'
#' @concept multiple strand displacement amplification fragment prediction dynamic programming genome
#'
#' @export
#' 
#' @examples
#' predictAmplificationsSingleStrand(c(1,10,20),c(15,25,35),maxLength=40,genomeSize=45)
#' predictAmplifications(c(1,10,20),c(15,25,35),maxLength=40,genomeSize=45)
predictAmplificationsSingleStrand<-function(forwards,reverses,maxLength=30000,genomeSize=max(c(forwards+maxLength-1,reverses))){
  if(length(forwards)>0)forwards<-sort(unique(forwards))
  if(length(reverses)>0)reverses<-sort(unique(reverses))
  inRangeReverses<-lapply(forwards,function(x)reverses[reverses>=x&reverses-x<maxLength])
  #forwards+maxLength not -1 because the drop should be on the base after last base of amplification
  pos<-sort(c(forwards,forwards+maxLength,reverses-maxLength+1,reverses+1))
  out<-data.frame(
    'start'=pos[-length(pos)],
    'end'=pos[-1]-1
  )
  #fix where reverse primer ended and forward primer started at same pos
  out$end<-apply(out[,c('start','end')],1,max)
  #keep the last row when multiple events on single position (cumsums last contains completed counts for both)
  out<-out[c(out$start[-nrow(out)]!=out$start[-1],TRUE),]
  out<-out[out$end>=1&out$start<=genomeSize,]
  #keep within genome boundaries
  out[1,'start']<-max(1,out[1,'start'])
  out[nrow(out),'end']<-min(genomeSize,out[nrow(out),'end'])
  regionForwards<-lapply(out$start,function(x)inRangeReverses[forwards<=x&forwards>=x-maxLength+1])
  reverseCounts<-mapply(function(forwards,end)sapply(forwards,function(x)sum(x>=end)),regionForwards,out$end,SIMPLIFY=FALSE)
  out$amplifications<-sapply(reverseCounts,function(reverseCount){
	  if(length(reverseCount)==0)return(0)
	  isTerminal<-c(TRUE,rep(FALSE,length(reverseCount)-1))
	  return(sum(mapply(countAmplifications,length(reverseCount):1,reverseCount,isTerminal,TRUE)))
  })
  return(out)
}

#' @describeIn predictAmplificationsSingleStrand Calculate the expected amplifications across a genome for a set of forward and reverse primer binding sites for both strands
#' @export
predictAmplifications<-function(forwards,reverses,maxLength=30000,genomeSize=max(c(forwards+maxLength-1,reverses))){
  forwardPred<-predictAmplificationsSingleStrand(forwards,reverses,maxLength,genomeSize)
  reversePred<-predictAmplificationsSingleStrand(genomeSize-reverses+1,genomeSize-forwards+1,maxLength,genomeSize)
  tmp<-reversePred$start
  reversePred$start<-genomeSize-reversePred$end+1
  reversePred$end<-genomeSize-tmp+1
  reversePred<-reversePred[nrow(reversePred):1,]
  if(any(reversePred$start!=forwardPred$start|reversePred$end!=reversePred$end))stop(simpleError('Mismatch between forward and reverse predictions'))
  out<-forwardPred[,c('start','end','amplifications')]
  out$amplifications<-out$amplifications+reversePred$amplifications
  return(out)
}


#' Lookup table for predicted amplifcation 
#'
#' A 301x301 matrix of predicted amplification for regions spanned by 0-300 primers
#'
#' @docType data
#' @format A 301 row, 301 column matrix of the expected amplifications for each combination of 0-300
#' forward and reverse primers spanning a region. Row 1 is for 0 forward primers within range, row 2 is for 1 forward primer,
#' column 1 is for 0 reverse primers, column 2 is for 1 reverse primers ...
#' @source generateAmplificationTable(300,300)
"amplificationLookup"





#enumerateAmplificationsSingleStrand<-function(forwards,reverses,maxLength=30000,genomeSize=max(c(forwards+maxLength-1,reverses))){
#  forwards<-unique(forwards)
#  reverses<-unique(reverses)
#  nForwards<-length(forwards)
#  nReverses<-length(reverses)
#  primers<-data.frame(
#    'start'=c(forwards,reverses-maxLength+1),
#    'end'=c(forwards+maxLength,reverses),
#    'type'=rep(c('forward','reverse'),c(nForwards,nReverses))
#  )
#  interesting<-data.frame(
#    'pos'=c(primers$start,primers$end),
#    'id'=c(1:nrow(primers),1:nrow(primers)),
#    'isStart'=rep(c(TRUE,FALSE),c(nrow(primers),nrow(primers)))
#  )
#  interesting<-interesting[order(interesting$pos),]
#  
#  currentForwardFrags<-outFrags<-currentReverseFrags<-data.frame('start'=0,'end'=0,'last'=TRUE)[0,]
#  activeForwards<-c()
#  activeReverses<-c()
#  for(ii in 1:nrow(interesting)){
#    thisId<-interesting[ii,'id']
#    thisPrimer<-primers[thisId,]
#    if(interesting[ii,'type']=='forward'){
#      if(thisPrimer$type=='forward')activeForwards<-c(activeForwards,thisId)
#      else activeReverses<-c(activeForwards,thisId)
#      next() #removed from active, now done with this primer
#    }
#      #else nothing since no forwards
#    #if(thisPrimer$type=='forward')currentForwardFrags<-rbind(currentForwardFrags,data.frame('start'=thisPrimer$start,'end'=thisPrimer$end,'last'=TRUE))
#    #}else{
#      #if(thisPrimer$type=='forward')currentForwardFrags<-rbind(currentForwardFrags,data.frame('start'=thisPrimer$start,'end'=thisPrimer$end,'last'=TRUE))
#      
#    #}
#
#    if(thisPrimer$type=='forward')activeForwards<-activeForwards[activeForwards!=thisId]
#    else activeReverses<-activeReverses[activeReverses!=thisId]
#    print(activeForwards)
#    print(activeReverses)
#  }
#}
sherrillmix/ampCountR documentation built on May 29, 2019, 9:24 p.m.