R/BCRANK.R

Defines functions matchingSites bcrank bcrankRun getBCRANKScore getBCRANKNeighbours hitsForNeighboursInC matchConsensus createPWM seqFromFile addNs trimNs trimWhiteSpace revComp iupacToCons

Documented in bcrank matchingSites

###############################################################################
##
## File: BCRANK.R
##
## predicting binding site consensus from ranked DNA sequences
##
## Author: Adam Ameur, Uppsala Univ., Sweden
##
## Description: Implementation of an algorithm for detection of short DNA
##   sequences that are overrepresented in some part of the list. Starting from
##   some initial consensus DNA sequence coded in IUPAC symbols, the method uses
##   a heuristic search to improve the consensus until a local optimum is found.
## 
## Dependencies: Biostrings
##
################################################################################

####################### Help functions ######################### 

## IUPAC symbols for nucleotides
iupac <- c("N","A","C","G","T","R","Y","K","M","S","W","B","D","H","V")
names(iupac) <- c("ACGT","A","C","G","T","AG","CT","GT","AC","CG","AT","CGT","AGT","ACT","ACG")

## IUPAC symbols represented as a list
iupacAsList <- list("A"=c("A"),"C"=c("C"),"G"=c("G"),"T"=c("T"),"R"=c("A","G"),"Y"=c("C","T"),"K"=c("G","T"),"M"=c("A","C"),"S"=c("C","G"),"W"=c("A","T"),"B"=c("C","G","T"),"D"=c("A","G","T"),"H"=c("A","C","T"),"V"=c("A","C","G"),"N"=c("A","C","G","T"))

## Converts a string of IUPAC symbols to a consensus sequence
iupacToCons <- function(iupacString){

  iupacToBase <- function(base){
    if(base == "A" || base == "C" || base == "G" || base == "T")
      return(base)
    else{
      cons <- names(iupac[which(iupac == base)])
      return(paste("[",cons,"]",sep=""))
    }
  }
  
  iupacByPos <- strsplit(iupacString,"")[[1]]
  cons <- ""
  for(i in 1:length(iupacByPos)){
    cons <- paste(cons, iupacToBase(iupacByPos[i]),sep="")
  }
  return(cons)
} 


## Returns the reverse complement of a IUPAC or consensus sequence
revComp <- function(seq){
    
  baseComp <- array(NA,0)
  baseComp[c("A","C","G","T","R","Y","K","M","S","W","B","V","D","H","N","[","]","(",")")] <- c("T","G","C","A","Y","R","M","K","S","W","V","B","H","D","N","]","[",")","(")
  
  return(paste(rev(sapply(strsplit(toupper(seq),"")[[1]],function(x){
    if(!is.na(baseComp[x]))
      {
        return(baseComp[x])
      }else{
        return(x)
      }
  },USE.NAMES = FALSE)),collapse = ""))
}

## Trims leading and trailing white space from character strings.
trimWhiteSpace <- function(x){
    sub("[ \t\n\r]*$", "", sub("^[ \t\n\r]*", "", x))
}

## Removes all letters 'N' at the beginning and end of a string
trimNs <- function(consensus){
  return(sub("N*$", "", sub("^N*", "", consensus)))
}

## Adds n 'N' letters at the beginning and end of a string
addNs <- function(consensus,n){
  consensus <- paste(paste(rep("N",n),collapse=""),consensus,paste(rep("N",n),collapse=""),sep="")
  return(consensus)
}

## Reads sequences from a FASTA file and checks the file format.
seqFromFile <- function(fafile){

  ## Read sequences using Biostings package
  seqs <- try(readDNAStringSet(fafile), silent=TRUE)
  
  ## Report error
  if(class(seqs) == "try-error"){
    stop(paste("Error reading fasta file:",fafile))
  }
  
  seqs <- as.character(seqs)

  return(seqs)
}


## Creates a position weight matrix (PWM) given sequences and a
## IUPAC consensus sequence. 
createPWM <- function(seqs, consensus, matchRevComp=TRUE){

  ## Create a position weight matrix as used by seqLogo
  createPWMhelp <- function(match, seqs, motifLength, isRevComp=FALSE){

    ## Detect all matching DNA sequences
    
    allTBS <- matrix(0, nrow=4,ncol=motifLength, dimnames=list(c("A","C","G","T"),1:motifLength))
    
    for(i in 1:length(match)){
      currentMatch <- match[[i]]
      TBS <- NULL

      if(currentMatch[1] > -1){
        seq <- seqs[i]
        for(j in 1:length(currentMatch)){
          posInRegion <- as.integer(currentMatch[j])
          matchLength <- attr(currentMatch,"match.length")[j]-1

          TBS <- as.character(substr(seq,posInRegion,posInRegion+matchLength))

          if(isRevComp==TRUE){
            TBS <- revComp(TBS)
          }
        }
        TBSasPWM <- sapply(strsplit(TBS,"")[[1]], function(x){
          if(x=="A"){
            return(c(1,0,0,0))
          }
          if(x=="C"){
            return(c(0,1,0,0))
          }
          if(x=="G"){
            return(c(0,0,1,0))
          }
          if(x=="T"){
            return(c(0,0,0,1))
          }
        })
        allTBS <- allTBS+TBSasPWM
      }
    }

    return(allTBS)
  }
  
  motifLength <- nchar(consensus)
    
  motifSequence <- iupacToCons(consensus)
   
  match <- gregexpr(motifSequence,seqs)
  pwm <- createPWMhelp(match, seqs, motifLength)
  
  if(matchRevComp == TRUE){
    RCmotifSequence <- revComp(motifSequence)
    matchRC <- gregexpr(RCmotifSequence,seqs)
    pwm_rc <- createPWMhelp(matchRC, seqs, motifLength,isRevComp=TRUE)
    pwm <- pwm+pwm_rc
  }

  return(pwm)

}


## Given a fasta file and a motif in IUPAC coding, the function makes a
## call to a C function that detects all occurences of the motif in the
## sequences. Number of sequences and the sequence lengths are required
## as arguments so that the C function can allocate enough memory.
matchConsensus <- function(seqs, motif, nrSeqs, seqLengths, maxOccurences=2){
  RCmotif <- revComp(motif)
  motifLength <- nchar(motif)
  RCmotif <- revComp(motif)
  
  hits <- .C("match_consensus", as.character(seqs), as.character(motif), as.character(RCmotif), as.integer(motifLength),  as.integer(nrSeqs), as.integer(seqLengths), as.integer(maxOccurences), result=as.integer(rep(0,nrSeqs)))$result
  
  return(hits)
}


## Given a fasta file and a motif in IUPAC coding, the function detects all
## occurences of the motif for all consensus sequences in the neighbourhood
## of the given motif.
hitsForNeighboursInC <- function(seqs, consensus, nrSeqs, seqLengths, silent){

  hitsForNeighboursByBase <- function(seqs, consensus, basePos, nrSeqs, seqLengths){

    ## Swap base at basePos to newBase
    swapBase<-function(consensus, basePos, newBase){
      consensusByPos <- strsplit(consensus,"")[[1]]
      
      prefix <- substr(consensus, 1, basePos-1)
      suffix <- substr(consensus, basePos+1, length(consensusByPos))
      
      return(paste(prefix,newBase,suffix,sep=""))
    }
    
    consensusByPos <- strsplit(consensus,"")[[1]]
    oldBase <-  substr(consensus, basePos, basePos)
    
    ## Consensus where base at position has been changed
    ## to A,C,G or T
    swappedA <- swapBase(consensus,basePos,"A")
    swappedC <- swapBase(consensus,basePos,"C")
    swappedG  <- swapBase(consensus,basePos,"G")
    swappedT <- swapBase(consensus,basePos,"T")
    
    basicHits <- list()
    basicHits[[swappedA]] <- matchConsensus(seqs, motif=swappedA, nrSeqs=nrSeqs, seqLengths=seqLengths)
    basicHits[[swappedC]] <- matchConsensus(seqs, motif=swappedC, nrSeqs=nrSeqs, seqLengths=seqLengths)
    basicHits[[swappedG]] <- matchConsensus(seqs, motif=swappedG, nrSeqs=nrSeqs, seqLengths=seqLengths)
    basicHits[[swappedT]] <- matchConsensus(seqs, motif=swappedT, nrSeqs=nrSeqs, seqLengths=seqLengths)
    
    hits <- list()
    
    newBases <- iupac[!(iupac %in% oldBase)]
    
    for(base in newBases){
      newCons <- swapBase(consensus, basePos, base)
      atomicBases <- iupacAsList[[base]]
      tmpHits <- rep(0,nrSeqs)
      for(baseTmp in atomicBases){
        consTmp <- swapBase(consensus, basePos, baseTmp)
        tmpHits <- tmpHits+basicHits[[consTmp]]
      }
      hits[[newCons]] <- tmpHits
    }
    
    return(hits)
  }

  consLength <- nchar(consensus)
  hits <- list()

  if(!silent){
    cat("Scanning sequences")
    flush.console()
  }
  
  for(basePos in 1:consLength){
    if(!silent){
      cat(".")
      flush.console()
    }
    hits <- append(hits,hitsForNeighboursByBase(seqs,consensus,basePos,nrSeqs,seqLengths))
  }

  if(!silent){
    cat("\n")
    flush.console()
  }
  
  return(hits)
}


#################### Neighbourhood #######################################

## Returns all consensus sequences in the neighbourhood of a given consensus
getBCRANKNeighbours <- function(consensus){
    
    consensusByPos <- strsplit(consensus,"")[[1]]
    
    neighbours <- array(NA)
    
    for(i in 1:length(consensusByPos)){
      prefix <- substr(consensus, 1, i-1)
      suffix <- substr(consensus, i+1, length(consensusByPos))
      
    newBases <- iupac[!(iupac %in% consensusByPos[i])]
      neighbours <- c(neighbours,paste(prefix,newBases,suffix,sep=""))
    }
  
    return(neighbours[which(!is.na(neighbours))])
  }

######################## Cost function ####################################

## Computes the BCRANK score of a consensus sequence
getBCRANKScore<-function(hitsVec, randOrder, consensus, use.P1, use.P2, nrPoints=25){
  
  ## Given the observed hits and the random orderings, the function returns
  ## the differences between the observed hits and each of the random
  ## orderings. Each difference is computed by comparing the cumulative
  ## hit functions at a number of uniformly distributed points in the interval.
  getDeviationsFromRandom <- function(hits, randOrder, nrPoints){

    points <- round(seq(1,length(hits),length.out=nrPoints))
    
    randHits <- sapply(1:ncol(randOrder), function(i){
      cumsum(hits[randOrder[,i]])[points]
    })

    obsHits <- cumsum(hits)[points]
    dists <- (obsHits - randHits)
        
    return(colSums(dists))
  }

  ## ###############
  ##
  ## Compute score
  ##
  ## ###############
  
  hitsLen <- length(hitsVec)
  hitsUnique <- rep(0,hitsLen)
  matchingIds <- which(hitsVec>0)
  nrHits <- length(matchingIds)

  ## Return negative score if motif is matching in none or all sequences
  if(nrHits == hitsLen || nrHits == 0){
    return(-100)
  }

  ## Make the hits vector conatin only 0's and 1's
  hitsUnique[matchingIds] <- 1
  hitsUnique <- hitsUnique/sum(hitsUnique)

  ## Compute significance score for the hits vector, by comparing to scores
  ## for random orderings using a t-test. 
  randDists <- getDeviationsFromRandom(hitsUnique, randOrder, nrPoints=nrPoints)
  tval <- abs(t.test(randDists,mu=0)$statistic)

  score <- tval
  
  ## #######################
  ##
  ## Compute score penalties.
  ##
  ## #######################
  
  ## Compute P1 - Penalty on non-specific bases

  if(use.P1){
    
    tmpConsensus <- trimNs(consensus)

    tmpConsensus <- gsub("A","X",tmpConsensus)
    tmpConsensus <- gsub("C","X",tmpConsensus)
    tmpConsensus <- gsub("G","X",tmpConsensus)
    tmpConsensus <- gsub("T","X",tmpConsensus)
    
    nrSpecific <- length(which(gregexpr("X",tmpConsensus)[[1]]!=-1))

    ## Avoid zero penalty for motifs with no A,C,G or T bases
    if(nrSpecific == 0){
      nrSpecific <- 0.5
    }

    P1 <-  nrSpecific/nchar(tmpConsensus)

    score <- score*P1

  }
    
  ## Compute P2 - Penalty on repetitive motifs
  if(use.P2){

    nrMultipleHits <- length(which(hitsVec>1))

    P2 <- 1-(nrMultipleHits/nrHits)

    score <- score*P2
  }
  
  return(score)
}


############################ Run search algorithm ###################################

## Runs the BCRANK search on a fasta file containing ranked DNA regions starting from
## an initial consensus sequence. The nrRandom parameter specifies the number of random
## re-orderings performed to calculate scores. Returns a BCRANKsearch object
bcrankRun <- function(seqs, start, nrRandom=500, silent=FALSE, makePlot=FALSE, do.search=TRUE, use.P1=TRUE, use.P2=TRUE){
    
  nrSeqs <- length(seqs)
  seqLengths <- as.numeric(unlist(lapply(seqs,nchar)))

  if(!silent){
    if(do.search){
      cat("\n**** Running BCRANK on",nrSeqs,"regions, starting from",start,"****\n")
    }
    else{
      cat("\n**** Computing BCRANK score for",start,"on",nrSeqs,"regions ****\n")
    }
    flush.console()
  }
  
  ## A list BCRANKmatch objects
  searchPath <- list()
    
  iteration <- 0 

  randOrder <-  sapply(1:nrRandom, function(x){
    sample(1:length(seqs))
  })

  ## A list with the consensus sequences that have alredy scores assigned to them.
  seenMotifs <- list()
  
  hits <- matchConsensus(seqs, motif=start, nrSeqs=nrSeqs, seqLengths=seqLengths)
  score <- getBCRANKScore(hits,randOrder,start, use.P1, use.P2)

  seenMotifs[[start]] <- score
    
  improvement <- TRUE
  iteration <- 0
  bestScore <- score
  bestCons <- start
  bestHits <- hits

  while(improvement){
    
    improvement <- FALSE
    iteration <- iteration+1
    
    searchPath[[iteration]] <- BCRANKmatch(as.character(bestCons),as.numeric(bestScore),as.numeric(bestHits))
    
    if(makePlot){
      ## Create a temporary BCRANKsearch object and plot it
      tmp_final <- searchPath[[iteration]]
      tmp_finalPWM <- matrix(NA)
      tmp_finalNrMatch <- 0
      tmp_searchObj <- BCRANKsearch(tmp_final, tmp_finalPWM, tmp_finalNrMatch, searchPath)
      plot(tmp_searchObj) 
    }

    if(do.search){
      if(!silent){
        cat("\nIteration ",iteration," - ",bestCons,": ",bestScore,"\n",sep="")
        flush.console()
      }
      
      ## Add flanking N's to the current best consensus
      bestCons <- addNs(bestCons,n=1)
      
      ## Generate neighbourhood around current best consensus
      neighbours <- getBCRANKNeighbours(bestCons)
      
      hitsNeigh <- hitsForNeighboursInC(seqs, bestCons, nrSeqs=nrSeqs, seqLengths=seqLengths, silent=silent)
      
      nrNeigh <- length(hitsNeigh)
      nrDone <- 0

      if(!silent){
        cat("Computing scores")
        flush.console()
      }
      
      for(consensus in names(hitsNeigh)){
        hits <- hitsNeigh[[consensus]]
        score <- NULL
        if(consensus %in% names(seenMotifs)){
          score <- seenMotifs[[consensus]]
        }
        else{
          score <- getBCRANKScore(hits,randOrder,consensus,use.P1,use.P2)
          seenMotifs[[consensus]] <- score
        }
        
        if(!is.nan(score)){
          if(score > bestScore){
            bestScore <- score
            bestCons <- trimNs(consensus)
            bestHits <- hits
            improvement <- TRUE
          }
        }
        
        nrDone <- nrDone+1
        
        if(!silent){
          ## Don't report all computed scores
          if((nrDone %% 10) == 0){
            cat(".")
            flush.console()
          }
        }
      }
      if(!silent){
        cat("\n")
        flush.console()
      }
    }
    ## Don't run the search
    else{
      improvement <- FALSE
    }
  }

  ## Create BCRANKmatch object for current best consensus and store in list
  final <- searchPath[[iteration]]
  finalPWM <- createPWM(seqs,consensus(final))
  finalNrMatch <- unique(colSums(finalPWM))
  
  ## Returns a BCRANKsearch object
  searchResult <- BCRANKsearch(final, finalPWM, finalNrMatch, searchPath)

  return(searchResult)
}


## Runs the BCRANK algorithm.
bcrank <- function(fafile, startguesses=c(), restarts=10, length=10, reorderings=500, silent=FALSE, plot.progress=FALSE, do.search=TRUE, use.P1=FALSE, use.P2=TRUE, strip.desc=TRUE){
    
  if (!identical(strip.desc, TRUE))
      warning("'strip.desc' argument is ignored")

  ## A list containing BCRANKsearch objects
  BCRANKsearchResults <- list()
  
  ## Read sequences from file
  seqs <- seqFromFile(fafile)
    
  ## Generates a random consensus sequence in IUPAC encoding.
  getRandomInit <- function(length=10){
    return(paste(sample(iupac,length,replace=TRUE),collapse=""))
  }
  
  ## Multiple random search
  if(length(startguesses)==0){
    for(i in 1:restarts){
      BCRANKsearchResults[[i]] <- bcrankRun(seqs, nrRandom=reorderings, start=getRandomInit(length),silent=silent, makePlot=plot.progress, do.search=do.search, use.P1=use.P1, use.P2=use.P2)
    }
  }

  ## Search from given startguesses (coded in IUPAC)
  else{
    restarts <- length(startguesses)
    for(i in 1:restarts){
      BCRANKsearchResults[[i]] <- bcrankRun(seqs, nrRandom=reorderings, start=startguesses[i], silent=silent, makePlot=plot.progress, do.search=do.search, use.P1=use.P1, use.P2=use.P2)
    }
  }

  ## Return BCRANKresult object
  funCallString <- as.character(sys.call())
  result <- BCRANKresult(BCRANKsearchResults, length(seqs), fafile, funCallString, restarts)
  
  return(result)
}



################### Report macthing sites in a fasta file ############################
matchingSites <- function(fafile, motifSequence, revComp=TRUE, strip.desc=TRUE){
  
  if (!identical(strip.desc, TRUE))
      warning("'strip.desc' argument is ignored")

  reportMatch <- function(match, seqs, strand){

    siteInfo <- NULL
    
    for(i in 1:length(match)){
      currentMatch <- match[[i]]
      
      if(currentMatch[1] > -1){
        seqNr <- i
        header <- names(seqs)[i]

        seq <- seqs[i]
        
        for(j in 1:length(currentMatch)){
          posStart <- as.integer(currentMatch[j])
          matchLength <- attr(currentMatch,"match.length")[j]-1
          posEnd <- as.integer(posStart+matchLength)

          DNAseq <- as.character(substr(seq,posStart,posEnd))
          
          siteInfo <- rbind(siteInfo, c(header, seqNr, posStart, posEnd, strand, DNAseq))
          
        }
      }
    }
    return(siteInfo)
  }
  
  seqs <- seqFromFile(fafile)
    
  motifSequence <- iupacToCons(motifSequence)
  tmpMatch <- gregexpr(motifSequence,seqs,perl=TRUE)
  match <- reportMatch(tmpMatch, seqs, strand="+")
  
  if(revComp){
    RCmotifSequence <- revComp(motifSequence)
    tmpMatchRC <- gregexpr(RCmotifSequence,seqs,perl=TRUE)
    rcMatch <- reportMatch(tmpMatchRC, seqs, strand="-")

    match <- rbind(match,rcMatch)
  }

  match <- match[order(as.integer(match[,2])),]
  
  match[,2] <- paste("seq",match[,2],sep="")

  match <- as.data.frame(match)
  
  colnames(match) <- c("Region header","Region nr","Start","End","Strand","Sequence")

  return(match)

}

Try the BCRANK package in your browser

Any scripts or data that you put into this service are public.

BCRANK documentation built on Nov. 8, 2020, 8:21 p.m.