R/pwm.model.R
In motifRG: A package for discriminative motif discovery, designed for high throughput sequencing dataset

Documented in refinePWMMotif refinePWMMotifExtend

library(IRanges)
library(Biostrings)
logPWM <- function(pwm,  null=rep(0.25, 4))
{
  return(log2(pwm/null))
}

seqToMatrix <- function(seqs, encode= NULL)
  {
    if(!is.null(encode)){    
      width <- nchar(seqs[1])
      seq <- paste(seqs, collapse="")      
      seq.num <- as.integer(toComplex(DNAString(seq), encode))
      matrix(seq.num, nrow=length(seqs), ncol=width, byrow=T)        
    }
    else{
      as.matrix(seqs)
    }
  }

getPWM <- function(match, weights= NULL, pseudo=10, alphabet=DNA_BASES, null=rep(1/length(alphabet), length(alphabet)))
  {
    if(is.null(weights)){
      pwm <-  (consensusMatrix(match) + pseudo)[alphabet,]
      pwm <- t(t(pwm)/colSums(pwm))
    }
    else{
      mat <- seqToMatrix(DNAStringSet(match))
      pwm <- apply(mat, 2, function(x){
        tmp<- tapply(weights, x, sum)
        counts <- rep(0, length(alphabet))
        names(counts) <- alphabet
        tmp <- tmp[names(tmp)%in% alphabet]
        counts[names(tmp)] <- tmp
        counts
      })
      pwm <- t(t(pwm+ pseudo) / colSums(pwm+pseudo))
    }
    
    log.pwm <- logPWM(pwm, null)
    return(list(prob=pwm, logodd=log.pwm, bg=null))
  }

bestPWMMatch <- function(pwm, seqs, both.strand=T, flank=0) #pwm in logodd
  {
    seqs.view <- pasteSeq(seqs)
    seq <- subject(seqs.view)
    width <- ncol(pwm)
    len <- length(seq)
    tmp <- PWMscoreStartingAt(pwm, seq, 1:(len-width+1))
    strand <- rep(T, length(seqs))
    if(both.strand){
      seq.rev <- reverseComplement(seq)
      tmp2 <- rev(PWMscoreStartingAt(pwm, seq.rev, 1:(len-width+1)))
      tmp <- pmax(tmp, tmp2)    
      strand <- tmp > tmp2
    }
    tmp.view <- Views(Rle(tmp), start(seqs.view), end(seqs.view) - width+1)
    match.pos <- viewWhichMaxs(tmp.view)
    match <- DNAStringSet(Views(seq, match.pos - flank, match.pos+width-1 + flank))
    if(sum(!strand) > 0){     
      match.rev <- reverseComplement(match[!strand[match.pos]])
      match <- as.character(match)
      match[!strand[match.pos]] <- as.character(match.rev)
    }
    else{
      match <- as.character(match)
    }
    return(data.frame(pattern=match, score= tmp[match.pos],strand=strand[match.pos],pos = match.pos - start(seqs.view), stringsAsFactors=F))           
  }                      


scanPWMMatch <- function(pwm, seqs,  both.strand=T, flank=0,...) ##logscale
  {
    seqs.view <- pasteSeq(seqs)
    seq <- subject(seqs.view)
    width <- ncol(pwm)
    len <- length(seq)   
    match <- matchPWM(pwm, seq, ...)
    match <- IRanges(start(match), end(match))
    strand <- rep(T, length(match))
    if(both.strand){
      match.rev <- matchPWM(reverseComplement(pwm), seq, ...)
      match.rev <- IRanges(start(match.rev), end(match.rev))
      strand <- c(strand, rep(F, length(match.rev)))
      match <- c(match, match.rev)
    }    
    ## dup <- duplicated(start(match))
    ## match <- match[!dup]
    ## strand <- strand[!dup]    
    tmp <- Views(seq, start(match) - flank, end(match) + flank)   
    pattern <- as.character(tmp)
    pattern[!strand] <- as.character(reverseComplement(tmp[!strand]))
    seq.id <- findOverlaps(match, seqs.view, select="first")
    match.score <- bestPWMMatch(pwm, pattern, both.strand=F)$score
    df <- data.frame(pattern=pattern, score=match.score, seq.id=seq.id, strand=strand, pos=start(match) - start(seqs.view)[seq.id], stringsAsFactors=F)
    row.names(df) <- NULL
    select <- tapply(1:nrow(df), start(match), function(x){x[which.max(df$score[x])]})
    df <- df[select,]
    ord <- order(df$seq.id, df$pos)
    df[ord,]
  }

              

split.pwm <- function(matches,null=rep(0.25,4), min.split=0.5, min.size=100)
  {
    all.splits <- to.split <- list(all=1:length(matches))
    all.pwms <- list(all=getPWM(matches)$prob)
    while(length(to.split) > 0){      
      select <- to.split[[1]]
      name <- names(to.split)[[1]]
      pwm <- all.pwms[[name]]
      e <- sum(apply(pwm, 2, relativeEntropy, null=null))
      to.split <- to.split[-1]      
      to.split.scores <- sapply(1:ncol(pwm), function(i){
        sapply(names(IUPAC), function(a){
          tmp <- substring(matches[select], i,i) %in% IUPAC[[a]]
          total <- sum(tmp)
          if(total > min.size & total < length(select)- min.size){
            pwm1 <- getPWM(matches[select][tmp])$prob
            pwm2 <- getPWM(matches[select][!tmp])$prob
            w <- sum(tmp)/length(tmp)
            (sum(w*apply(pwm1, 2, relativeEntropy, null=null) +
                (1-w)*apply(pwm2, 2, relativeEntropy, null=null)) - e) * length(select)/length(matches)
          }
          else{
            0
          }
        })})
      if(max(to.split.scores) > min.split){
        a <- which.max(apply(to.split.scores, 1, max))
        i <- which.max(apply(to.split.scores, 2, max))
        desc <- paste(name, names(a), i)
        cat(desc, "\n")
        tmp <- substring(matches[select], i,i) %in% IUPAC[[a]]
        to.split <- c(list(select[tmp], select[!tmp]), to.split)
        names(to.split)[1:2] <- paste(desc, c(T,F))
        all.pwms <- c(list(getPWM(matches[select][tmp])$prob, getPWM(matches[select][!tmp])$prob), all.pwms)
        names(all.pwms)[1:2] <- paste(desc, c(T,F))
        all.splits <- c(to.split[1:2], all.splits)
      }
    }
    return(list(splits=all.splits, pwms = all.pwms))
  }



matchPWMCoor <- function(pwm, seqs, coor,  both.strand=T,...)
{
  seqs <- pasteSeq(seqs)
  seq <- subject(seqs)    
  match <- matchPWM(pwm, seq, ...)
  match.df <- data.frame(strand=rep("+", length(match)), pattern = as.character(match),stringsAsFactors=F)
  if(both.strand){    
    pwm.rev <- reverseComplement(pwm)
    match.rev <- matchPWM(pwm.rev, seq,...)
    match.df <- rbind(match.df, data.frame(strand=rep("-", length(match.rev)),
                                           pattern = as.character(reverseComplement(match.rev)), stringsAsFactors=F))
    match <- c(as(match,"IRanges"), as(match.rev, "IRanges"))                     
  }
  ord <- order(start(match))
  dup <- duplicated(start(match))
  ord <- ord[!dup[ord]]
  match<- match[ord]
  match.df <- match.df[ord,]  
  l <- findOverlaps(match, seqs, select="first")
  r <- IRanges(start(match) - start(seqs)[l] + start(coor)[l],end(match) - start(seqs)[l] + start(coor)[l])
  seqnames <- seqnames(coor)[l]
  score <- bestPWMMatch(pwm, match.df$pattern, both.strand=F)$score
  GRanges(r, seqnames=seqnames, strand=match.df$strand, pattern=match.df$pattern, score=score, seq.id=l)
}


bestPWMMatchCoor <- function(pwm, seqs, coor, both.strand=T, flank=0)
  {
    w <- ncol(pwm)
    match <- bestPWMMatch(pwm, seqs, both.strand=both.strand, flank=flank)
    seq.strand <- rep("+", length(seqs))
    strand <- rep("+", length(seqs))
    if("strand" %in% colnames(coor)){
      seq.strand <- coor$strand
    }
    start <- start(coor) + match$pos
    end <- start + w  - 1
    strand <- rep("+", nrow(match))
    
    select <- seq.strand== "-"
    end[select] <- (end(coor) - match$pos)[select]
    start[select] <- end[select] - w + 1
    
    select <- xor(seq.strand == "+",  match$strand)
    strand[select] <- "-"
    rd <- RangedData(IRanges(start, end), strand=strand, seq.strand=seq.strand, match.strand=match$strand, pattern=match$pattern,
                     space=space(coor), score=match$score, pos=match$pos)
    rd[names(coor)]
  }




refinePWMMotif <- function(motifs=NULL, seqs, pwm.ld=NULL,  max.iter=50, tol=10^-4, mod="oops", null=rep(0.25, 4),pseudo=1, weights=rep(1, length(seqs)), motif.weights=NULL)
  {
    
    total.score <- -Inf
    diff <- Inf
    iter = 1    
    if(mod=="zoops"){
      gamma0 <- gamma <- 0.5
    }
    if(is.null(pwm.ld)){
      patterns <- motifs
      pwm.ld <- (getPWM(patterns, weights=motif.weights, null=null,pseudo=pseudo))$logodd
    }
    repeat{
      if(mod=="oops"){
        match<- bestPWMMatch(pwm.ld, seqs)
        match$weights = weights
      }
      else{
        match <- scanPWMMatch(pwm.ld, seqs, min.score=1)
        ratio <- 2^match$score
        tmp <- tapply(ratio, match$seq.id, sum)
        seq.sum <- rep(0, nrow(match))
        seq.sum[as.numeric(names(tmp))] <- tmp
        if(mod=="zoops"){
          lambda <- gamma/ width(seqs)[match$seq.id]          
          match$weights <- ratio * lambda/ (1 - gamma + seq.sum[match$seq.id]*lambda) * weights[match$seq.id]
          gamma <- (sum(match$weights) + gamma0)/(sum(weights)+1)
          #cat("gamma", gamma, "\n")          
        }
      }      
      patterns <- match$pattern
      new.score <- sum(match$score * match$weights)
      pwm <- (getPWM(patterns,weights=match$weights, null=null,pseudo=pseudo))
      pwm.ld <- pwm$logodd     
      cat(iter, new.score, "\n")
      iter = iter +1
      if((new.score - total.score)/abs(new.score) < tol || iter > max.iter ){
        break
      }
      total.score = new.score
    }
    return(list(model=pwm, match=match, score=total.score))
  }



refinePWMMotifExtend <- function(motifs=NULL, seqs, pwm.ld=NULL, flank=3, extend.tol=10^-3, trim.rel.entropy=0.2,  null=rep(0.25, 4), max.width=20, ...)
  {
    prev.score <- 0    
    repeat{
      result <- refinePWMMotif(motifs=motifs, seqs=seqs, pwm.ld=pwm.ld, null=null, ...)
      pwm <- result$model
      width <- ncol(pwm$prob)
      ###trim on entropy###
      e <- apply(pwm$prob, 2, relativeEntropy, null=null)
      trim.left <- 1
      trim.right <- width      
      if(!any(e > trim.rel.entropy)) {break}      
      tmp <- which(e > trim.rel.entropy)
      trim.left <- min(tmp)
      trim.right <- max(tmp)        
      if(trim.left > flank & width - trim.right > flank){ ## no information in the flanking region, no need to extend
        break
      }
      if(trim.right - trim.left > max.width){
        break
      }
      match <- bestPWMMatch(pwm$logodd[,trim.left:trim.right], seqs, flank=flank)      
      motifs <- match$pattern
      new.score <- result$score
      cat("Extend score", new.score, prev.score, "\n")
      if((new.score - prev.score)/abs(new.score) < extend.tol){break}
      prev.score = new.score
      pwm.ld <- NULL
    }
    return(result)
  }

Any scripts or data that you put into this service are public.

motifRG documentation built on April 28, 2020, 8:46 p.m.

rdrr.io home R language documentation Run R code online

CRAN packages Bioconductor packages R-Forge packages GitHub packages

Note that we can't provide technical support on individual packages. You should contact the package authors for that.

motifRG
A package for discriminative motif discovery, designed for high throughput sequencing dataset

R/pwm.model.R
In motifRG: A package for discriminative motif discovery, designed for high throughput sequencing dataset

Defines functions refinePWMMotifExtend refinePWMMotif bestPWMMatchCoor matchPWMCoor split.pwm scanPWMMatch bestPWMMatch getPWM seqToMatrix logPWM

Documented in refinePWMMotif refinePWMMotifExtend

Try the motifRG package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

motifRG A package for discriminative motif discovery, designed for high throughput sequencing dataset

R/pwm.model.R In motifRG: A package for discriminative motif discovery, designed for high throughput sequencing dataset

Defines functions refinePWMMotifExtend refinePWMMotif bestPWMMatchCoor matchPWMCoor split.pwm scanPWMMatch bestPWMMatch getPWM seqToMatrix logPWM

Documented in refinePWMMotif refinePWMMotifExtend

Try the motifRG package in your browser

R Package Documentation

Browse R Packages

We want your feedback!

motifRG
A package for discriminative motif discovery, designed for high throughput sequencing dataset

R/pwm.model.R
In motifRG: A package for discriminative motif discovery, designed for high throughput sequencing dataset