R/peplib.R

Defines functions sdist sHammingDist dist.Sequences wdist explode processError read.sequences read.fasta write.sequences write.fasta plot.Sequences createSWeights motifModelSet motifModel predict.MotifModel predict.MotifModelSet classify.MotifModelSet EM.SSOOPS.Linked EMStep.SSOOPS EMStep.OOPS EMStep.ZOOPS logLik.OOPS logLik.SSOOPS logLik.ZOOPS BIC.MotifModel BIC.MotifModelSet logLik.MotifModelSet factory.OOPS factory.SSOOPS factory.ZOOPS print.Sequences print.MotifModel MotifModel.motifString plot.MotifModelSet MotifModel.plotStartingPosition MotifModel.plotPositions MotifModel.plotFits plot.MotifModel aclust changeClusterFormat findMinDistElement simpleDescriptors descriptors autocorrelation factory.descriptor empty.df empty.matrix plot.Descriptors rbind.Sequences decoys residuals.MotifModel residuals.MotifModelSet FiveTwoCV.Sequences FiveTwoCV.Descriptors FiveTwoCV.default modelStep ROC

Documented in aclust changeClusterFormat descriptors motifModel motifModelSet read.fasta read.sequences simpleDescriptors write.fasta write.sequences

#SUPER IMPORTANT NOTE: These methods are fairly general, but they assume the last character in the alphabet is a gap character.
setClass("Sequences", representation(alphabet="character", nseqs="numeric"), contains="matrix")
setClass("Descriptors", representation(response="numeric", pvalues="numeric"), contains="data.frame")
setClass("MetricParams", representation(smatrix="matrix", gapOpen="numeric", gapExtension="numeric"))
setClass("MotifModel", representation(mmodel="matrix", bmodel="numeric", width="integer", seqs="Sequences", np="integer"))
setClass("MotifModelSet", representation(motifs="list"))
setClass("OOPS", representation(zmatrix="matrix"), contains="MotifModel")
setClass("SSOOPS", representation(zvector="numeric"), contains="MotifModel")
setClass("ZOOPS", representation(zmatrix="matrix", gamma="numeric", qarray="numeric"), contains="MotifModel")

bs85 <- matrix(c(5, -2, -2, -2, -1, -1, -1,  0, -2, -2, -2, -1, -2, -3, -1,  1,  0, -3, -3, -1, -2, -1, -1, -6, 
-2,  6, -1, -2, -4,  1, -1, -3,  0, -4, -3,  2, -2, -4, -2, -1, -2, -4, -3, -3, -2,  0, -2, -6, 
-2, -1,  7,  1, -4,  0, -1, -1,  0, -4, -4,  0, -3, -4, -3,  0,  0, -5, -3, -4,  4, -1, -2, -6, 
-2, -2,  1,  7, -5, -1,  1, -2, -2, -5, -5, -1, -4, -4, -2, -1, -2, -6, -4, -4,  4,  1, -2, -6, 
-1, -4, -4, -5,  9, -4, -5, -4, -5, -2, -2, -4, -2, -3, -4, -2, -2, -4, -3, -1, -4, -5, -3, -6, 
-1,  1,  0, -1, -4,  6,  2, -3,  1, -4, -3,  1,  0, -4, -2, -1, -1, -3, -2, -3, -1,  4, -1, -6, 
-1, -1, -1,  1, -5,  2,  6, -3, -1, -4, -4,  0, -3, -4, -2, -1, -1, -4, -4, -3,  0,  4, -1, -6, 
 0, -3, -1, -2, -4, -3, -3,  6, -3, -5, -5, -2, -4, -4, -3, -1, -2, -4, -5, -4, -1, -3, -2, -6, 
-2,  0,  0, -2, -5,  1, -1, -3,  8, -4, -3, -1, -3, -2, -3, -1, -2, -3,  2, -4, -1,  0, -2, -6, 
-2, -4, -4, -5, -2, -4, -4, -5, -4,  5,  1, -3,  1, -1, -4, -3, -1, -3, -2,  3, -5, -4, -2, -6, 
-2, -3, -4, -5, -2, -3, -4, -5, -3,  1,  4, -3,  2,  0, -4, -3, -2, -3, -2,  0, -5, -4, -2, -6, 
-1,  2,  0, -1, -4,  1,  0, -2, -1, -3, -3,  6, -2, -4, -2, -1, -1, -5, -3, -3, -1,  1, -1, -6, 
-2, -2, -3, -4, -2,  0, -3, -4, -3,  1,  2, -2,  7, -1, -3, -2, -1, -2, -2,  0, -4, -2, -1, -6, 
-3, -4, -4, -4, -3, -4, -4, -4, -2, -1,  0, -4, -1,  7, -4, -3, -3,  0,  3, -1, -4, -4, -2, -6, 
-1, -2, -3, -2, -4, -2, -2, -3, -3, -4, -4, -2, -3, -4,  8, -1, -2, -5, -4, -3, -3, -2, -2, -6, 
 1, -1,  0, -1, -2, -1, -1, -1, -1, -3, -3, -1, -2, -3, -1,  5,  1, -4, -2, -2,  0, -1, -1, -6, 
 0, -2,  0, -2, -2, -1, -1, -2, -2, -1, -2, -1, -1, -3, -2,  1,  5, -4, -2,  0, -1, -1, -1, -6, 
-3, -4, -5, -6, -4, -3, -4, -4, -3, -3, -3, -5, -2,  0, -5, -4, -4, 11,  2, -3, -5, -4, -3, -6, 
-3, -3, -3, -4, -3, -2, -4, -5,  2, -2, -2, -3, -2,  3, -4, -2, -2,  2,  7, -2, -4, -3, -2, -6, 
-1, -3, -4, -4, -1, -3, -3, -4, -4,  3,  0, -3,  0, -1, -3, -2,  0, -3, -2,  5, -4, -3, -1, -6, 
-2, -2,  4,  4, -4, -1,  0, -1, -1, -5, -5, -1, -4, -4, -3,  0, -1, -5, -4, -4,  4,  0, -2, -6, 
-1,  0, -1,  1, -5,  4,  4, -3,  0, -4, -4,  1, -2, -4, -2, -1, -1, -4, -3, -3,  0,  4, -1, -6, 
-1, -2, -2, -2, -3, -1, -1, -2, -2, -2, -2, -1, -1, -2, -2, -1, -1, -3, -2, -1, -2, -1, -2, -6, 
-6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6,  1),
    nrow=24, ncol=24)

aabet <- c("A",  "R",  "N",  "D",  "C",  "Q",  "E",  "G",  "H",  "I",  "L",  "K",  "M",  "F",  "P",  "S",  "T",  "W",  "Y",  "V",  "B",  "Z",  "X",  "-")
colnames(bs85) <- aabet
rownames(bs85) <- aabet

default.MetricParams <- new("MetricParams", smatrix=bs85, gapOpen=-10, gapExtension=-0.2)

sdist <- function(s1, s2, params=default.MetricParams) {
  
  distance <- rep(NA, length(s1))

  i1 = "-"
  i2 = "-"
  for(i in 1:length(s1)) {

    #deal with differeing lengths
    if(s1[i] == "-") {
      i1 = "-"
    } else {
      i1 = s1[i]
    }
    if(i > length(s2) || s2[i] == "-") {
      i2 = "-"
    } else {
      i2 = s2[i]
    }

    #gaps
    if(i1 == "-" && i2 != "-") {
      if(i > 1 && s1[i - 1] == "-") {
        distance[i] = params@gapExtension
      }
      else {
        distance[i] = params@gapOpen
      }
    }
    else if(i1 != "-" && i2 == "-") {
      if(i > 1 && s2[i - 1] == "-") {
        distance[i] = params@gapExtension
      }
      else{
        distance[i] = params@gapOpen
      }
    }
    else {
      distance[i] = params@smatrix[i1,i2]
    }
    
  }

  s = sum(distance)
  return(s)

}

sHammingDist <- function(s1, s2, params) {
  distance <- sum(sapply(1:min(length(s1),length(s2)), FUN=function(x) {as.double(s1[x] != s2[x])}))
  distance <- distance + abs(length(s1) - length(s2))
  return(as.double(distance))
}

dist.Sequences <- function(seqs, method="substitution", params=default.MetricParams, ...) {
  seqs <- seqs@.Data
  
  if(method == "substitution" || method=="euclidian") {
    dist <- sdist
  }
  if(method == "hamming") {
    dist <- sHammingDist
  }
  dvec <- vector(mode="numeric", length=nrow(seqs) * (nrow(seqs) - 1) / 2)
  for(i in 1:(nrow(seqs) - 1)){
    for(j in (i + 1):nrow(seqs)) {
      dvec[nrow(seqs) * (i - 1) - i  * (i - 1) / 2 + j - i] <- dist(seqs[i,], seqs[j,], params)
    }
  }
  #Make it a dissimilarity matrix
  dvec <- max(dvec) - dvec

  attr(dvec, "Size") <- nrow(seqs)
  attr(dvec, "Labels") <- rownames(seqs)
  attr(dvec, "Diag") <- FALSE
  attr(dvec, "Upper") <- FALSE
  attr(dvec, "method") <- method
  attr(dvec, "call") <- match.call()
  class(dvec) <- "dist"
    
  return(dvec)
  
}

setGeneric("dist")
setMethod("dist", "Sequences", function(x, method="euclidian", diag=F, upper=F, p=2) dist.Sequences(x, method))
setMethod("[", signature=c("Sequences"), definition=function(x, i, j, ..., drop) {
  if(missing(j)) {
    new("Sequences", x@.Data[i,], alphabet=x@alphabet, nseqs=length(i))
  } else if(missing(i)) {
    new("Sequences", x@.Data[,j], alphabet=x@alphabet, nseqs=0)
  } else {
    new("Sequences", x@.Data[i,j], alphabet=x@alphabet, nseqs=0)
  }
} )


wdist <- function(seqmatrix, sweights=NULL, dist=sdist, params=default.MetricParams) {
  if(is.null(sweights)){
    sweights <- createSWeights(seqmatrix)
  }		      

  dmatrix <- sdist(seqmatrix, dist, params)
  dmatrix <- dmatrix * (sweights %*% t(sweights))
  return(dmatrix)
}

explode <- function(string, max, alphabet) {
  result <- strsplit(string, split="")[[1]]
  while(length(result) < max) {
    result <- c(result,"-")
  }
  result <- sapply(result, FUN=function(x){if(!(x %in% alphabet)) {"B"} else {x}})
  return(result)
}

processError <- function(e, mymessage) {
  cat(paste(mymessage, "\n"))
}

read.sequences <- function(file, header = FALSE, sep = "", quote="\"", dec=".",
                 fill = FALSE, comment.char="", alphabet=aabet) {
  
  #assumes that the first column is the sequence information

  #Try loading the file
  tryCatch(data <- read.table(file, header=header, sep=sep, quote=quote, dec=dec, fill=fill, comment.char=comment.char), error=function(e) {processError(e, "Could not read file")})
  
  #Ok, continue to spread out sequences
  t <- tryCatch(data <- toupper(as.character(data[,1])), error=function(e) {processError(e, "Non-letters in sequence file")})

  if(inherits(t, "try-error")) return
  
  nseqs <- length(unique(data))
  t <- tryCatch(maxlength <- max(sapply(data, FUN=function(x) {length(unlist(strsplit(x, split="")))})), error=function(e) {processError(e, "Failed to split sequence into characters")})

  if(inherits(t, "try-error")) return
  
 t <- tryCatch( seqmatrix <- matrix(sapply(data, FUN=function(x) {explode(x, maxlength, alphabet)}), nrow=length(data), byrow=TRUE), error=function(e) {processError(e, "Failed to process non-cannonical amino acids")})

  if(inherits(t, "try-error")) return

  t <- tryCatch(seqmatrix <- apply(seqmatrix, MARGIN=2, FUN=function(x) {sapply(x, FUN=function(y) {which(alphabet == as.character(y))})}), error=function(e) {processError(e, "Failed to convert to numerical representation")})

  if(inherits(t, "try-error")) return

  rnames <- apply(seqmatrix, MARGIN=1, FUN=function(x) {paste(alphabet[x], collapse="")})
  for(i in 1:(length(rnames) - 1)) {
    if(rnames[i] %in% rnames[(i + 1):length(rnames)]) {
    rnames[i] <- paste(rnames[i], ".", sum(rnames[i] == rnames[(i + 1):length(rnames)]), sep="")
  }
  }

  rownames(seqmatrix) <- rnames


  seqs <- new("Sequences",  seqmatrix, alphabet=alphabet, nseqs=nseqs)

  
  return(seqs)
}

read.fasta <- function(file, header = FALSE, sep = "", quote="\"", dec=".",
                 fill = FALSE, alphabet=aabet) {
  return(read.sequences(file, header, sep, quote, dec, fill, comment.char=">", alphabet))
}


#this will allow the writing of sequences to a file, utilizing R's
#built in write.table method If you pass a motifModel along with the
#sequences, then it will align and output the motifs from the
#sequences.
write.sequences <- function(seqs, motifModel = NULL, file = "", append = FALSE) {

  if(!is.null(motifModel)) {
    
    outSeqs <- matrix(0, ncol=motifModel@width, nrow=nrow(seqs))
    if(class(motifModel) == "SSOOPS") {
      startPos <- which.max(motifModel@zvector)
      
      outSeqs <- seqs[, startPos:(startPos + motifModel@width - 1)]
    } else {
      for(i in 1:nrow(seqs)) {
        startPos <- which.max(motifModel@zmatrix[i, ])
        outSeqs[i,] <- seqs[i, startPos:(startPos + motifModel@width)]
      }
    }
    seqs <- outSeqs
  } 

  
  output <- apply(seqs, MARGIN=1, FUN=function(x) {paste(seqs@alphabet[x], collapse="")})
  
  write.table(output, file=file, append=append, row.names=FALSE, col.names=FALSE)
  
}

write.fasta <- function(seqs, motifModel = NULL, file = "",  eol = "\n") {


  if(!is.null(motifModel)) {
    
    outSeqs <- matrix(0, ncol=motifModel@width, nrow=nrow(seqs))
    if(class(motifModel) == "SSOOPS") {
      startPos <- which.max(motifModel@zvector)
      
      outSeqs <- seqs[, startPos:(startPos + motifModel@width - 1)]
    } else {
      for(i in 1:nrow(seqs)) {
        startPos <- which.max(motifModel@zmatrix[i, ])
        outSeqs[i,] <- seqs[i, startPos:(startPos + motifModel@width)]
      }
    }
    seqs <- outSeqs
  }

  output <- apply(seqs, MARGIN=1, FUN=function(x) {paste(seqs@alphabet[x], collapse="")})

  for(i in 1:length(output)) {
    
    write(paste("> Sequence", i), file=file, append=T)
    write(paste(output[i]), file=file, append=T)
    
  }

}

plot.Sequences <- function(seqs, clusterNumber=3, params=default.MetricParams, distanceMatrix=dist.Sequences(seqs, params=params), clusters=aclust(distanceMatrix, clusterNumber), legendText=c(), main="") {

  #This makes it so that the coloring follows the size of the clusters. Makes plots reproducible since the
  #cluster indices from kmeans are not reproducible.
  idmap <- order(as.integer(lapply(clusters, length)))
    
  colors <- rep("black", nrow(seqs))
  shades <- hcl(h=1:clusterNumber * (360 / clusterNumber), c=90, l=70 )
  for(i in 1:length(clusters)) {
    colors[clusters[[idmap[i]]]] <- shades[i]
  }
  
#  fit <- cmdscale(distanceMatrix, eig=T, k=2)
  fit <- prcomp(distanceMatrix)
  x <- fit$x[,1]
  y <- fit$x[,2]
  plot(x,y,xlab="Component 1", ylab="Component 2", col=colors, main=main)
  if(length(legendText) > 0) {
    legend(max(x), max(y) * 1.25, legendText, col=shades,text.col="black", pch=rep(1, length(legendText)), xpd=TRUE)
  }
}

setGeneric("plot")
setMethod("plot", "Sequences", function(x,y,...) plot.Sequences(x, ...))

createSWeights <- function(seqmatrix, params=default.MetricParams) {

  wmatrix <- matrix(rep(0,nrow(params@smatrix) * ncol(seqmatrix)), nrow=nrow(params@smatrix))
  rownames(wmatrix) <- rownames(params@smatrix)
  tables <- apply(seqmatrix, MARGIN=2, FUN=function(x) table(x))
  for(i in 1:ncol(wmatrix)) {
    for(j in 1:length(tables[[i]])) {
      wmatrix[names(tables[[i]])[j], i] <-  1. / (length(tables[[i]]) * tables[[i]][j])
    }
  }

  sweights <- rep(0,nrow(seqmatrix))
  for(i in 1:nrow(seqmatrix)) {
    for(j in 1:ncol(seqmatrix)) {
      sweights[i] <- sweights[i] + wmatrix[seqmatrix[i,j], j]
    }
  }

  return(sweights)
}

motifModelSet <- function(seqs, motifNumber=NA, type="fixed", width=4, verbose=T, clusterType="kmeans", maxGuess=8, plotMain="")  {

  #can't have a motif wider than the sequences
  if(width > ncol(seqs)) {
    width <- ncol(seqs)
  }

  #check if the number of sequences is accurate
  if(seqs@nseqs == 0) {
    seqs@nseqs <- length(unique(apply(seqs@.Data, MARGIN=1, FUN=function(x) {paste(x,collapse="")})))
  }
  
  if(is.na(motifNumber)) {
    cat("Guessing cluster number, this could take a while...\n")

    #can't have more clusters than unique sequences
    if(maxGuess > seqs@nseqs) {
      maxGuess <- seqs@nseqs
    }
    
    ll <- data.frame(clusterN=1:maxGuess, logLik=rep(0, maxGuess))
    for(i in 1:maxGuess) {
      cat(paste("\r", i, "/", maxGuess))
      ll$logLik[i] <- logLik(motifModelSet(seqs, i, type, width, verbose, clusterType))
    }
    cat("\n")
    plot(ll, xlab="Cluster Number", col="black", main=plotMain)
    lines(ll, col="black", lty=1)
    motifNumber <- ll$clusterN[which.min(ll$logLik)]
  }else if(motifNumber == 1) {
    return(new("MotifModelSet", motifs=list(motifModel(seqs, type, width))))
  }

  if(motifNumber > seqs@nseqs) {
    cat(paste("Warning: must have more unique sequences than motifs. Lowering motif number to", seqs@nseqs))
    motifNumber <- seqs@nseqs
  }

  clusters <- aclust(dist(seqs), clusterNumber=motifNumber, verbose=verbose, type=clusterType)
  seqList <- vector(mode="list", length=motifNumber)
  
  for(i in 1:motifNumber) {
    seqList[[i]] <- new("Sequences", seqs[clusters[[i]],], alphabet=seqs@alphabet)
  }
  
  result <- lapply(seqList, FUN=function(x) { motifModel(x, type, width) })
  mset <- new("MotifModelSet", motifs=result)
  
  return(mset)
}

motifModel <- function(seqs, type="fixed", width=4) {

  if(width > ncol(seqs)) {
    width <- ncol(seqs)
  }

  #check if the number of sequences is accurate
  if(seqs@nseqs == 0) {
    seqs@nseqs <- length(unique(apply(seqs@.Data, MARGIN=1, FUN=function(x) {paste(x,collapse="")})))
  }
  
  if(class(seqs) == "matrix") {

    seqs <- new("Sequences", seqs, alphabet=aabet, nseqs=seqs@nseqs)
    
  }
  
  if(type == "fixed") {
    model <- factory.SSOOPS(seqs, width=width)
  }
  if(type == "variable") {
    model <- factory.OOPS(seqs, width=width)
  }
  if(type == "optional") {
    model <- factory.ZOOPS(seqs, width=width)
  }

  convergence <- 10
  while(convergence > 10**-3) {
    temp <- EMStep(model,seqs)
    convergence <- sum((model@mmodel - temp@mmodel)**2 / (model@mmodel)**2)
    model <- temp
  }
  
  return(model)
}

predict.MotifModel <- function(object, newSequences=object@seqs, ...) {

  logodds <- matrix(object@mmodel, ncol=object@width)
  for(i in 1:nrow(logodds)) {
    logodds[i,] <- log(logodds[i,] / object@bmodel[i])
  }

  result <- rep(NA, nrow(newSequences))
  for(i in 1:length(result)) {
    max <- -10**200
    for(j in 1:(ncol(newSequences) -  object@width)) {
      lsum <- 0
      if((ncol(newSequences) -  object@width) == 0) {
        for(k in 1:ncol(newSequences)) {
          if(newSequences[i,k] < length(newSequences@alphabet)) {
            lsum <- lsum + logodds[newSequences[i,k], k]
          }
          else {
            lsum <- max - 1
            break
          }
        }
      }
      else {
        for(k in j:(j + object@width - 1)) {
          if(newSequences[i,k] < length(newSequences@alphabet)) {
            lsum <- lsum + logodds[newSequences[i,k],k - j + 1]
          }
          else {
            lsum <- max - 1
            break
          }
        }
      }
      if(lsum > max) {
        max <- lsum
      }
    }
    result[i] <- max
  }

  return(result)
}


predict.MotifModelSet <- function(model, newSequences) {

  result <- rep(NA, nrow(newSequences))

  prds <- matrix(rep(NA, length(model@motifs) * nrow(newSequences)), nrow=nrow(newSequences))
  
  for(i in 1:length(model@motifs)) {
    prds[,i] <- predict(model@motifs[[i]], newSequences)
  }
  
  for(i in 1:nrow(newSequences)) {

    result[i] <- max(prds[i,])
    
  }
  
  return(result)
}

classify.MotifModelSet <- function(motifSet, newSequences, threshold = 0) {

  predictions <- lapply(motifSet@motifs, FUN=function(x) {predict(x, newSequences)})
  pmatrix <- matrix(rep(NA, nrow(newSequences) * length(motifSet@motifs)), nrow=nrow(newSequences))

  for(i in 1:length(motifSet@motifs)){
    pmatrix[,i] <- predictions[[i]]
  }
 
  classes <- apply(pmatrix, MARGIN=1, FUN=function(x) {
    if(is.na(threshold) || max(x) > threshold) which.max(x)
    else -1
  })
  
  return(classes)
}


setGeneric("classify", function(model, ...) standardGeneric("classify"))
setMethod("classify", "MotifModelSet", function(model,...) classify.MotifModelSet(model,...))

setGeneric("predict")
setMethod("predict", "MotifModel", function(object,...) predict.MotifModel(object,... ))
setMethod("predict", "MotifModelSet", function(object,...) predict.MotifModelSet(object,... ))

EM.SSOOPS.Linked <- function(fullSeqMatrix, seqids, models) {

  bcounts <- rep(1, length(models[[1]]@bmodel))
  bsum <- 0
  
  for(mindex in 1:length(models)) {

    model <- models[[mindex]]
    seqmatrix <- fullSeqMatrix[seqids[[mindex]],]@.Data
    
    m <- (ncol(seqmatrix) - model@width + 1)
    n <- nrow(seqmatrix)
    ztrial <- rep(1., m)
    mcounts <- matrix(rep(0,length(model@mmodel)), dim(model@mmodel))
    names(bcounts) <- aabet
    rownames(mcounts) <- aabet
    
    msums <- rep(0, ncol(mcounts))
    psum <- 0
    
    for(i in 1:n) {
      for(j in 1:m) {
        for(k in 1:ncol(seqmatrix)) {
          if(k >= j && k < j + model@width) {
            ztrial[j] <- ztrial[j] * model@mmodel[seqmatrix[i,k],k - j + 1]
          }
          else {
            ztrial[j] <- ztrial[j] *  model@bmodel[ seqmatrix[i,k] ]
          }
          
        }
      }
      if(sum(ztrial) > 0.) {
        ztrial <- ztrial / sum(ztrial)
      }
    }

    
    for(i in 1:n) {
      for(j in 1:m) {
        for(k in 1:ncol(seqmatrix)) {
          if(k >= j && k < j + model@width) {
            mcounts[seqmatrix[i,k],k - j + 1 ] <- mcounts[ seqmatrix[i,k], k - j + 1] +  ztrial[j]
            msums[k - j + 1] <- msums[k - j + 1] + ztrial[j]
          } else {
            bcounts[seqmatrix[i,k]] <- bcounts[seqmatrix[i,k]] + 1. - ztrial[j]
            bsum <- bsum + 1. - ztrial[j]
          }
        }
      }
    }
    
    for(i in 1:nrow(model@mmodel)) {
      for(j in 1:ncol(model@mmodel)) {
        models[[mindex]]@mmodel[i, j] <- mcounts[i,j] / msums[j]
      }
    }
    
    models[[mindex]]@zvector <- ztrial
    
  }

  #I don't think this is the correct background model, since each amino acid is represented equally in the peptide library.
#  for(i in 1:length(models)) {
#    for(j in 1:length(models[[i]]@bmodel)) {
#      models[[i]]@bmodel[j] <- bcounts[j] / bsum      
#    }
#  }

  return(models)
  
}


EMStep.SSOOPS <- function(model, seqs) {

  alphabet <- seqs@alphabet

  seqmatrix <- seqs@.Data

  m <- (ncol(seqmatrix) - model@width + 1)
  n <- nrow(seqmatrix)
  
  pseudocount <- length(alphabet)
  if(pseudocount > nrow(seqmatrix)) {
    pseudocount <- nrow(seqmatrix)
  }
  
  ztrial <- rep(1., m)
  bcounts <- rep(pseudocount / length(alphabet),length(model@bmodel))
  mcounts <- matrix(rep(pseudocount / length(alphabet),length(model@mmodel)), dim(model@mmodel))
  names(bcounts) <- alphabet
  rownames(mcounts) <- alphabet

  bsum <- pseudocount
  msums <- rep(pseudocount, ncol(mcounts))
  psum <- 0

  for(i in 1:n) {
    for(j in 1:m) {
      for(k in 1:ncol(seqmatrix)) {
	if(k >= j && k < j + model@width && seqmatrix[i,k] <= nrow(mcounts)) {
          ztrial[j] <- ztrial[j] * model@mmodel[seqmatrix[i,k],k - j + 1]
	}
	else {
          ztrial[j] <- ztrial[j] *  model@bmodel[ seqmatrix[i,k] ]
	}
        
      }
    }
    ztrial <- ztrial / sum(ztrial)
  }
  
  for(i in 1:n) {
     for(j in 1:m) {
       for(k in 1:ncol(seqmatrix)) {
         if(k >= j && k < j + model@width) {
           if(seqmatrix[i,k] <= nrow(mcounts)) {
             mcounts[seqmatrix[i,k],k - j + 1 ] <- mcounts[ seqmatrix[i,k], k - j + 1] +  ztrial[j]
             msums[k - j + 1] <- msums[k - j + 1] + ztrial[j]
           }
          } else {
  	      bcounts[seqmatrix[i,k]] <- bcounts[seqmatrix[i,k]] + 1. - ztrial[j]
              bsum <- bsum + 1. - ztrial[j]
          }
       }
 }
   }
  #I don't think this is the correct background model, since each amino acid is represented equally in the peptide library.
#  for(i in 1:length(model@bmodel)) {
#    model@bmodel[i] <- bcounts[i] / bsum
#  }
  
  for(i in 1:nrow(model@mmodel)) {
    for(j in 1:ncol(model@mmodel)) {
      model@mmodel[i, j] <- mcounts[i,j] / msums[j]
    }
  }
   
   model@zvector <- ztrial

   return(model)
}


EMStep.OOPS <- function(model, seqs) {

  alphabet <- seqs@alphabet
  seqmatrix <- seqs@.Data
  
  m <- (ncol(seqmatrix) - model@width + 1)
  n <- nrow(seqmatrix)

  pseudocount <- length(alphabet)
  if(pseudocount > nrow(seqmatrix)) {
    pseudocount <- nrow(seqmatrix)
  }
  
  ztrial <- matrix(rep(1, m * n), nrow=n, ncol=m)
  bcounts <- rep(pseudocount / length(alphabet),length(model@bmodel))
  mcounts <- matrix(rep(pseudocount / length(alphabet), length(model@mmodel)), dim(model@mmodel))
  names(bcounts) <- alphabet
  rownames(mcounts) <- alphabet

  bsum <- pseudocount
  msums <- rep(pseudocount, ncol(mcounts))


  for(i in 1:n) {
    psum <- 0
    for(j in 1:m) {
      for(k in 1:ncol(seqmatrix)) {
	if(k >= j && k < j + model@width && seqmatrix[i,k] <= nrow(mcounts)) {
            ztrial[i,j] <- ztrial[i,j] * model@mmodel[seqmatrix[i,k],k - j + 1]
	}
	else {
          ztrial[i,j] <- ztrial[i,j] * model@bmodel[ seqmatrix[i,k] ]
	}
      }
      psum <- psum + ztrial[i,j]
    }
    ztrial[i,] <- ztrial[i,] / psum
     for(j in 1:m) {
       for(k in 1:ncol(seqmatrix)) {
         if(k >= j && k < j + model@width) {
           if(seqmatrix[i,k] <= nrow(mcounts)) {
             mcounts[seqmatrix[i,k],k - j + 1 ] <- mcounts[ seqmatrix[i,k], k - j + 1] +  ztrial[i,j]
             msums[k - j + 1] <- msums[k - j + 1] + ztrial[i,j]
           }
          } else {
  	      bcounts[seqmatrix[i,k]] <- bcounts[seqmatrix[i,k]] + 1. - ztrial[i,j]
              bsum <- bsum + 1. - ztrial[i,j]
          }
       }
     }
  }
  #I don't think this is the correct background model, since each amino acid is represented equally in the peptide library.
#  for(i in 1:length(model@bmodel)) {
#    model@bmodel[i] <- bcounts[i] / bsum
# }
  for(i in 1:nrow(model@mmodel)) {
    for(j in 1:ncol(model@mmodel)) {
      model@mmodel[i, j] <- mcounts[i,j] / msums[j]
    }
   }
   
   model@zmatrix <- ztrial

   return(model)
}

EMStep.ZOOPS <- function(model, seqs) {

  alphabet <- seqs@alphabet
  seqmatrix <- seqs@.Data
 

  m <- (ncol(seqmatrix) - model@width + 1)
  n <- nrow(seqmatrix)

  pseudocount <- length(alphabet)
  if(pseudocount > nrow(seqmatrix)) {
    pseudocount <- nrow(seqmatrix)
  }
  
  ztrial <- matrix(rep(1, m * n), nrow=n, ncol=m)
  bcounts <- rep(pseudocount / length(alphabet),length(model@bmodel))
  mcounts <- matrix(rep(pseudocount / length(alphabet),length(model@mmodel)), dim(model@mmodel))
  names(bcounts) <- alphabet
  rownames(mcounts) <- alphabet

  bsum <- pseudocount
  msums <- rep(pseudocount, ncol(mcounts))


  for(i in 1:n) {
    psum = 0
    noseq <- 1
    #calcultate probability of no occurence of the motif
    for(j in 1:ncol(seqmatrix)) {
      noseq <- noseq * model@bmodel[ seqmatrix[i,j] ]
    }
    for(j in 1:m) {
      for(k in 1:ncol(seqmatrix)) {
	if(k >= j && k < j + model@width && seqmatrix[i,k] <= nrow(mcounts)) {
          ztrial[i,j] <- ztrial[i,j] * model@mmodel[seqmatrix[i,k],k - j + 1]
	}
	else {
          ztrial[i,j] <- ztrial[i,j] * model@bmodel[ seqmatrix[i,k] ]
	}
      }
      psum <- psum + ztrial[i,j]
    }

    ztrial[i,] <- ztrial[i,] * model@gamma / (psum * model@gamma + noseq * m * (1 - model@gamma))
    model@qarray[i] <- sum(ztrial[i,])

     for(j in 1:m) {
       for(k in 1:ncol(seqmatrix)) {
         if(k >= j && k < j + model@width) {
           if(seqmatrix[i,k] <= nrow(mcounts)) {
             mcounts[seqmatrix[i,k],k - j + 1 ] <- mcounts[ seqmatrix[i,k], k - j + 1] +  ztrial[i,j]
             msums[k - j + 1] <- msums[k - j + 1] + ztrial[i,j]
           }
          } else {
	    bcounts[seqmatrix[i,k]] <- bcounts[seqmatrix[i,k]] + 1. - ztrial[i,j]
            bsum <- bsum + 1. - ztrial[i,j]
          }
       }
     }
  }
  #I don't think this is the correct background model, since each amino acid is represented equally in the peptide library.
#  for(i in 1:length(model@bmodel)) {
#    model@bmodel[i] <- bcounts[i] / bsum
#  }
  for(i in 1:nrow(model@mmodel)) {
    for(j in 1:ncol(model@mmodel)) {
      model@mmodel[i, j] <- mcounts[i,j] / msums[j]
    }
   }
   
   model@zmatrix <- ztrial

   model@gamma <- sum(model@qarray) / nrow(seqmatrix)

  
   return(model)
}


setGeneric("EMStep", def = function(model, seqs,...) standardGeneric("EMStep"))
setMethod("EMStep", "OOPS", EMStep.OOPS)
setMethod("EMStep", "SSOOPS", EMStep.SSOOPS)
setMethod("EMStep", "ZOOPS", EMStep.ZOOPS)

logLik.OOPS <- function(model) {

  logLik <- 0
  for(i in 1:nrow(model@seqs)) {
    pseq <- 0
    for(j in 1:ncol(model@seqs)) {
      if(j + model@width - 1 <= ncol(model@seqs)) {
        pj <- 0
        for(k in 1:model@width) {
          pj <- pj  + model@zmatrix[i,j] * log(model@mmodel[model@seqs@.Data[i,j + k - 1], k])
        }
        pseq <- pseq + pj
      }#PUT BACKGROUND BACK IN
    }
    logLik <- logLik + pseq
  }
  names(logLik) <- NULL
  return(logLik)
}

logLik.SSOOPS <- function(model) {

  logLik <- 0
  for(i in 1:nrow(model@seqs)) {
    pseq <- 0
    for(j in 1:ncol(model@seqs)) {
      if(j + model@width - 1 <= ncol(model@seqs)) {
        pj <- 0
        for(k in 1:ncol(model@seqs)) {
          if(k >= j && k < j + model@width) {
            pj <- pj  + model@zvector[j] * log(model@mmodel[model@seqs@.Data[i,k], k -j + 1])
          }
          else {
            pj <- pj + model@zvector[j] * log(model@bmodel[model@seqs@.Data[i,k]])
          }
        }
        pseq <- pseq + pj
      }
    }
    logLik <- logLik + pseq
  }
  names(logLik) <- NULL
  
  return(logLik)
}

logLik.ZOOPS <- function(model) {

  logLik <- 0
  for(i in 1:nrow(model@seqs)) {

    pseq <- 0
    #liklihood of sequence occuring
    for(j in 1:ncol(model@seqs)) {
      if(j + model@width - 1 <= ncol(model@seqs)) {
        pj <- 0
        for(k in 1:ncol(model@seqs)){
          if(k >= j && k < j + model@width) {
            pj <- pj  + model@zmatrix[i,j] * log(model@mmodel[model@seqs@.Data[i,k], k - j + 1])
          }
          else {
            pj <- pj + model@zmatrix[i,j] * log(model@bmodel[model@seqs@.Data[i,k]])
          }
        }
        pseq <- pseq + pj
      }
    }
    logLik <- logLik + pseq
    #liklihood of peptide being background
    for(j in 1:ncol(model@seqs)) {
      logLik <- logLik + (1 - model@qarray[i]) * log(model@bmodel[model@seqs@.Data[i,k]])
    }
  }
  names(logLik) <- NULL
  return(logLik)
}

BIC.MotifModel <- function(object) {

  motif <- object
  logLik <- logLik(motif)
  k <- k + motif@np
  n <- n + motif@seqs
  
  return(-2 * logLik + k * log(n))
}

BIC.MotifModelSet <- function(object) {

  mset <- object
  logLik <- 0
  k <- 0 #Number of parameters
  n <- 0 #sample size
  for(i in 1:length(mset@motifs)) {
    logLik <- logLik + logLik(mset@motifs[[i]])
    k <- k + mset@motifs[[i]]@np
    n <- n + nrow(mset@motifs[[i]]@seqs)
  }
  
  return(-2 * logLik + k * log(n))
}

logLik.MotifModelSet <- function(object) {


  mset <- object
  logLik <- 0
  for(i in 1:length(mset@motifs)) {
    logLik <- logLik + logLik(mset@motifs[[i]])
  }
  
  return(logLik)
}

setGeneric("logLik")
setMethod("logLik", "OOPS", function(object, ...) logLik.OOPS(object,...))
setMethod("logLik", "ZOOPS", function(object, ...) logLik.ZOOPS(object, ...))
setMethod("logLik", "SSOOPS", function(object, ...) logLik.SSOOPS(object, ...))
setMethod("logLik", "MotifModelSet", function(object, ...) logLik.MotifModelSet(object, ...))
setGeneric("BIC")
setMethod("BIC", "MotifModelSet", function(object) BIC.MotifModelSet(object))
setMethod("BIC", "MotifModel", function(object) BIC.MotifModel(object))


factory.OOPS <- function(seqs, width=ncol(seqs)) {
  alphabet <- seqs@alphabet
  
  model <- new("OOPS",
               zmatrix=matrix(rep(c(1,rep(0,ncol(seqs) - width + 1)), nrow(seqs)), nrow=nrow(seqs)),
               mmodel=matrix(rep(1.0/length(alphabet),length(alphabet) * width), ncol=width),
               bmodel=rep(1.0/length(alphabet),length(alphabet)), width=as.integer(width),
               seqs=seqs)

  model@np <- length(model@zmatrix) + length(model@mmodel)
  
  names(model@bmodel) <- alphabet
  rownames(model@mmodel) <- alphabet
  return(model)
}

factory.SSOOPS <- function(seqs, width=ncol(seqs)) {
  alphabet <- seqs@alphabet

  model <- new("SSOOPS",
               zvector=rep(1./(ncol(seqs) - width + 1.), ncol(seqs) - width + 1),
               mmodel=matrix(rep(1.0/length(alphabet),length(alphabet) * width), ncol=width),
               bmodel=rep(1.0/length(alphabet),length(alphabet)),
               width=as.integer(width),
               seqs=seqs)
  
  model@np <- length(model@zvector) + length(model@mmodel)
  
  names(model@bmodel) <- alphabet
  rownames(model@mmodel) <- alphabet
  return(model)
}

factory.ZOOPS <- function(seqs, width=ncol(seqs)) {
  alphabet <- seqs@alphabet

  model <- new("ZOOPS",
               zmatrix=matrix(rep(c(1,rep(0,ncol(seqs) - width + 1)), nrow(seqs)), nrow=nrow(seqs), byrow=T),
               mmodel=matrix(rep(1.0/length(alphabet),length(alphabet) * width), ncol=width),
               bmodel=rep(1.0/length(alphabet),length(alphabet)),
               width=as.integer(width),
               gamma=0.5,
               qarray=rep(0.5,nrow(seqs)),
               seqs=seqs)
  model@np <- length(model@zmatrix) + length(model@mmodel) + length(model@qarray)
  names(model@bmodel) <- alphabet
  rownames(model@mmodel) <- alphabet
  return(model)
}


print.Sequences <- function(seqs) {
  
}

print.MotifModel <- function(x,...) {

  model <- x
  
  #find where the motif begins
  ncolz <- 0
  if(class(model) == "SSOOPS") {
    zsum <- model@zvector
    ncolz <- length(model@zvector)
  } else {
    zsum <- apply(model@zmatrix, MARGIN=2, FUN=sum)
    ncolz <- ncol(model@zmatrix)
  }
  start <- which.max(zsum)

  mnum <- 4
  cut <- 0.15

  motifMode <- matrix(rep("", model@width * mnum), nrow=mnum)
  for(i in 1:ncol(model@mmodel)) {
    csort <- order(model@mmodel[,i], decreasing=T)
    motifMode[1,i] <- rownames(model@mmodel)[csort[1]]
    for(j in 2:mnum) {
      if(model@mmodel[csort[j],i] > cut) {
        motifMode[j,i] <- rownames(model@mmodel)[csort[j]]
      }
    }
  }
  motifModeStr <- apply(motifMode, MARGIN=2, FUN=function(x) {paste("[",paste(x,collapse=""),"]", sep="")})

  seqMode <- c(rep("-", start - 1), motifModeStr, rep("-", ncolz - start ))
  cat(paste("Number of sequences:", nrow(model@seqs)))
  cat(paste("\nMotif Mode:", paste(seqMode, collapse="")))
  cat("\n\nBackground:\n")
  print(sort(model@bmodel, decreasing=T)[1:5])
  
}

MotifModel.motifString <- function(model)  {

  #Check for trivial case
  if(nrow(model@seqs) == 1) {
    return(model@seqs[1])
  }
  
  #find where the motif begins
  ncolz <- 0
  if(class(model) == "SSOOPS") {
    zsum <- model@zvector
    ncolz <- length(model@zvector)
  } else {
    zsum <- apply(model@zmatrix, MARGIN=2, FUN=sum)
    ncolz <- ncol(model@zmatrix)
  }
  start <- which.max(zsum)

  mnum <- 4
  cut <- 0.15

  motifMode <- matrix(rep("", model@width * mnum), nrow=mnum)
  for(i in 1:ncol(model@mmodel)) {
    csort <- order(model@mmodel[,i], decreasing=T)
    motifMode[1,i] <- rownames(model@mmodel)[csort[1]]
    for(j in 2:mnum) {
      if(model@mmodel[csort[j],i] > cut) {
        motifMode[j,i] <- rownames(model@mmodel)[csort[j]]
      }
    }
  }
  motifModeStr <- apply(motifMode, MARGIN=2, FUN=function(x) {paste("[",paste(x,collapse=""),"]", sep="")})

  seqMode <- c(rep("-", start - 1), motifModeStr, rep("-", ncolz - start ))
  return(paste(seqMode, collapse=""))
   
}

setGeneric("print")
setMethod("print", "MotifModel", function(x,...) print.MotifModel(x))

setGeneric("motifString", def = function(x,...) standardGeneric("motifString"))
setMethod("motifString", "MotifModel", function(x,...) MotifModel.motifString(x))


plot.MotifModelSet <- function(x,...) {

  model <- x
  clusterNumber <- length(model@motifs)

  clusters <- vector("list", clusterNumber)
  legendText <- vector("character", clusterNumber)
  
  seqs.data <- matrix(0,nrow=sum(sapply(1:clusterNumber, function(x) nrow(model@motifs[[x]]@seqs))), ncol=ncol(model@motifs[[1]]@seqs))

  counter <- 1
  for(i in 1:clusterNumber) {
    clusters[[i]] <- counter:(counter - 1 + nrow(model@motifs[[i]]@seqs))
    counter <- counter + nrow(model@motifs[[i]]@seqs)
    seqs.data[clusters[[i]],] <- model@motifs[[i]]@seqs@.Data
    legendText[i] <- motifString(model@motifs[[i]])


  }

  seqs <- new("Sequences", seqs.data,alphabet=model@motifs[[1]]@seqs@alphabet)


  colors <- rep("black", nrow(seqs))
  shades <- hcl(h=1:clusterNumber * (360 / clusterNumber), c=90, l=70 )
  for(i in 1:length(clusters)) {
    colors[clusters[[i]]] <- shades[i]
  }

  fit <- prcomp(dist(seqs))
  x <- fit$x[,1]
  y <- fit$x[,2]
  par(mar=c(5,4,2,5))
  plot(x,y,xlab="Component 1", ylab="Component 2", col=colors)

  legend(max(x) / 2, max(y) * 1.25, legendText, col=shades,text.col="black", pch=rep(1, clusterNumber), xpd=TRUE)


  
}

MotifModel.plotStartingPosition <- function(motifModel) {

  par(fg="dark gray")
  if(class(motifModel) == "SSOOPS") {
    zsum <- motifModel@zvector
  } else {
    zsum <- apply(motifModel@zmatrix, MARGIN=2, FUN=sum)
  }
  barplot(zsum / sum(zsum), names.arg=as.character(1:length(zsum)), main="", col=hcl(h=1:length(zsum) * (360 / length(zsum))), border="black")

}

MotifModel.plotPositions <- function(motifModel) {

  par(mfrow=c(ceiling(motifModel@width / 2),2), mar=c(3,3,2,2), cex=0.7)
  for(i in 1:motifModel@width) {
    barx <- barplot(motifModel@mmodel[,i], main=paste("Position", i), col=hcl(h=1:ncol(motifModel@seqs) * (360 / ncol(motifModel@seqs))), border=NA, xaxt="n", ylim=c(0,max(motifModel@mmodel[,i]) * 1.3))
    #Find the highest three bars and label those
    morder <- rev(order(motifModel@mmodel[,i]))[1:3]
    text(barx[morder], motifModel@mmodel[morder,i], rownames(motifModel@mmodel)[morder],
         pos=3, col="black")
  }
  par(mfrow=c(1,1))
}

MotifModel.plotFits <- function(motifModel) {

  
  predictions <- sort(predict(motifModel), decreasing=T)
  ncol <- 50
  colsGood <- colorRampPalette(c("white", "green", "green"))(50)
  colsBad <-  colorRampPalette(c("red", "red", "white"))(50)
  cols <- c()
  if(sum(predictions > 0) > 0) {
    cols <- c(colsGood[cut(predictions[predictions > 0], breaks=ncol, labels=F)])
  }
  if(sum(predictions < 0) > 0) {
    cols <- c(cols, colsBad[cut(predictions[predictions < 0], breaks=ncol, labels=F)])
  }
  barplot(predictions, names.arg=NULL, main="", col=cols)

}

setGeneric("plotFits", function(motifModel) standardGeneric("plotFits"))
setMethod("plotFits", "MotifModel", MotifModel.plotFits)

setGeneric("plotStartingPosition", function(motifModel) standardGeneric("plotStartingPosition"))
setMethod("plotStartingPosition", "MotifModel", MotifModel.plotStartingPosition)

setGeneric("plotPositions", function(motifModel) standardGeneric("plotPositions"))
setMethod("plotPositions", "MotifModel", MotifModel.plotPositions)

setMethod("plot", "MotifModelSet", function(x, y, ...) plot.MotifModelSet(x,...))

plot.MotifModel <- function(x,...) {


  if((class(x) == "SSOOPS" && x@width < length(x@zvector)) || (class(x) != "SSOOPS" && x@width < nrow(x@zmatrix))){
    #plot motif start
    if(class(x) == "SSOOPS") {
      zsum <- x@zvector
    } else {
      zsum <- apply(x@zmatrix, MARGIN=2, FUN=sum)
    }
    barplot(zsum, names.arg=as.character(1:length(zsum)), main=paste("Motif Starting Position"), col="gray30")
  }
  
  par(mfrow=c(3,1), mar=c(3,3,2,2), ask=TRUE)
  for(i in 1:x@width) {
    barplot(x@mmodel[,i], main=paste("Motif Position", i), col="gray30")
  }
  par(mfrow=c(1,1))
  predictions <- sort(predict(x), decreasing=T)
  ncol <- 50
  colsGood <- colorRampPalette(c("white", "green", "green"))(50)
  colsBad <-  colorRampPalette(c("red", "red", "white"))(50)
  cols <- c()
  if(sum(predictions > 0) > 0) {
    cols <- c(colsGood[cut(predictions[predictions > 0], breaks=ncol, labels=F)])
  }
  if(sum(predictions < 0) > 0) {
    cols <- c(cols, colsBad[cut(predictions[predictions < 0], breaks=ncol, labels=F)])
  }
  barplot(predictions, names.arg=NULL, main="Log Odds (Fit)", col=cols)
}

setMethod("plot", "MotifModel", function(x, y, ...) plot.MotifModel(x,...))

aclust <- function(sDistMatrix, clusterNumber, verbose=T, type="kmeans", knstart=20) {	   
  
  if(class(sDistMatrix) == "Sequences") {
    sDistMatrix <- dist(sDistMatrix)
  }

  if(type != "kmeans" && type != "agglomerative") {
    stop("type must be either \"kmeans\" or \"agglomerative\"")
  }
  
  if(type == "kmeans") {

    km <- kmeans(sDistMatrix, centers=clusterNumber, nstart=knstart)
    result <- vector(mode="list", length=clusterNumber)
    for(i in 1:clusterNumber) {
      result[[i]] <- which(km$cluster == i)
    }

    return(result)
    
  }
  
  
  if(class(sDistMatrix) == "dist") {
    N <- attr(sDistMatrix,"Size")
    temp <- matrix(rep(NA, N ** 2), nrow=N, ncol=N)
    for(i in 1:(nrow(temp) - 1)) {
      for(j in (i + 1):ncol(temp)) {
        temp[i,j] <- sDistMatrix[N * (i - 1) - i  * (i - 1) / 2 + j - i]
        temp[j,i] <- temp[i,j]
      }
    }

    sDistMatrix <- temp
    
  }

  if(clusterNumber == 1) {
    return(list(1:nrow(sDistMatrix)))
  }

  if(verbose) {
    cat("Finding clusters...\n")
  }

  cn <- nrow(sDistMatrix)

  #while cluster number is greater than clusterNumber
  clusters <- as.list(1:nrow(sDistMatrix))

  while(cn > clusterNumber && cn > 1) {

    #Find minimum element in sDistMatrix
    wmin <- findMinDistElement(sDistMatrix)

    #find which cluster that element refers to
    cA <- 0
    cB <- 0
    for(i in 1:length(clusters)) {
      if(wmin[1] %in% clusters[[i]] || wmin[2] %in% clusters[[i]]) {
        if(cA == 0) {
	  cA <- i
	}else{
	  cB <- i
	}
      }
      if(cA != 0 && cB != 0) {
       break
      }
    }

   

   #clobber rows/columns in sDistMatrix with minimum element
   ca1 <- clusters[[cA]][1]
   cb1 <- clusters[[cB]][1]
   for(i in 1:ncol(sDistMatrix)) {

     #is this a within cluster distance?
     if(!(i %in% c(clusters[[cA]], clusters[[cB]]))) {
       cmin <- cA
       cmax <- cB
       if(sDistMatrix[min(ca1,i), max(ca1,i)] < sDistMatrix[min(cb1,i), max(cb1,i)]) {
         cmin <- cB
         cmax <- cA
       }
       for(j in 1:length(clusters[[cmax]])) {
         sDistMatrix[min(clusters[[cmax]][j], i), max(clusters[[cmax]][j], i)] <- sDistMatrix[min(clusters[[cmin]][1], i), max(clusters[[cmin]][1], i)]
       }
     }
  
   }

  
    #set within cluster distance to NA
    for(i in 1:length(clusters[[cA]])) {
      for(j in 1:length(clusters[[cB]])) {
        if(clusters[[cA]][i] > clusters[[cB]][j]) {
          sDistMatrix[clusters[[cB]][j],clusters[[cA]][i]] <- NA
	} else {
	  sDistMatrix[clusters[[cA]][i],clusters[[cB]][j]] <- NA
	}
      }
    }

    #join two ID sets
  
    clusters[[cA]] <- c(clusters[[cA]], clusters[[cB]])
    clusters <- clusters[-cB]

    cn <- length(clusters)
    if(verbose) {
      cat("\r")
      cat(paste(cn, "   "))
    }

  }

  return(clusters)

}


changeClusterFormat <- function(clusterList) {
  seqnumber <- 0
  for(i in 1:length(clusterList)) {
    seqnumber <- seqnumber + length(clusterList[[i]])
  }
  clusters <- rep(0,seqnumber)
  for(i in 1:seqnumber) {
    for(j in 1:length(clusterList)) {
      if(i %in% clusterList[[j]]) {
        clusters[i] = j
        break
      }
    }
  }
  return(clusters)
}

findMinDistElement <- function(sDistMatrix) {

  minimum <- NA			
  wmin <- c(0,0)

  for(i in 1:(nrow(sDistMatrix) - 1)) {
    for(j in (i + 1):ncol(sDistMatrix)) {
      if(is.na(minimum) || (!is.na(sDistMatrix[i,j]) &&  sDistMatrix[i,j] < minimum)) {
        wmin <- c(i,j)
	minimum <- sDistMatrix[i,j]
      }
    }
  }
  return(wmin)

}

simpleDescriptors <- function(seqs, response=numeric(0), include.statistics=FALSE) {

  desc <- descriptors(seqs, response, base.frame=defaultBaseMatrix[,c("count.BasicGroups", "count.AcidicGroups", "count.PolarGroups", "count.NonPolarGroups", "count.AromaticGroups", "count.ChargedGroups",  "ALogP")], do.var=F,
  alags=c(), do.counts=F, do.mean=T, do.position=F,
  include.statistics=include.statistics, accuracy=0.001)


  return(desc)
  
}

descriptors <- function(seqs, response=numeric(0), base.frame=NA, do.var=TRUE, alags=c(1,2,3), do.mean=TRUE, do.counts=FALSE, do.position=TRUE, alphabet=seqs@alphabet, include.statistics=TRUE, accuracy=0.01) {

  
  if(include.statistics) {
    if(ncol(seqs) >= 10) {
      cat("Warning: the sequence space is very large for calculating statistics")
    }
  }
  
  if(length(base.frame) == 1 && is.na(base.frame)) {
    base.frame <- defaultBaseMatrix
  }

  base.matrix <- data.matrix(base.frame)


  #For speed, create matrix vesion of sequences
  seqmatrix <- data.matrix(seqs@.Data)
  
  
  #get number of descriptors
  base.num <- ncol(base.frame)
  multiplier <- 0
  if(!is.null(alags)) {
    multiplier <- length(alags)
  }
  if(do.var)
    multiplier <- multiplier + 1
  if(do.mean)
    multiplier <- multiplier + 1
  if(do.position)
    multiplier <- multiplier + ncol(seqmatrix)


  desc.num <- base.num * multiplier
  if(do.counts)
    desc.num <- desc.num + length(alphabet)

  #Build the names first
  dnames <- rep("",desc.num)
  index <- 1
  for(i in 1:base.num) {
    if(do.mean) {
      dnames[index] <- paste(names(base.frame)[i], 'AVG', sep=".")
      index <- index + 1
    }
    if(do.var) {
      dnames[index] <- paste(names(base.frame)[i], 'VAR', sep=".")
      index <- index + 1
    }
    if(!is.null(alags)) {
      for(j in 1:length(alags)) {
        dnames[index] <- paste(names(base.frame)[i], 'AUTO', alags[j], sep=".")
        index <- index + 1
      }
    }
    if(do.position) {
      for(j in 1:ncol(seqmatrix)) {
        dnames[index] <- paste(names(base.frame)[i], 'P',j,sep=".")
        index <- index + 1
      }
    }
  }

  if(do.counts) {
    count.start.index <- index
    for(i in 1:length(alphabet)) {
      dnames[index] <- paste('NUM', alphabet[i], sep=".")
      index <- index + 1
    }
  }
  

  
  #Now calculate the descriptors
  desc <- empty.matrix(dnames, rownames(seqmatrix))
  finished <- 0
  res <- rep(NA, base.num * multiplier)
  for(i in 1:nrow(seqmatrix)) {
    index <- 1
    for(j in 1:base.num) {
      
      cur.desc <- base.matrix[seqmatrix[i,], j]
      cur.mean <- mean(cur.desc)
      cur.var <- var(cur.desc)
      
      if(do.mean) {
        res[index] <- cur.mean
        index <- index + 1
      }
      if(do.var) {
        res[index] <- cur.var
        index <- index + 1
      }
      if(!is.null(alags)) {
        for(k in 1:length(alags)) {
          res[index] <- autocorrelation(cur.desc, cur.mean, cur.var, lag=alags[k])
          index <- index + 1
        }
      }
      if(do.position){
        for(k in 1:ncol(seqmatrix)) {
          res[index] <- cur.desc[k]
          index <- index + 1
        }
      }
    }
    if(do.counts) {
      for(j in 1:length(alphabet)) {
        res[index] <- sum(seqmatrix[i,] == j)
        index <- index + 1
      }
    }
    desc[i,] <- res
  }

  cat("\n")


  #Remove non-varying descriptors
  desc.var <- apply(desc, MARGIN=2, FUN=var)

  if(sum(which(desc.var == 0)) > 0) {
    if(do.counts) {
      if(sum(which(desc.var[1:count.start.index] == 0)) > 0) {
        desc <- desc[-which(desc.var[1:count.start.index] == 0)]
      }
    }
    else {
      desc <- desc[-which(desc.var == 0)]
    }
  }

  desc <- factory.descriptor(desc)
  desc@response=response


  #now, calculate means for each row
  if(include.statistics) {

    cat("Standard Error and variances are currently unimplemented\nCalculating Means..\n.")

    dseqs <- decoys(seqs, 500)
    ddesc <- matrix(0, nrow=nrow(dseqs), ncol=ncol(desc))
    for(i in 1:nrow(dseqs)) {
      index <- 1
      for(j in 1:base.num) {
        cat(paste("\r", format((base.num * multiplier * (i- 1) + index) * 100 / (base.num * multiplier * nrow(dseqs)),width=4, digits=3), "%        "))

        cur.desc <- base.matrix[dseqs[i,], j]
        cur.mean <- mean(cur.desc)
        cur.var <- var(cur.desc)

        if(do.mean) {
          ddesc[i, index] <- cur.mean
          index <- index + 1
        }
        if(do.var) {
          ddesc[i, index] <- cur.var
          index <- index + 1
        }
        if(!is.null(alags)) {
          for(k in 1:length(alags)) {
            ddesc[i, index] <- autocorrelation(cur.desc, cur.mean, cur.var, lag=alags[k])
            index <- index + 1
          }
        }
        if(do.position){
          for(k in 1:ncol(dseqs)) {
            ddesc[i, index] <- cur.desc[k]
            index <- index + 1
          }
        }
      }
      if(do.counts) {
        for(k in 1:length(alphabet)) {
          ddesc[i, index] <- sum(dseqs[i,] == k)
          index <- index + 1
        }
      }
    }
    cat("\n")

    #Calculate estimated p-values, the amount of overlap between the two distributions using Mann-Whitney test
    index <- 1
    for(i in 1:ncol(ddesc)) {
      if(desc.var[i] != 0) {
        x <- ddesc[,i]
        y <- desc[,i]
        p.value <- wilcox.test(x,y)$p.value
        desc@pvalues[index] <- p.value
        index <- index + 1
      }
    }    
  }
  
  
  return(desc)
}

#Calculate the autocorrelation of the given function on the data with the given lag
autocorrelation <- function(data,ef=mean(data), v=var(data), lag=1) {

  if(lag >= length(data)) {
    return(0)
  }

  if(v == 0) {
    return(0)
  }

  numer <- 0
  for(i in 1:(length(data) - lag)) {
    numer <- numer + (data[i] - ef) * (data[i + lag] - ef)
  }
  numer = numer / (length(data) - lag)

  return(numer / v)
  
}

                                    
factory.descriptor <- function(data) {

  d <- new("Descriptors", data.frame(data@.Data), row.names=rownames(data), names=colnames(data), response=numeric(0), pvalues=rep(0, ncol(data)))
  names(d@pvalues) <- colnames(data)
  return(d)
  
}

empty.df <- function(cnames, rnames, default=NA) {
  df<-data.frame(matrix(rep(default,length(cnames)*length(rnames)), nrow=length(rnames)))
  colnames(df)<-cnames
  rownames(df) <- rnames
  return(df)
}

empty.matrix <- function(cnames, rnames, default=NA) {
  df<-matrix(rep(default,length(cnames)*length(rnames)), nrow=length(rnames))
  colnames(df)<-cnames
  rownames(df) <- rnames
  return(df)
}


plot.Descriptors <- function(desc,...) {

  #correlation

  if(length(desc@response) > 0) {    

    desc.cor <- cor(desc, desc@response)
    hist(desc.cor, main="Histogram of descriptor/response correlations", col="gray30")


    desc.cor.sorted <- order(abs(desc.cor), decreasing=T)
    par(cex.axis=0.7)
    barplot(desc.cor[desc.cor.sorted[1:10]], names.arg=rownames(desc.cor)[desc.cor.sorted[1:10]], ylab="Correlation", col="gray30")
    par(cex.axis=1)
    
  }

  desc.pc <- prcomp(desc, scale=T, center=T, tol=0.01, retx=T)
  plot(desc.pc, main="Scree Plot", col="gray30")
  plot(desc.pc$x[,1], desc.pc$x[,2], xlab="PC1", ylab="PC1", main="Projection")
  biplot(desc.pc, main="Biplot")
  
}

setMethod("plot", "Descriptors", function(x, y, ...) plot.Descriptors(x, y, ...))

rbind.Sequences <- function(seq1, seq2) {
  seqs <- new("Sequences", rbind(seq1@.Data, seq2@.Data), alphabet=unique(c(seq1@alphabet, seq2@alphabet)))
  return(seqs)
}
setGeneric("rbind")
setMethod("rbind", "Sequences", function(...,deparse.level=1) rbind.Sequences(...))

decoys <- function(seqs, n=nrow(seqs)){
  choices <- seqs@alphabet
  decoys <- t(sapply(1:n, FUN=function(x) {sample(x=choices, size=ncol(seqs), replace=T)}))
  decoyNames <- apply(decoys, MARGIN=1, FUN=function(x) {paste(seqs@alphabet[x], collapse="")})
  rownames(decoys) <- decoyNames
  

  return(new("Sequences", decoys, alphabet=seqs@alphabet))
}

#This function will give the FP and FN values. Classes should be
#positive for being in the motif and negative for not.
residuals.MotifModel <- function(model, seqs=model@seqs, classes=rep(0,nrow(seqs)), threshold = 10**-3) {
  
  predictions <-predict(model, seqs)
  
  result <- list(FP=0, FN=0, accuracy=0)

  for(i in 1:length(predictions)) {
    if(predictions[i] > threshold && classes[i] < 0) result$FP <- result$FP + 1
    if(predictions[i] < threshold && classes[i] >= 0) result$FN <- result$FN + 1
  }

  if(sum(classes < 0 ) == 0) {
    result$accuracy = 1 - result$FN  / length(classes)
  }
  else {
  result$accuracy=sqrt((sum(classes >= 0) - result$FN) * (sum(classes < 0) - result$FP) / (sum(classes >= 0) * sum(classes < 0)))
   }
  
  return(result)
  
}

residuals.MotifModelSet <- function(model, seqs=model@seqs, classes=rep(0, nrow(seqs)), threshold = 0) {
  
  classes.hat <- classify(model, seqs, threshold)
  result <- list(FP=0, FN=0, ACCM=0)

  for(i in 1:length(classes.hat)) {
    if(classes.hat[i] >= 0 && classes[i] < 0) result$FP <- result$FP + 1
    if(classes.hat[i] < 0 && classes[i] >= 0) result$FN <- result$FN + 1
  }


  if(sum(classes < 0 ) == 0) {
    result$accuracy = 1 - result$FN  / length(classes)
  }
  else {
    result$accuracy=sqrt((sum(classes >= 0) - result$FN) * (sum(classes < 0) - result$FP) / (sum(classes >= 0) * sum(classes < 0)))
  }
  
  return(result)  
  
}

setGeneric("residuals")
setMethod("residuals", "MotifModel", function(object, ...) residuals.MotifModel(object, ...))
setMethod("residuals", "MotifModelSet", function(object,...) residuals.MotifModelSet(object,...))


FiveTwoCV.Sequences <- function(seqs, classes, motifNumber=1, motifModelType="fixed", width=3, verbose=F,...) {


  result <- list(FP=0, FN=0, accuracy=0)

  pclasses <- classes >= 0
  pseqs.data <- seqs@.Data[pclasses,]
  nclasses <- classes < 0
  nseqs.data <- seqs@.Data[nclasses,]
  
  
  for(i in 1:5) {

    train.p <- sample(nrow(pseqs.data), size=nrow(pseqs.data) / 2, replace=F)
    train.n <- sample(nrow(nseqs.data), size=nrow(nseqs.data) / 2, replace=F)
    validate.p <- setdiff(1:nrow(pseqs.data), train.p)
    validate.n <- setdiff(1:nrow(nseqs.data), train.n)

    tseqs <- new("Sequences", rbind(pseqs.data[train.p,], nseqs.data[train.n,]), alphabet=seqs@alphabet)
    vseqs <- new("Sequences", rbind(pseqs.data[validate.p,],nseqs.data[validate.n,]), alphabet=seqs@alphabet)
    
    mdl <- motifModelSet(tseqs, motifNumber, motifModelType, width, verbose=verbose)
    temp <- residuals(mdl, vseqs, classes=c(rep(1, length(validate.p)), rep(-1, length(validate.n))))
    result$FP <- temp$FP + result$FP
    result$FN <- temp$FN + result$FN
    result$accuracy <- temp$accuracy + result$accuracy

    mdl <- motifModelSet(vseqs, motifNumber, motifModelType, width, verbose=verbose)
    temp <- residuals(mdl, tseqs, classes=c(rep(1, length(train.p)), rep(-1, length(train.n))))
    result$FP <- temp$FP + result$FP
    result$FN <- temp$FN + result$FN
    result$accuracy <- temp$accuracy + result$accuracy
    
  }

  result$FP <- result$FP / (5 * nrow(nseqs.data))
  result$FN <- result$FN / (5 * nrow(pseqs.data))
  result$accuracy <- result$accuracy / 10
  
  return(result)
  
}

FiveTwoCV.Descriptors <- function(desc, resp=desc@response, modelFxn) {
  return(FiveTwoCV(desc@.data, resp, modelFxn))
}

FiveTwoCV.default <- function(data, resp, modelFxn) {

  
   rows <- 1:nrow(data)
   SMR <- 0
   SVR <- 0
  for(i in 1:5) {

    train <- sample(nrow(data), size=nrow(data)/2, replace=F)
    validate <- setdiff(rows,train)

    mdl <- modelFxn(formula=resp~., data=cbind(resp=resp[train],data[train,]))
    validate.yhat <- predict(object=mdl, newdata=data[validate,])
    SVR <- SVR + sum((resp[validate] - validate.yhat)^2)
    SMR <- SMR + sum((mean(resp[train]) - resp[validate])^2)
    
    mdl <- modelFxn(resp~., data=cbind(resp=resp[validate],data[validate,]))
    train.yhat <- predict(object=mdl, newdata=data[train,])
    SVR <- SVR + sum((resp[train] - train.yhat)^2)
    SMR <- SMR + sum((mean(resp[validate]) - resp[train])^2)
  }

  return(1 - SVR/SMR)
}


setGeneric("FiveTwoCV", def=function(data,...) standardGeneric("FiveTwoCV"), useAsDefault=function(data, ...) FiveTwoCV.default(data,...))
setMethod("FiveTwoCV", signature="Sequences", function(data, classes, ...) FiveTwoCV.Sequences(data, classes, ...))
setMethod("FiveTwoCV", signature="Descriptors", function(data, ...) FiveTwoCV.Descriptors(data, ...))


modelStep <- function(desc, modelFxn, fitness=FiveTwoCV.default, resp=desc@response, cols.start=NA, max=(nrow(desc) - 6) / 3, hypervalidateNum=5, verbose=TRUE){

  if(is.na(cols.start)) {
    cols.start <- colnames(desc)[order(abs(cor(desc, resp)), decreasing=T)[1]]
  }

  if(verbose) {
    cat(paste("Performing stepwise regression starting at", cols.start, "and including up to", max,"descriptors\n"))
  }
  
  desc.hyperval.index <- sample(nrow(desc),size=hypervalidateNum)
  dsub <- desc[-desc.hyperval.index,]

  f.last <- 0
  f.cur <-  0

  h.last <- 0
  h.cur <-  0

  cols <- cols.start

  while(f.cur >= f.last && length(cols) < max && h.cur >= h.last * 0.75) {
    f.last <- f.cur
    h.last <- h.cur

    #attempt to add each column and check the fitness
    cols.toadd <- NULL

    for(i in 1:(ncol(desc))) {
      cols.prop <- colnames(desc[i])

      if(sum(cols == cols.prop)== 0) {
        f.temp <- fitness(dsub[,c(cols,cols.prop)], resp[-desc.hyperval.index], modelFxn)
	if(!(is.na(f.temp) || is.null(f.temp)) && (f.temp > f.cur || f.cur == 0)){
	  f.cur <- f.temp
	  cols.toadd <- cols.prop
        }
      }
    }

    cols <- c(cols,cols.toadd)

    df <- data.frame(resp=resp[-desc.hyperval.index],dsub[,cols])

    colnames(df) <- c("resp", cols)

    mdl <- modelFxn(resp~., data=df)
    yhat <- predict(mdl,desc[desc.hyperval.index,])
    SVR <- sum((resp[desc.hyperval.index] - yhat)^2)
    SMR <- sum((mean(resp[-desc.hyperval.index]) - resp[desc.hyperval.index])^2)    
    h.cur <- SVR/SMR

    if(verbose) {
      cat(paste("Current descriptors:",paste(cols, collapse=" "), "\n"))
      cat(paste("Fitness:", f.cur, "\n"))
    }
  }

    return(cols)

}

ROC <- function(model, seqs, classes, thresholdRange=NA) {

  npos <- sum(classes >= 0)
  nneg <- sum(classes < 0)
  
  result <- data.frame(FP=0:nneg, FN=rep(NA,nneg+1))

  if(is.na(thresholdRange)) {
    preds <- predict(model, seqs)
    thresholdRange <- seq(min(preds), max(preds), (max(preds) - min(preds)) / 100)
  }
  
  print("Building ROC curve...")
  for(i in 1:length(thresholdRange)) {
    cat(paste("\r", format(100 * i/length(thresholdRange), width=4, digits=3), "%"))
    rs <- residuals(model, seqs, classes, threshold=thresholdRange[i])
    fp <- rs$FP
    fn <- rs$FN
    if(is.na(result[result[,1] == fp,2]) ||  result[result[,1] == fp,2] > fn) {
      result[result[,1] == fp, 2] = fn
    }
  }

  result <- result[!is.na(result[,2]),]

  result[,1] <- result[,1] / nneg
  result[,2] <- 1 - result[,2] / npos

  plot(result, xlab="False Positive Rate", ylab="True Positive Rate", type="l", col="red", xlim=c(0,1), ylim=c(0,1))
  lines(seq(0,1,0.1), seq(0,1,0.1), lty=2)
  
  return(result)
}

Try the peplib package in your browser

Any scripts or data that you put into this service are public.

peplib documentation built on May 29, 2017, 10:52 p.m.