Nothing
#SUPER IMPORTANT NOTE: These methods are fairly general, but they assume the last character in the alphabet is a gap character.
setClass("Sequences", representation(alphabet="character", nseqs="numeric"), contains="matrix")
setClass("Descriptors", representation(response="numeric", pvalues="numeric"), contains="data.frame")
setClass("MetricParams", representation(smatrix="matrix", gapOpen="numeric", gapExtension="numeric"))
setClass("MotifModel", representation(mmodel="matrix", bmodel="numeric", width="integer", seqs="Sequences", np="integer"))
setClass("MotifModelSet", representation(motifs="list"))
setClass("OOPS", representation(zmatrix="matrix"), contains="MotifModel")
setClass("SSOOPS", representation(zvector="numeric"), contains="MotifModel")
setClass("ZOOPS", representation(zmatrix="matrix", gamma="numeric", qarray="numeric"), contains="MotifModel")
bs85 <- matrix(c(5, -2, -2, -2, -1, -1, -1, 0, -2, -2, -2, -1, -2, -3, -1, 1, 0, -3, -3, -1, -2, -1, -1, -6,
-2, 6, -1, -2, -4, 1, -1, -3, 0, -4, -3, 2, -2, -4, -2, -1, -2, -4, -3, -3, -2, 0, -2, -6,
-2, -1, 7, 1, -4, 0, -1, -1, 0, -4, -4, 0, -3, -4, -3, 0, 0, -5, -3, -4, 4, -1, -2, -6,
-2, -2, 1, 7, -5, -1, 1, -2, -2, -5, -5, -1, -4, -4, -2, -1, -2, -6, -4, -4, 4, 1, -2, -6,
-1, -4, -4, -5, 9, -4, -5, -4, -5, -2, -2, -4, -2, -3, -4, -2, -2, -4, -3, -1, -4, -5, -3, -6,
-1, 1, 0, -1, -4, 6, 2, -3, 1, -4, -3, 1, 0, -4, -2, -1, -1, -3, -2, -3, -1, 4, -1, -6,
-1, -1, -1, 1, -5, 2, 6, -3, -1, -4, -4, 0, -3, -4, -2, -1, -1, -4, -4, -3, 0, 4, -1, -6,
0, -3, -1, -2, -4, -3, -3, 6, -3, -5, -5, -2, -4, -4, -3, -1, -2, -4, -5, -4, -1, -3, -2, -6,
-2, 0, 0, -2, -5, 1, -1, -3, 8, -4, -3, -1, -3, -2, -3, -1, -2, -3, 2, -4, -1, 0, -2, -6,
-2, -4, -4, -5, -2, -4, -4, -5, -4, 5, 1, -3, 1, -1, -4, -3, -1, -3, -2, 3, -5, -4, -2, -6,
-2, -3, -4, -5, -2, -3, -4, -5, -3, 1, 4, -3, 2, 0, -4, -3, -2, -3, -2, 0, -5, -4, -2, -6,
-1, 2, 0, -1, -4, 1, 0, -2, -1, -3, -3, 6, -2, -4, -2, -1, -1, -5, -3, -3, -1, 1, -1, -6,
-2, -2, -3, -4, -2, 0, -3, -4, -3, 1, 2, -2, 7, -1, -3, -2, -1, -2, -2, 0, -4, -2, -1, -6,
-3, -4, -4, -4, -3, -4, -4, -4, -2, -1, 0, -4, -1, 7, -4, -3, -3, 0, 3, -1, -4, -4, -2, -6,
-1, -2, -3, -2, -4, -2, -2, -3, -3, -4, -4, -2, -3, -4, 8, -1, -2, -5, -4, -3, -3, -2, -2, -6,
1, -1, 0, -1, -2, -1, -1, -1, -1, -3, -3, -1, -2, -3, -1, 5, 1, -4, -2, -2, 0, -1, -1, -6,
0, -2, 0, -2, -2, -1, -1, -2, -2, -1, -2, -1, -1, -3, -2, 1, 5, -4, -2, 0, -1, -1, -1, -6,
-3, -4, -5, -6, -4, -3, -4, -4, -3, -3, -3, -5, -2, 0, -5, -4, -4, 11, 2, -3, -5, -4, -3, -6,
-3, -3, -3, -4, -3, -2, -4, -5, 2, -2, -2, -3, -2, 3, -4, -2, -2, 2, 7, -2, -4, -3, -2, -6,
-1, -3, -4, -4, -1, -3, -3, -4, -4, 3, 0, -3, 0, -1, -3, -2, 0, -3, -2, 5, -4, -3, -1, -6,
-2, -2, 4, 4, -4, -1, 0, -1, -1, -5, -5, -1, -4, -4, -3, 0, -1, -5, -4, -4, 4, 0, -2, -6,
-1, 0, -1, 1, -5, 4, 4, -3, 0, -4, -4, 1, -2, -4, -2, -1, -1, -4, -3, -3, 0, 4, -1, -6,
-1, -2, -2, -2, -3, -1, -1, -2, -2, -2, -2, -1, -1, -2, -2, -1, -1, -3, -2, -1, -2, -1, -2, -6,
-6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, -6, 1),
nrow=24, ncol=24)
aabet <- c("A", "R", "N", "D", "C", "Q", "E", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V", "B", "Z", "X", "-")
colnames(bs85) <- aabet
rownames(bs85) <- aabet
default.MetricParams <- new("MetricParams", smatrix=bs85, gapOpen=-10, gapExtension=-0.2)
sdist <- function(s1, s2, params=default.MetricParams) {
distance <- rep(NA, length(s1))
i1 = "-"
i2 = "-"
for(i in 1:length(s1)) {
#deal with differeing lengths
if(s1[i] == "-") {
i1 = "-"
} else {
i1 = s1[i]
}
if(i > length(s2) || s2[i] == "-") {
i2 = "-"
} else {
i2 = s2[i]
}
#gaps
if(i1 == "-" && i2 != "-") {
if(i > 1 && s1[i - 1] == "-") {
distance[i] = params@gapExtension
}
else {
distance[i] = params@gapOpen
}
}
else if(i1 != "-" && i2 == "-") {
if(i > 1 && s2[i - 1] == "-") {
distance[i] = params@gapExtension
}
else{
distance[i] = params@gapOpen
}
}
else {
distance[i] = params@smatrix[i1,i2]
}
}
s = sum(distance)
return(s)
}
sHammingDist <- function(s1, s2, params) {
distance <- sum(sapply(1:min(length(s1),length(s2)), FUN=function(x) {as.double(s1[x] != s2[x])}))
distance <- distance + abs(length(s1) - length(s2))
return(as.double(distance))
}
dist.Sequences <- function(seqs, method="substitution", params=default.MetricParams, ...) {
seqs <- seqs@.Data
if(method == "substitution" || method=="euclidian") {
dist <- sdist
}
if(method == "hamming") {
dist <- sHammingDist
}
dvec <- vector(mode="numeric", length=nrow(seqs) * (nrow(seqs) - 1) / 2)
for(i in 1:(nrow(seqs) - 1)){
for(j in (i + 1):nrow(seqs)) {
dvec[nrow(seqs) * (i - 1) - i * (i - 1) / 2 + j - i] <- dist(seqs[i,], seqs[j,], params)
}
}
#Make it a dissimilarity matrix
dvec <- max(dvec) - dvec
attr(dvec, "Size") <- nrow(seqs)
attr(dvec, "Labels") <- rownames(seqs)
attr(dvec, "Diag") <- FALSE
attr(dvec, "Upper") <- FALSE
attr(dvec, "method") <- method
attr(dvec, "call") <- match.call()
class(dvec) <- "dist"
return(dvec)
}
setGeneric("dist")
setMethod("dist", "Sequences", function(x, method="euclidian", diag=F, upper=F, p=2) dist.Sequences(x, method))
setMethod("[", signature=c("Sequences"), definition=function(x, i, j, ..., drop) {
if(missing(j)) {
new("Sequences", x@.Data[i,], alphabet=x@alphabet, nseqs=length(i))
} else if(missing(i)) {
new("Sequences", x@.Data[,j], alphabet=x@alphabet, nseqs=0)
} else {
new("Sequences", x@.Data[i,j], alphabet=x@alphabet, nseqs=0)
}
} )
wdist <- function(seqmatrix, sweights=NULL, dist=sdist, params=default.MetricParams) {
if(is.null(sweights)){
sweights <- createSWeights(seqmatrix)
}
dmatrix <- sdist(seqmatrix, dist, params)
dmatrix <- dmatrix * (sweights %*% t(sweights))
return(dmatrix)
}
explode <- function(string, max, alphabet) {
result <- strsplit(string, split="")[[1]]
while(length(result) < max) {
result <- c(result,"-")
}
result <- sapply(result, FUN=function(x){if(!(x %in% alphabet)) {"B"} else {x}})
return(result)
}
processError <- function(e, mymessage) {
cat(paste(mymessage, "\n"))
}
read.sequences <- function(file, header = FALSE, sep = "", quote="\"", dec=".",
fill = FALSE, comment.char="", alphabet=aabet) {
#assumes that the first column is the sequence information
#Try loading the file
tryCatch(data <- read.table(file, header=header, sep=sep, quote=quote, dec=dec, fill=fill, comment.char=comment.char), error=function(e) {processError(e, "Could not read file")})
#Ok, continue to spread out sequences
t <- tryCatch(data <- toupper(as.character(data[,1])), error=function(e) {processError(e, "Non-letters in sequence file")})
if(inherits(t, "try-error")) return
nseqs <- length(unique(data))
t <- tryCatch(maxlength <- max(sapply(data, FUN=function(x) {length(unlist(strsplit(x, split="")))})), error=function(e) {processError(e, "Failed to split sequence into characters")})
if(inherits(t, "try-error")) return
t <- tryCatch( seqmatrix <- matrix(sapply(data, FUN=function(x) {explode(x, maxlength, alphabet)}), nrow=length(data), byrow=TRUE), error=function(e) {processError(e, "Failed to process non-cannonical amino acids")})
if(inherits(t, "try-error")) return
t <- tryCatch(seqmatrix <- apply(seqmatrix, MARGIN=2, FUN=function(x) {sapply(x, FUN=function(y) {which(alphabet == as.character(y))})}), error=function(e) {processError(e, "Failed to convert to numerical representation")})
if(inherits(t, "try-error")) return
rnames <- apply(seqmatrix, MARGIN=1, FUN=function(x) {paste(alphabet[x], collapse="")})
for(i in 1:(length(rnames) - 1)) {
if(rnames[i] %in% rnames[(i + 1):length(rnames)]) {
rnames[i] <- paste(rnames[i], ".", sum(rnames[i] == rnames[(i + 1):length(rnames)]), sep="")
}
}
rownames(seqmatrix) <- rnames
seqs <- new("Sequences", seqmatrix, alphabet=alphabet, nseqs=nseqs)
return(seqs)
}
read.fasta <- function(file, header = FALSE, sep = "", quote="\"", dec=".",
fill = FALSE, alphabet=aabet) {
return(read.sequences(file, header, sep, quote, dec, fill, comment.char=">", alphabet))
}
#this will allow the writing of sequences to a file, utilizing R's
#built in write.table method If you pass a motifModel along with the
#sequences, then it will align and output the motifs from the
#sequences.
write.sequences <- function(seqs, motifModel = NULL, file = "", append = FALSE) {
if(!is.null(motifModel)) {
outSeqs <- matrix(0, ncol=motifModel@width, nrow=nrow(seqs))
if(class(motifModel) == "SSOOPS") {
startPos <- which.max(motifModel@zvector)
outSeqs <- seqs[, startPos:(startPos + motifModel@width - 1)]
} else {
for(i in 1:nrow(seqs)) {
startPos <- which.max(motifModel@zmatrix[i, ])
outSeqs[i,] <- seqs[i, startPos:(startPos + motifModel@width)]
}
}
seqs <- outSeqs
}
output <- apply(seqs, MARGIN=1, FUN=function(x) {paste(seqs@alphabet[x], collapse="")})
write.table(output, file=file, append=append, row.names=FALSE, col.names=FALSE)
}
write.fasta <- function(seqs, motifModel = NULL, file = "", eol = "\n") {
if(!is.null(motifModel)) {
outSeqs <- matrix(0, ncol=motifModel@width, nrow=nrow(seqs))
if(class(motifModel) == "SSOOPS") {
startPos <- which.max(motifModel@zvector)
outSeqs <- seqs[, startPos:(startPos + motifModel@width - 1)]
} else {
for(i in 1:nrow(seqs)) {
startPos <- which.max(motifModel@zmatrix[i, ])
outSeqs[i,] <- seqs[i, startPos:(startPos + motifModel@width)]
}
}
seqs <- outSeqs
}
output <- apply(seqs, MARGIN=1, FUN=function(x) {paste(seqs@alphabet[x], collapse="")})
for(i in 1:length(output)) {
write(paste("> Sequence", i), file=file, append=T)
write(paste(output[i]), file=file, append=T)
}
}
plot.Sequences <- function(seqs, clusterNumber=3, params=default.MetricParams, distanceMatrix=dist.Sequences(seqs, params=params), clusters=aclust(distanceMatrix, clusterNumber), legendText=c(), main="") {
#This makes it so that the coloring follows the size of the clusters. Makes plots reproducible since the
#cluster indices from kmeans are not reproducible.
idmap <- order(as.integer(lapply(clusters, length)))
colors <- rep("black", nrow(seqs))
shades <- hcl(h=1:clusterNumber * (360 / clusterNumber), c=90, l=70 )
for(i in 1:length(clusters)) {
colors[clusters[[idmap[i]]]] <- shades[i]
}
# fit <- cmdscale(distanceMatrix, eig=T, k=2)
fit <- prcomp(distanceMatrix)
x <- fit$x[,1]
y <- fit$x[,2]
plot(x,y,xlab="Component 1", ylab="Component 2", col=colors, main=main)
if(length(legendText) > 0) {
legend(max(x), max(y) * 1.25, legendText, col=shades,text.col="black", pch=rep(1, length(legendText)), xpd=TRUE)
}
}
setGeneric("plot")
setMethod("plot", "Sequences", function(x,y,...) plot.Sequences(x, ...))
createSWeights <- function(seqmatrix, params=default.MetricParams) {
wmatrix <- matrix(rep(0,nrow(params@smatrix) * ncol(seqmatrix)), nrow=nrow(params@smatrix))
rownames(wmatrix) <- rownames(params@smatrix)
tables <- apply(seqmatrix, MARGIN=2, FUN=function(x) table(x))
for(i in 1:ncol(wmatrix)) {
for(j in 1:length(tables[[i]])) {
wmatrix[names(tables[[i]])[j], i] <- 1. / (length(tables[[i]]) * tables[[i]][j])
}
}
sweights <- rep(0,nrow(seqmatrix))
for(i in 1:nrow(seqmatrix)) {
for(j in 1:ncol(seqmatrix)) {
sweights[i] <- sweights[i] + wmatrix[seqmatrix[i,j], j]
}
}
return(sweights)
}
motifModelSet <- function(seqs, motifNumber=NA, type="fixed", width=4, verbose=T, clusterType="kmeans", maxGuess=8, plotMain="") {
#can't have a motif wider than the sequences
if(width > ncol(seqs)) {
width <- ncol(seqs)
}
#check if the number of sequences is accurate
if(seqs@nseqs == 0) {
seqs@nseqs <- length(unique(apply(seqs@.Data, MARGIN=1, FUN=function(x) {paste(x,collapse="")})))
}
if(is.na(motifNumber)) {
cat("Guessing cluster number, this could take a while...\n")
#can't have more clusters than unique sequences
if(maxGuess > seqs@nseqs) {
maxGuess <- seqs@nseqs
}
ll <- data.frame(clusterN=1:maxGuess, logLik=rep(0, maxGuess))
for(i in 1:maxGuess) {
cat(paste("\r", i, "/", maxGuess))
ll$logLik[i] <- logLik(motifModelSet(seqs, i, type, width, verbose, clusterType))
}
cat("\n")
plot(ll, xlab="Cluster Number", col="black", main=plotMain)
lines(ll, col="black", lty=1)
motifNumber <- ll$clusterN[which.min(ll$logLik)]
}else if(motifNumber == 1) {
return(new("MotifModelSet", motifs=list(motifModel(seqs, type, width))))
}
if(motifNumber > seqs@nseqs) {
cat(paste("Warning: must have more unique sequences than motifs. Lowering motif number to", seqs@nseqs))
motifNumber <- seqs@nseqs
}
clusters <- aclust(dist(seqs), clusterNumber=motifNumber, verbose=verbose, type=clusterType)
seqList <- vector(mode="list", length=motifNumber)
for(i in 1:motifNumber) {
seqList[[i]] <- new("Sequences", seqs[clusters[[i]],], alphabet=seqs@alphabet)
}
result <- lapply(seqList, FUN=function(x) { motifModel(x, type, width) })
mset <- new("MotifModelSet", motifs=result)
return(mset)
}
motifModel <- function(seqs, type="fixed", width=4) {
if(width > ncol(seqs)) {
width <- ncol(seqs)
}
#check if the number of sequences is accurate
if(seqs@nseqs == 0) {
seqs@nseqs <- length(unique(apply(seqs@.Data, MARGIN=1, FUN=function(x) {paste(x,collapse="")})))
}
if(class(seqs) == "matrix") {
seqs <- new("Sequences", seqs, alphabet=aabet, nseqs=seqs@nseqs)
}
if(type == "fixed") {
model <- factory.SSOOPS(seqs, width=width)
}
if(type == "variable") {
model <- factory.OOPS(seqs, width=width)
}
if(type == "optional") {
model <- factory.ZOOPS(seqs, width=width)
}
convergence <- 10
while(convergence > 10**-3) {
temp <- EMStep(model,seqs)
convergence <- sum((model@mmodel - temp@mmodel)**2 / (model@mmodel)**2)
model <- temp
}
return(model)
}
predict.MotifModel <- function(object, newSequences=object@seqs, ...) {
logodds <- matrix(object@mmodel, ncol=object@width)
for(i in 1:nrow(logodds)) {
logodds[i,] <- log(logodds[i,] / object@bmodel[i])
}
result <- rep(NA, nrow(newSequences))
for(i in 1:length(result)) {
max <- -10**200
for(j in 1:(ncol(newSequences) - object@width)) {
lsum <- 0
if((ncol(newSequences) - object@width) == 0) {
for(k in 1:ncol(newSequences)) {
if(newSequences[i,k] < length(newSequences@alphabet)) {
lsum <- lsum + logodds[newSequences[i,k], k]
}
else {
lsum <- max - 1
break
}
}
}
else {
for(k in j:(j + object@width - 1)) {
if(newSequences[i,k] < length(newSequences@alphabet)) {
lsum <- lsum + logodds[newSequences[i,k],k - j + 1]
}
else {
lsum <- max - 1
break
}
}
}
if(lsum > max) {
max <- lsum
}
}
result[i] <- max
}
return(result)
}
predict.MotifModelSet <- function(model, newSequences) {
result <- rep(NA, nrow(newSequences))
prds <- matrix(rep(NA, length(model@motifs) * nrow(newSequences)), nrow=nrow(newSequences))
for(i in 1:length(model@motifs)) {
prds[,i] <- predict(model@motifs[[i]], newSequences)
}
for(i in 1:nrow(newSequences)) {
result[i] <- max(prds[i,])
}
return(result)
}
classify.MotifModelSet <- function(motifSet, newSequences, threshold = 0) {
predictions <- lapply(motifSet@motifs, FUN=function(x) {predict(x, newSequences)})
pmatrix <- matrix(rep(NA, nrow(newSequences) * length(motifSet@motifs)), nrow=nrow(newSequences))
for(i in 1:length(motifSet@motifs)){
pmatrix[,i] <- predictions[[i]]
}
classes <- apply(pmatrix, MARGIN=1, FUN=function(x) {
if(is.na(threshold) || max(x) > threshold) which.max(x)
else -1
})
return(classes)
}
setGeneric("classify", function(model, ...) standardGeneric("classify"))
setMethod("classify", "MotifModelSet", function(model,...) classify.MotifModelSet(model,...))
setGeneric("predict")
setMethod("predict", "MotifModel", function(object,...) predict.MotifModel(object,... ))
setMethod("predict", "MotifModelSet", function(object,...) predict.MotifModelSet(object,... ))
EM.SSOOPS.Linked <- function(fullSeqMatrix, seqids, models) {
bcounts <- rep(1, length(models[[1]]@bmodel))
bsum <- 0
for(mindex in 1:length(models)) {
model <- models[[mindex]]
seqmatrix <- fullSeqMatrix[seqids[[mindex]],]@.Data
m <- (ncol(seqmatrix) - model@width + 1)
n <- nrow(seqmatrix)
ztrial <- rep(1., m)
mcounts <- matrix(rep(0,length(model@mmodel)), dim(model@mmodel))
names(bcounts) <- aabet
rownames(mcounts) <- aabet
msums <- rep(0, ncol(mcounts))
psum <- 0
for(i in 1:n) {
for(j in 1:m) {
for(k in 1:ncol(seqmatrix)) {
if(k >= j && k < j + model@width) {
ztrial[j] <- ztrial[j] * model@mmodel[seqmatrix[i,k],k - j + 1]
}
else {
ztrial[j] <- ztrial[j] * model@bmodel[ seqmatrix[i,k] ]
}
}
}
if(sum(ztrial) > 0.) {
ztrial <- ztrial / sum(ztrial)
}
}
for(i in 1:n) {
for(j in 1:m) {
for(k in 1:ncol(seqmatrix)) {
if(k >= j && k < j + model@width) {
mcounts[seqmatrix[i,k],k - j + 1 ] <- mcounts[ seqmatrix[i,k], k - j + 1] + ztrial[j]
msums[k - j + 1] <- msums[k - j + 1] + ztrial[j]
} else {
bcounts[seqmatrix[i,k]] <- bcounts[seqmatrix[i,k]] + 1. - ztrial[j]
bsum <- bsum + 1. - ztrial[j]
}
}
}
}
for(i in 1:nrow(model@mmodel)) {
for(j in 1:ncol(model@mmodel)) {
models[[mindex]]@mmodel[i, j] <- mcounts[i,j] / msums[j]
}
}
models[[mindex]]@zvector <- ztrial
}
#I don't think this is the correct background model, since each amino acid is represented equally in the peptide library.
# for(i in 1:length(models)) {
# for(j in 1:length(models[[i]]@bmodel)) {
# models[[i]]@bmodel[j] <- bcounts[j] / bsum
# }
# }
return(models)
}
EMStep.SSOOPS <- function(model, seqs) {
alphabet <- seqs@alphabet
seqmatrix <- seqs@.Data
m <- (ncol(seqmatrix) - model@width + 1)
n <- nrow(seqmatrix)
pseudocount <- length(alphabet)
if(pseudocount > nrow(seqmatrix)) {
pseudocount <- nrow(seqmatrix)
}
ztrial <- rep(1., m)
bcounts <- rep(pseudocount / length(alphabet),length(model@bmodel))
mcounts <- matrix(rep(pseudocount / length(alphabet),length(model@mmodel)), dim(model@mmodel))
names(bcounts) <- alphabet
rownames(mcounts) <- alphabet
bsum <- pseudocount
msums <- rep(pseudocount, ncol(mcounts))
psum <- 0
for(i in 1:n) {
for(j in 1:m) {
for(k in 1:ncol(seqmatrix)) {
if(k >= j && k < j + model@width && seqmatrix[i,k] <= nrow(mcounts)) {
ztrial[j] <- ztrial[j] * model@mmodel[seqmatrix[i,k],k - j + 1]
}
else {
ztrial[j] <- ztrial[j] * model@bmodel[ seqmatrix[i,k] ]
}
}
}
ztrial <- ztrial / sum(ztrial)
}
for(i in 1:n) {
for(j in 1:m) {
for(k in 1:ncol(seqmatrix)) {
if(k >= j && k < j + model@width) {
if(seqmatrix[i,k] <= nrow(mcounts)) {
mcounts[seqmatrix[i,k],k - j + 1 ] <- mcounts[ seqmatrix[i,k], k - j + 1] + ztrial[j]
msums[k - j + 1] <- msums[k - j + 1] + ztrial[j]
}
} else {
bcounts[seqmatrix[i,k]] <- bcounts[seqmatrix[i,k]] + 1. - ztrial[j]
bsum <- bsum + 1. - ztrial[j]
}
}
}
}
#I don't think this is the correct background model, since each amino acid is represented equally in the peptide library.
# for(i in 1:length(model@bmodel)) {
# model@bmodel[i] <- bcounts[i] / bsum
# }
for(i in 1:nrow(model@mmodel)) {
for(j in 1:ncol(model@mmodel)) {
model@mmodel[i, j] <- mcounts[i,j] / msums[j]
}
}
model@zvector <- ztrial
return(model)
}
EMStep.OOPS <- function(model, seqs) {
alphabet <- seqs@alphabet
seqmatrix <- seqs@.Data
m <- (ncol(seqmatrix) - model@width + 1)
n <- nrow(seqmatrix)
pseudocount <- length(alphabet)
if(pseudocount > nrow(seqmatrix)) {
pseudocount <- nrow(seqmatrix)
}
ztrial <- matrix(rep(1, m * n), nrow=n, ncol=m)
bcounts <- rep(pseudocount / length(alphabet),length(model@bmodel))
mcounts <- matrix(rep(pseudocount / length(alphabet), length(model@mmodel)), dim(model@mmodel))
names(bcounts) <- alphabet
rownames(mcounts) <- alphabet
bsum <- pseudocount
msums <- rep(pseudocount, ncol(mcounts))
for(i in 1:n) {
psum <- 0
for(j in 1:m) {
for(k in 1:ncol(seqmatrix)) {
if(k >= j && k < j + model@width && seqmatrix[i,k] <= nrow(mcounts)) {
ztrial[i,j] <- ztrial[i,j] * model@mmodel[seqmatrix[i,k],k - j + 1]
}
else {
ztrial[i,j] <- ztrial[i,j] * model@bmodel[ seqmatrix[i,k] ]
}
}
psum <- psum + ztrial[i,j]
}
ztrial[i,] <- ztrial[i,] / psum
for(j in 1:m) {
for(k in 1:ncol(seqmatrix)) {
if(k >= j && k < j + model@width) {
if(seqmatrix[i,k] <= nrow(mcounts)) {
mcounts[seqmatrix[i,k],k - j + 1 ] <- mcounts[ seqmatrix[i,k], k - j + 1] + ztrial[i,j]
msums[k - j + 1] <- msums[k - j + 1] + ztrial[i,j]
}
} else {
bcounts[seqmatrix[i,k]] <- bcounts[seqmatrix[i,k]] + 1. - ztrial[i,j]
bsum <- bsum + 1. - ztrial[i,j]
}
}
}
}
#I don't think this is the correct background model, since each amino acid is represented equally in the peptide library.
# for(i in 1:length(model@bmodel)) {
# model@bmodel[i] <- bcounts[i] / bsum
# }
for(i in 1:nrow(model@mmodel)) {
for(j in 1:ncol(model@mmodel)) {
model@mmodel[i, j] <- mcounts[i,j] / msums[j]
}
}
model@zmatrix <- ztrial
return(model)
}
EMStep.ZOOPS <- function(model, seqs) {
alphabet <- seqs@alphabet
seqmatrix <- seqs@.Data
m <- (ncol(seqmatrix) - model@width + 1)
n <- nrow(seqmatrix)
pseudocount <- length(alphabet)
if(pseudocount > nrow(seqmatrix)) {
pseudocount <- nrow(seqmatrix)
}
ztrial <- matrix(rep(1, m * n), nrow=n, ncol=m)
bcounts <- rep(pseudocount / length(alphabet),length(model@bmodel))
mcounts <- matrix(rep(pseudocount / length(alphabet),length(model@mmodel)), dim(model@mmodel))
names(bcounts) <- alphabet
rownames(mcounts) <- alphabet
bsum <- pseudocount
msums <- rep(pseudocount, ncol(mcounts))
for(i in 1:n) {
psum = 0
noseq <- 1
#calcultate probability of no occurence of the motif
for(j in 1:ncol(seqmatrix)) {
noseq <- noseq * model@bmodel[ seqmatrix[i,j] ]
}
for(j in 1:m) {
for(k in 1:ncol(seqmatrix)) {
if(k >= j && k < j + model@width && seqmatrix[i,k] <= nrow(mcounts)) {
ztrial[i,j] <- ztrial[i,j] * model@mmodel[seqmatrix[i,k],k - j + 1]
}
else {
ztrial[i,j] <- ztrial[i,j] * model@bmodel[ seqmatrix[i,k] ]
}
}
psum <- psum + ztrial[i,j]
}
ztrial[i,] <- ztrial[i,] * model@gamma / (psum * model@gamma + noseq * m * (1 - model@gamma))
model@qarray[i] <- sum(ztrial[i,])
for(j in 1:m) {
for(k in 1:ncol(seqmatrix)) {
if(k >= j && k < j + model@width) {
if(seqmatrix[i,k] <= nrow(mcounts)) {
mcounts[seqmatrix[i,k],k - j + 1 ] <- mcounts[ seqmatrix[i,k], k - j + 1] + ztrial[i,j]
msums[k - j + 1] <- msums[k - j + 1] + ztrial[i,j]
}
} else {
bcounts[seqmatrix[i,k]] <- bcounts[seqmatrix[i,k]] + 1. - ztrial[i,j]
bsum <- bsum + 1. - ztrial[i,j]
}
}
}
}
#I don't think this is the correct background model, since each amino acid is represented equally in the peptide library.
# for(i in 1:length(model@bmodel)) {
# model@bmodel[i] <- bcounts[i] / bsum
# }
for(i in 1:nrow(model@mmodel)) {
for(j in 1:ncol(model@mmodel)) {
model@mmodel[i, j] <- mcounts[i,j] / msums[j]
}
}
model@zmatrix <- ztrial
model@gamma <- sum(model@qarray) / nrow(seqmatrix)
return(model)
}
setGeneric("EMStep", def = function(model, seqs,...) standardGeneric("EMStep"))
setMethod("EMStep", "OOPS", EMStep.OOPS)
setMethod("EMStep", "SSOOPS", EMStep.SSOOPS)
setMethod("EMStep", "ZOOPS", EMStep.ZOOPS)
logLik.OOPS <- function(model) {
logLik <- 0
for(i in 1:nrow(model@seqs)) {
pseq <- 0
for(j in 1:ncol(model@seqs)) {
if(j + model@width - 1 <= ncol(model@seqs)) {
pj <- 0
for(k in 1:model@width) {
pj <- pj + model@zmatrix[i,j] * log(model@mmodel[model@seqs@.Data[i,j + k - 1], k])
}
pseq <- pseq + pj
}#PUT BACKGROUND BACK IN
}
logLik <- logLik + pseq
}
names(logLik) <- NULL
return(logLik)
}
logLik.SSOOPS <- function(model) {
logLik <- 0
for(i in 1:nrow(model@seqs)) {
pseq <- 0
for(j in 1:ncol(model@seqs)) {
if(j + model@width - 1 <= ncol(model@seqs)) {
pj <- 0
for(k in 1:ncol(model@seqs)) {
if(k >= j && k < j + model@width) {
pj <- pj + model@zvector[j] * log(model@mmodel[model@seqs@.Data[i,k], k -j + 1])
}
else {
pj <- pj + model@zvector[j] * log(model@bmodel[model@seqs@.Data[i,k]])
}
}
pseq <- pseq + pj
}
}
logLik <- logLik + pseq
}
names(logLik) <- NULL
return(logLik)
}
logLik.ZOOPS <- function(model) {
logLik <- 0
for(i in 1:nrow(model@seqs)) {
pseq <- 0
#liklihood of sequence occuring
for(j in 1:ncol(model@seqs)) {
if(j + model@width - 1 <= ncol(model@seqs)) {
pj <- 0
for(k in 1:ncol(model@seqs)){
if(k >= j && k < j + model@width) {
pj <- pj + model@zmatrix[i,j] * log(model@mmodel[model@seqs@.Data[i,k], k - j + 1])
}
else {
pj <- pj + model@zmatrix[i,j] * log(model@bmodel[model@seqs@.Data[i,k]])
}
}
pseq <- pseq + pj
}
}
logLik <- logLik + pseq
#liklihood of peptide being background
for(j in 1:ncol(model@seqs)) {
logLik <- logLik + (1 - model@qarray[i]) * log(model@bmodel[model@seqs@.Data[i,k]])
}
}
names(logLik) <- NULL
return(logLik)
}
BIC.MotifModel <- function(object) {
motif <- object
logLik <- logLik(motif)
k <- k + motif@np
n <- n + motif@seqs
return(-2 * logLik + k * log(n))
}
BIC.MotifModelSet <- function(object) {
mset <- object
logLik <- 0
k <- 0 #Number of parameters
n <- 0 #sample size
for(i in 1:length(mset@motifs)) {
logLik <- logLik + logLik(mset@motifs[[i]])
k <- k + mset@motifs[[i]]@np
n <- n + nrow(mset@motifs[[i]]@seqs)
}
return(-2 * logLik + k * log(n))
}
logLik.MotifModelSet <- function(object) {
mset <- object
logLik <- 0
for(i in 1:length(mset@motifs)) {
logLik <- logLik + logLik(mset@motifs[[i]])
}
return(logLik)
}
setGeneric("logLik")
setMethod("logLik", "OOPS", function(object, ...) logLik.OOPS(object,...))
setMethod("logLik", "ZOOPS", function(object, ...) logLik.ZOOPS(object, ...))
setMethod("logLik", "SSOOPS", function(object, ...) logLik.SSOOPS(object, ...))
setMethod("logLik", "MotifModelSet", function(object, ...) logLik.MotifModelSet(object, ...))
setGeneric("BIC")
setMethod("BIC", "MotifModelSet", function(object) BIC.MotifModelSet(object))
setMethod("BIC", "MotifModel", function(object) BIC.MotifModel(object))
factory.OOPS <- function(seqs, width=ncol(seqs)) {
alphabet <- seqs@alphabet
model <- new("OOPS",
zmatrix=matrix(rep(c(1,rep(0,ncol(seqs) - width + 1)), nrow(seqs)), nrow=nrow(seqs)),
mmodel=matrix(rep(1.0/length(alphabet),length(alphabet) * width), ncol=width),
bmodel=rep(1.0/length(alphabet),length(alphabet)), width=as.integer(width),
seqs=seqs)
model@np <- length(model@zmatrix) + length(model@mmodel)
names(model@bmodel) <- alphabet
rownames(model@mmodel) <- alphabet
return(model)
}
factory.SSOOPS <- function(seqs, width=ncol(seqs)) {
alphabet <- seqs@alphabet
model <- new("SSOOPS",
zvector=rep(1./(ncol(seqs) - width + 1.), ncol(seqs) - width + 1),
mmodel=matrix(rep(1.0/length(alphabet),length(alphabet) * width), ncol=width),
bmodel=rep(1.0/length(alphabet),length(alphabet)),
width=as.integer(width),
seqs=seqs)
model@np <- length(model@zvector) + length(model@mmodel)
names(model@bmodel) <- alphabet
rownames(model@mmodel) <- alphabet
return(model)
}
factory.ZOOPS <- function(seqs, width=ncol(seqs)) {
alphabet <- seqs@alphabet
model <- new("ZOOPS",
zmatrix=matrix(rep(c(1,rep(0,ncol(seqs) - width + 1)), nrow(seqs)), nrow=nrow(seqs), byrow=T),
mmodel=matrix(rep(1.0/length(alphabet),length(alphabet) * width), ncol=width),
bmodel=rep(1.0/length(alphabet),length(alphabet)),
width=as.integer(width),
gamma=0.5,
qarray=rep(0.5,nrow(seqs)),
seqs=seqs)
model@np <- length(model@zmatrix) + length(model@mmodel) + length(model@qarray)
names(model@bmodel) <- alphabet
rownames(model@mmodel) <- alphabet
return(model)
}
print.Sequences <- function(seqs) {
}
print.MotifModel <- function(x,...) {
model <- x
#find where the motif begins
ncolz <- 0
if(class(model) == "SSOOPS") {
zsum <- model@zvector
ncolz <- length(model@zvector)
} else {
zsum <- apply(model@zmatrix, MARGIN=2, FUN=sum)
ncolz <- ncol(model@zmatrix)
}
start <- which.max(zsum)
mnum <- 4
cut <- 0.15
motifMode <- matrix(rep("", model@width * mnum), nrow=mnum)
for(i in 1:ncol(model@mmodel)) {
csort <- order(model@mmodel[,i], decreasing=T)
motifMode[1,i] <- rownames(model@mmodel)[csort[1]]
for(j in 2:mnum) {
if(model@mmodel[csort[j],i] > cut) {
motifMode[j,i] <- rownames(model@mmodel)[csort[j]]
}
}
}
motifModeStr <- apply(motifMode, MARGIN=2, FUN=function(x) {paste("[",paste(x,collapse=""),"]", sep="")})
seqMode <- c(rep("-", start - 1), motifModeStr, rep("-", ncolz - start ))
cat(paste("Number of sequences:", nrow(model@seqs)))
cat(paste("\nMotif Mode:", paste(seqMode, collapse="")))
cat("\n\nBackground:\n")
print(sort(model@bmodel, decreasing=T)[1:5])
}
MotifModel.motifString <- function(model) {
#Check for trivial case
if(nrow(model@seqs) == 1) {
return(model@seqs[1])
}
#find where the motif begins
ncolz <- 0
if(class(model) == "SSOOPS") {
zsum <- model@zvector
ncolz <- length(model@zvector)
} else {
zsum <- apply(model@zmatrix, MARGIN=2, FUN=sum)
ncolz <- ncol(model@zmatrix)
}
start <- which.max(zsum)
mnum <- 4
cut <- 0.15
motifMode <- matrix(rep("", model@width * mnum), nrow=mnum)
for(i in 1:ncol(model@mmodel)) {
csort <- order(model@mmodel[,i], decreasing=T)
motifMode[1,i] <- rownames(model@mmodel)[csort[1]]
for(j in 2:mnum) {
if(model@mmodel[csort[j],i] > cut) {
motifMode[j,i] <- rownames(model@mmodel)[csort[j]]
}
}
}
motifModeStr <- apply(motifMode, MARGIN=2, FUN=function(x) {paste("[",paste(x,collapse=""),"]", sep="")})
seqMode <- c(rep("-", start - 1), motifModeStr, rep("-", ncolz - start ))
return(paste(seqMode, collapse=""))
}
setGeneric("print")
setMethod("print", "MotifModel", function(x,...) print.MotifModel(x))
setGeneric("motifString", def = function(x,...) standardGeneric("motifString"))
setMethod("motifString", "MotifModel", function(x,...) MotifModel.motifString(x))
plot.MotifModelSet <- function(x,...) {
model <- x
clusterNumber <- length(model@motifs)
clusters <- vector("list", clusterNumber)
legendText <- vector("character", clusterNumber)
seqs.data <- matrix(0,nrow=sum(sapply(1:clusterNumber, function(x) nrow(model@motifs[[x]]@seqs))), ncol=ncol(model@motifs[[1]]@seqs))
counter <- 1
for(i in 1:clusterNumber) {
clusters[[i]] <- counter:(counter - 1 + nrow(model@motifs[[i]]@seqs))
counter <- counter + nrow(model@motifs[[i]]@seqs)
seqs.data[clusters[[i]],] <- model@motifs[[i]]@seqs@.Data
legendText[i] <- motifString(model@motifs[[i]])
}
seqs <- new("Sequences", seqs.data,alphabet=model@motifs[[1]]@seqs@alphabet)
colors <- rep("black", nrow(seqs))
shades <- hcl(h=1:clusterNumber * (360 / clusterNumber), c=90, l=70 )
for(i in 1:length(clusters)) {
colors[clusters[[i]]] <- shades[i]
}
fit <- prcomp(dist(seqs))
x <- fit$x[,1]
y <- fit$x[,2]
par(mar=c(5,4,2,5))
plot(x,y,xlab="Component 1", ylab="Component 2", col=colors)
legend(max(x) / 2, max(y) * 1.25, legendText, col=shades,text.col="black", pch=rep(1, clusterNumber), xpd=TRUE)
}
MotifModel.plotStartingPosition <- function(motifModel) {
par(fg="dark gray")
if(class(motifModel) == "SSOOPS") {
zsum <- motifModel@zvector
} else {
zsum <- apply(motifModel@zmatrix, MARGIN=2, FUN=sum)
}
barplot(zsum / sum(zsum), names.arg=as.character(1:length(zsum)), main="", col=hcl(h=1:length(zsum) * (360 / length(zsum))), border="black")
}
MotifModel.plotPositions <- function(motifModel) {
par(mfrow=c(ceiling(motifModel@width / 2),2), mar=c(3,3,2,2), cex=0.7)
for(i in 1:motifModel@width) {
barx <- barplot(motifModel@mmodel[,i], main=paste("Position", i), col=hcl(h=1:ncol(motifModel@seqs) * (360 / ncol(motifModel@seqs))), border=NA, xaxt="n", ylim=c(0,max(motifModel@mmodel[,i]) * 1.3))
#Find the highest three bars and label those
morder <- rev(order(motifModel@mmodel[,i]))[1:3]
text(barx[morder], motifModel@mmodel[morder,i], rownames(motifModel@mmodel)[morder],
pos=3, col="black")
}
par(mfrow=c(1,1))
}
MotifModel.plotFits <- function(motifModel) {
predictions <- sort(predict(motifModel), decreasing=T)
ncol <- 50
colsGood <- colorRampPalette(c("white", "green", "green"))(50)
colsBad <- colorRampPalette(c("red", "red", "white"))(50)
cols <- c()
if(sum(predictions > 0) > 0) {
cols <- c(colsGood[cut(predictions[predictions > 0], breaks=ncol, labels=F)])
}
if(sum(predictions < 0) > 0) {
cols <- c(cols, colsBad[cut(predictions[predictions < 0], breaks=ncol, labels=F)])
}
barplot(predictions, names.arg=NULL, main="", col=cols)
}
setGeneric("plotFits", function(motifModel) standardGeneric("plotFits"))
setMethod("plotFits", "MotifModel", MotifModel.plotFits)
setGeneric("plotStartingPosition", function(motifModel) standardGeneric("plotStartingPosition"))
setMethod("plotStartingPosition", "MotifModel", MotifModel.plotStartingPosition)
setGeneric("plotPositions", function(motifModel) standardGeneric("plotPositions"))
setMethod("plotPositions", "MotifModel", MotifModel.plotPositions)
setMethod("plot", "MotifModelSet", function(x, y, ...) plot.MotifModelSet(x,...))
plot.MotifModel <- function(x,...) {
if((class(x) == "SSOOPS" && x@width < length(x@zvector)) || (class(x) != "SSOOPS" && x@width < nrow(x@zmatrix))){
#plot motif start
if(class(x) == "SSOOPS") {
zsum <- x@zvector
} else {
zsum <- apply(x@zmatrix, MARGIN=2, FUN=sum)
}
barplot(zsum, names.arg=as.character(1:length(zsum)), main=paste("Motif Starting Position"), col="gray30")
}
par(mfrow=c(3,1), mar=c(3,3,2,2), ask=TRUE)
for(i in 1:x@width) {
barplot(x@mmodel[,i], main=paste("Motif Position", i), col="gray30")
}
par(mfrow=c(1,1))
predictions <- sort(predict(x), decreasing=T)
ncol <- 50
colsGood <- colorRampPalette(c("white", "green", "green"))(50)
colsBad <- colorRampPalette(c("red", "red", "white"))(50)
cols <- c()
if(sum(predictions > 0) > 0) {
cols <- c(colsGood[cut(predictions[predictions > 0], breaks=ncol, labels=F)])
}
if(sum(predictions < 0) > 0) {
cols <- c(cols, colsBad[cut(predictions[predictions < 0], breaks=ncol, labels=F)])
}
barplot(predictions, names.arg=NULL, main="Log Odds (Fit)", col=cols)
}
setMethod("plot", "MotifModel", function(x, y, ...) plot.MotifModel(x,...))
aclust <- function(sDistMatrix, clusterNumber, verbose=T, type="kmeans", knstart=20) {
if(class(sDistMatrix) == "Sequences") {
sDistMatrix <- dist(sDistMatrix)
}
if(type != "kmeans" && type != "agglomerative") {
stop("type must be either \"kmeans\" or \"agglomerative\"")
}
if(type == "kmeans") {
km <- kmeans(sDistMatrix, centers=clusterNumber, nstart=knstart)
result <- vector(mode="list", length=clusterNumber)
for(i in 1:clusterNumber) {
result[[i]] <- which(km$cluster == i)
}
return(result)
}
if(class(sDistMatrix) == "dist") {
N <- attr(sDistMatrix,"Size")
temp <- matrix(rep(NA, N ** 2), nrow=N, ncol=N)
for(i in 1:(nrow(temp) - 1)) {
for(j in (i + 1):ncol(temp)) {
temp[i,j] <- sDistMatrix[N * (i - 1) - i * (i - 1) / 2 + j - i]
temp[j,i] <- temp[i,j]
}
}
sDistMatrix <- temp
}
if(clusterNumber == 1) {
return(list(1:nrow(sDistMatrix)))
}
if(verbose) {
cat("Finding clusters...\n")
}
cn <- nrow(sDistMatrix)
#while cluster number is greater than clusterNumber
clusters <- as.list(1:nrow(sDistMatrix))
while(cn > clusterNumber && cn > 1) {
#Find minimum element in sDistMatrix
wmin <- findMinDistElement(sDistMatrix)
#find which cluster that element refers to
cA <- 0
cB <- 0
for(i in 1:length(clusters)) {
if(wmin[1] %in% clusters[[i]] || wmin[2] %in% clusters[[i]]) {
if(cA == 0) {
cA <- i
}else{
cB <- i
}
}
if(cA != 0 && cB != 0) {
break
}
}
#clobber rows/columns in sDistMatrix with minimum element
ca1 <- clusters[[cA]][1]
cb1 <- clusters[[cB]][1]
for(i in 1:ncol(sDistMatrix)) {
#is this a within cluster distance?
if(!(i %in% c(clusters[[cA]], clusters[[cB]]))) {
cmin <- cA
cmax <- cB
if(sDistMatrix[min(ca1,i), max(ca1,i)] < sDistMatrix[min(cb1,i), max(cb1,i)]) {
cmin <- cB
cmax <- cA
}
for(j in 1:length(clusters[[cmax]])) {
sDistMatrix[min(clusters[[cmax]][j], i), max(clusters[[cmax]][j], i)] <- sDistMatrix[min(clusters[[cmin]][1], i), max(clusters[[cmin]][1], i)]
}
}
}
#set within cluster distance to NA
for(i in 1:length(clusters[[cA]])) {
for(j in 1:length(clusters[[cB]])) {
if(clusters[[cA]][i] > clusters[[cB]][j]) {
sDistMatrix[clusters[[cB]][j],clusters[[cA]][i]] <- NA
} else {
sDistMatrix[clusters[[cA]][i],clusters[[cB]][j]] <- NA
}
}
}
#join two ID sets
clusters[[cA]] <- c(clusters[[cA]], clusters[[cB]])
clusters <- clusters[-cB]
cn <- length(clusters)
if(verbose) {
cat("\r")
cat(paste(cn, " "))
}
}
return(clusters)
}
changeClusterFormat <- function(clusterList) {
seqnumber <- 0
for(i in 1:length(clusterList)) {
seqnumber <- seqnumber + length(clusterList[[i]])
}
clusters <- rep(0,seqnumber)
for(i in 1:seqnumber) {
for(j in 1:length(clusterList)) {
if(i %in% clusterList[[j]]) {
clusters[i] = j
break
}
}
}
return(clusters)
}
findMinDistElement <- function(sDistMatrix) {
minimum <- NA
wmin <- c(0,0)
for(i in 1:(nrow(sDistMatrix) - 1)) {
for(j in (i + 1):ncol(sDistMatrix)) {
if(is.na(minimum) || (!is.na(sDistMatrix[i,j]) && sDistMatrix[i,j] < minimum)) {
wmin <- c(i,j)
minimum <- sDistMatrix[i,j]
}
}
}
return(wmin)
}
simpleDescriptors <- function(seqs, response=numeric(0), include.statistics=FALSE) {
desc <- descriptors(seqs, response, base.frame=defaultBaseMatrix[,c("count.BasicGroups", "count.AcidicGroups", "count.PolarGroups", "count.NonPolarGroups", "count.AromaticGroups", "count.ChargedGroups", "ALogP")], do.var=F,
alags=c(), do.counts=F, do.mean=T, do.position=F,
include.statistics=include.statistics, accuracy=0.001)
return(desc)
}
descriptors <- function(seqs, response=numeric(0), base.frame=NA, do.var=TRUE, alags=c(1,2,3), do.mean=TRUE, do.counts=FALSE, do.position=TRUE, alphabet=seqs@alphabet, include.statistics=TRUE, accuracy=0.01) {
if(include.statistics) {
if(ncol(seqs) >= 10) {
cat("Warning: the sequence space is very large for calculating statistics")
}
}
if(length(base.frame) == 1 && is.na(base.frame)) {
base.frame <- defaultBaseMatrix
}
base.matrix <- data.matrix(base.frame)
#For speed, create matrix vesion of sequences
seqmatrix <- data.matrix(seqs@.Data)
#get number of descriptors
base.num <- ncol(base.frame)
multiplier <- 0
if(!is.null(alags)) {
multiplier <- length(alags)
}
if(do.var)
multiplier <- multiplier + 1
if(do.mean)
multiplier <- multiplier + 1
if(do.position)
multiplier <- multiplier + ncol(seqmatrix)
desc.num <- base.num * multiplier
if(do.counts)
desc.num <- desc.num + length(alphabet)
#Build the names first
dnames <- rep("",desc.num)
index <- 1
for(i in 1:base.num) {
if(do.mean) {
dnames[index] <- paste(names(base.frame)[i], 'AVG', sep=".")
index <- index + 1
}
if(do.var) {
dnames[index] <- paste(names(base.frame)[i], 'VAR', sep=".")
index <- index + 1
}
if(!is.null(alags)) {
for(j in 1:length(alags)) {
dnames[index] <- paste(names(base.frame)[i], 'AUTO', alags[j], sep=".")
index <- index + 1
}
}
if(do.position) {
for(j in 1:ncol(seqmatrix)) {
dnames[index] <- paste(names(base.frame)[i], 'P',j,sep=".")
index <- index + 1
}
}
}
if(do.counts) {
count.start.index <- index
for(i in 1:length(alphabet)) {
dnames[index] <- paste('NUM', alphabet[i], sep=".")
index <- index + 1
}
}
#Now calculate the descriptors
desc <- empty.matrix(dnames, rownames(seqmatrix))
finished <- 0
res <- rep(NA, base.num * multiplier)
for(i in 1:nrow(seqmatrix)) {
index <- 1
for(j in 1:base.num) {
cur.desc <- base.matrix[seqmatrix[i,], j]
cur.mean <- mean(cur.desc)
cur.var <- var(cur.desc)
if(do.mean) {
res[index] <- cur.mean
index <- index + 1
}
if(do.var) {
res[index] <- cur.var
index <- index + 1
}
if(!is.null(alags)) {
for(k in 1:length(alags)) {
res[index] <- autocorrelation(cur.desc, cur.mean, cur.var, lag=alags[k])
index <- index + 1
}
}
if(do.position){
for(k in 1:ncol(seqmatrix)) {
res[index] <- cur.desc[k]
index <- index + 1
}
}
}
if(do.counts) {
for(j in 1:length(alphabet)) {
res[index] <- sum(seqmatrix[i,] == j)
index <- index + 1
}
}
desc[i,] <- res
}
cat("\n")
#Remove non-varying descriptors
desc.var <- apply(desc, MARGIN=2, FUN=var)
if(sum(which(desc.var == 0)) > 0) {
if(do.counts) {
if(sum(which(desc.var[1:count.start.index] == 0)) > 0) {
desc <- desc[-which(desc.var[1:count.start.index] == 0)]
}
}
else {
desc <- desc[-which(desc.var == 0)]
}
}
desc <- factory.descriptor(desc)
desc@response=response
#now, calculate means for each row
if(include.statistics) {
cat("Standard Error and variances are currently unimplemented\nCalculating Means..\n.")
dseqs <- decoys(seqs, 500)
ddesc <- matrix(0, nrow=nrow(dseqs), ncol=ncol(desc))
for(i in 1:nrow(dseqs)) {
index <- 1
for(j in 1:base.num) {
cat(paste("\r", format((base.num * multiplier * (i- 1) + index) * 100 / (base.num * multiplier * nrow(dseqs)),width=4, digits=3), "% "))
cur.desc <- base.matrix[dseqs[i,], j]
cur.mean <- mean(cur.desc)
cur.var <- var(cur.desc)
if(do.mean) {
ddesc[i, index] <- cur.mean
index <- index + 1
}
if(do.var) {
ddesc[i, index] <- cur.var
index <- index + 1
}
if(!is.null(alags)) {
for(k in 1:length(alags)) {
ddesc[i, index] <- autocorrelation(cur.desc, cur.mean, cur.var, lag=alags[k])
index <- index + 1
}
}
if(do.position){
for(k in 1:ncol(dseqs)) {
ddesc[i, index] <- cur.desc[k]
index <- index + 1
}
}
}
if(do.counts) {
for(k in 1:length(alphabet)) {
ddesc[i, index] <- sum(dseqs[i,] == k)
index <- index + 1
}
}
}
cat("\n")
#Calculate estimated p-values, the amount of overlap between the two distributions using Mann-Whitney test
index <- 1
for(i in 1:ncol(ddesc)) {
if(desc.var[i] != 0) {
x <- ddesc[,i]
y <- desc[,i]
p.value <- wilcox.test(x,y)$p.value
desc@pvalues[index] <- p.value
index <- index + 1
}
}
}
return(desc)
}
#Calculate the autocorrelation of the given function on the data with the given lag
autocorrelation <- function(data,ef=mean(data), v=var(data), lag=1) {
if(lag >= length(data)) {
return(0)
}
if(v == 0) {
return(0)
}
numer <- 0
for(i in 1:(length(data) - lag)) {
numer <- numer + (data[i] - ef) * (data[i + lag] - ef)
}
numer = numer / (length(data) - lag)
return(numer / v)
}
factory.descriptor <- function(data) {
d <- new("Descriptors", data.frame(data@.Data), row.names=rownames(data), names=colnames(data), response=numeric(0), pvalues=rep(0, ncol(data)))
names(d@pvalues) <- colnames(data)
return(d)
}
empty.df <- function(cnames, rnames, default=NA) {
df<-data.frame(matrix(rep(default,length(cnames)*length(rnames)), nrow=length(rnames)))
colnames(df)<-cnames
rownames(df) <- rnames
return(df)
}
empty.matrix <- function(cnames, rnames, default=NA) {
df<-matrix(rep(default,length(cnames)*length(rnames)), nrow=length(rnames))
colnames(df)<-cnames
rownames(df) <- rnames
return(df)
}
plot.Descriptors <- function(desc,...) {
#correlation
if(length(desc@response) > 0) {
desc.cor <- cor(desc, desc@response)
hist(desc.cor, main="Histogram of descriptor/response correlations", col="gray30")
desc.cor.sorted <- order(abs(desc.cor), decreasing=T)
par(cex.axis=0.7)
barplot(desc.cor[desc.cor.sorted[1:10]], names.arg=rownames(desc.cor)[desc.cor.sorted[1:10]], ylab="Correlation", col="gray30")
par(cex.axis=1)
}
desc.pc <- prcomp(desc, scale=T, center=T, tol=0.01, retx=T)
plot(desc.pc, main="Scree Plot", col="gray30")
plot(desc.pc$x[,1], desc.pc$x[,2], xlab="PC1", ylab="PC1", main="Projection")
biplot(desc.pc, main="Biplot")
}
setMethod("plot", "Descriptors", function(x, y, ...) plot.Descriptors(x, y, ...))
rbind.Sequences <- function(seq1, seq2) {
seqs <- new("Sequences", rbind(seq1@.Data, seq2@.Data), alphabet=unique(c(seq1@alphabet, seq2@alphabet)))
return(seqs)
}
setGeneric("rbind")
setMethod("rbind", "Sequences", function(...,deparse.level=1) rbind.Sequences(...))
decoys <- function(seqs, n=nrow(seqs)){
choices <- seqs@alphabet
decoys <- t(sapply(1:n, FUN=function(x) {sample(x=choices, size=ncol(seqs), replace=T)}))
decoyNames <- apply(decoys, MARGIN=1, FUN=function(x) {paste(seqs@alphabet[x], collapse="")})
rownames(decoys) <- decoyNames
return(new("Sequences", decoys, alphabet=seqs@alphabet))
}
#This function will give the FP and FN values. Classes should be
#positive for being in the motif and negative for not.
residuals.MotifModel <- function(model, seqs=model@seqs, classes=rep(0,nrow(seqs)), threshold = 10**-3) {
predictions <-predict(model, seqs)
result <- list(FP=0, FN=0, accuracy=0)
for(i in 1:length(predictions)) {
if(predictions[i] > threshold && classes[i] < 0) result$FP <- result$FP + 1
if(predictions[i] < threshold && classes[i] >= 0) result$FN <- result$FN + 1
}
if(sum(classes < 0 ) == 0) {
result$accuracy = 1 - result$FN / length(classes)
}
else {
result$accuracy=sqrt((sum(classes >= 0) - result$FN) * (sum(classes < 0) - result$FP) / (sum(classes >= 0) * sum(classes < 0)))
}
return(result)
}
residuals.MotifModelSet <- function(model, seqs=model@seqs, classes=rep(0, nrow(seqs)), threshold = 0) {
classes.hat <- classify(model, seqs, threshold)
result <- list(FP=0, FN=0, ACCM=0)
for(i in 1:length(classes.hat)) {
if(classes.hat[i] >= 0 && classes[i] < 0) result$FP <- result$FP + 1
if(classes.hat[i] < 0 && classes[i] >= 0) result$FN <- result$FN + 1
}
if(sum(classes < 0 ) == 0) {
result$accuracy = 1 - result$FN / length(classes)
}
else {
result$accuracy=sqrt((sum(classes >= 0) - result$FN) * (sum(classes < 0) - result$FP) / (sum(classes >= 0) * sum(classes < 0)))
}
return(result)
}
setGeneric("residuals")
setMethod("residuals", "MotifModel", function(object, ...) residuals.MotifModel(object, ...))
setMethod("residuals", "MotifModelSet", function(object,...) residuals.MotifModelSet(object,...))
FiveTwoCV.Sequences <- function(seqs, classes, motifNumber=1, motifModelType="fixed", width=3, verbose=F,...) {
result <- list(FP=0, FN=0, accuracy=0)
pclasses <- classes >= 0
pseqs.data <- seqs@.Data[pclasses,]
nclasses <- classes < 0
nseqs.data <- seqs@.Data[nclasses,]
for(i in 1:5) {
train.p <- sample(nrow(pseqs.data), size=nrow(pseqs.data) / 2, replace=F)
train.n <- sample(nrow(nseqs.data), size=nrow(nseqs.data) / 2, replace=F)
validate.p <- setdiff(1:nrow(pseqs.data), train.p)
validate.n <- setdiff(1:nrow(nseqs.data), train.n)
tseqs <- new("Sequences", rbind(pseqs.data[train.p,], nseqs.data[train.n,]), alphabet=seqs@alphabet)
vseqs <- new("Sequences", rbind(pseqs.data[validate.p,],nseqs.data[validate.n,]), alphabet=seqs@alphabet)
mdl <- motifModelSet(tseqs, motifNumber, motifModelType, width, verbose=verbose)
temp <- residuals(mdl, vseqs, classes=c(rep(1, length(validate.p)), rep(-1, length(validate.n))))
result$FP <- temp$FP + result$FP
result$FN <- temp$FN + result$FN
result$accuracy <- temp$accuracy + result$accuracy
mdl <- motifModelSet(vseqs, motifNumber, motifModelType, width, verbose=verbose)
temp <- residuals(mdl, tseqs, classes=c(rep(1, length(train.p)), rep(-1, length(train.n))))
result$FP <- temp$FP + result$FP
result$FN <- temp$FN + result$FN
result$accuracy <- temp$accuracy + result$accuracy
}
result$FP <- result$FP / (5 * nrow(nseqs.data))
result$FN <- result$FN / (5 * nrow(pseqs.data))
result$accuracy <- result$accuracy / 10
return(result)
}
FiveTwoCV.Descriptors <- function(desc, resp=desc@response, modelFxn) {
return(FiveTwoCV(desc@.data, resp, modelFxn))
}
FiveTwoCV.default <- function(data, resp, modelFxn) {
rows <- 1:nrow(data)
SMR <- 0
SVR <- 0
for(i in 1:5) {
train <- sample(nrow(data), size=nrow(data)/2, replace=F)
validate <- setdiff(rows,train)
mdl <- modelFxn(formula=resp~., data=cbind(resp=resp[train],data[train,]))
validate.yhat <- predict(object=mdl, newdata=data[validate,])
SVR <- SVR + sum((resp[validate] - validate.yhat)^2)
SMR <- SMR + sum((mean(resp[train]) - resp[validate])^2)
mdl <- modelFxn(resp~., data=cbind(resp=resp[validate],data[validate,]))
train.yhat <- predict(object=mdl, newdata=data[train,])
SVR <- SVR + sum((resp[train] - train.yhat)^2)
SMR <- SMR + sum((mean(resp[validate]) - resp[train])^2)
}
return(1 - SVR/SMR)
}
setGeneric("FiveTwoCV", def=function(data,...) standardGeneric("FiveTwoCV"), useAsDefault=function(data, ...) FiveTwoCV.default(data,...))
setMethod("FiveTwoCV", signature="Sequences", function(data, classes, ...) FiveTwoCV.Sequences(data, classes, ...))
setMethod("FiveTwoCV", signature="Descriptors", function(data, ...) FiveTwoCV.Descriptors(data, ...))
modelStep <- function(desc, modelFxn, fitness=FiveTwoCV.default, resp=desc@response, cols.start=NA, max=(nrow(desc) - 6) / 3, hypervalidateNum=5, verbose=TRUE){
if(is.na(cols.start)) {
cols.start <- colnames(desc)[order(abs(cor(desc, resp)), decreasing=T)[1]]
}
if(verbose) {
cat(paste("Performing stepwise regression starting at", cols.start, "and including up to", max,"descriptors\n"))
}
desc.hyperval.index <- sample(nrow(desc),size=hypervalidateNum)
dsub <- desc[-desc.hyperval.index,]
f.last <- 0
f.cur <- 0
h.last <- 0
h.cur <- 0
cols <- cols.start
while(f.cur >= f.last && length(cols) < max && h.cur >= h.last * 0.75) {
f.last <- f.cur
h.last <- h.cur
#attempt to add each column and check the fitness
cols.toadd <- NULL
for(i in 1:(ncol(desc))) {
cols.prop <- colnames(desc[i])
if(sum(cols == cols.prop)== 0) {
f.temp <- fitness(dsub[,c(cols,cols.prop)], resp[-desc.hyperval.index], modelFxn)
if(!(is.na(f.temp) || is.null(f.temp)) && (f.temp > f.cur || f.cur == 0)){
f.cur <- f.temp
cols.toadd <- cols.prop
}
}
}
cols <- c(cols,cols.toadd)
df <- data.frame(resp=resp[-desc.hyperval.index],dsub[,cols])
colnames(df) <- c("resp", cols)
mdl <- modelFxn(resp~., data=df)
yhat <- predict(mdl,desc[desc.hyperval.index,])
SVR <- sum((resp[desc.hyperval.index] - yhat)^2)
SMR <- sum((mean(resp[-desc.hyperval.index]) - resp[desc.hyperval.index])^2)
h.cur <- SVR/SMR
if(verbose) {
cat(paste("Current descriptors:",paste(cols, collapse=" "), "\n"))
cat(paste("Fitness:", f.cur, "\n"))
}
}
return(cols)
}
ROC <- function(model, seqs, classes, thresholdRange=NA) {
npos <- sum(classes >= 0)
nneg <- sum(classes < 0)
result <- data.frame(FP=0:nneg, FN=rep(NA,nneg+1))
if(is.na(thresholdRange)) {
preds <- predict(model, seqs)
thresholdRange <- seq(min(preds), max(preds), (max(preds) - min(preds)) / 100)
}
print("Building ROC curve...")
for(i in 1:length(thresholdRange)) {
cat(paste("\r", format(100 * i/length(thresholdRange), width=4, digits=3), "%"))
rs <- residuals(model, seqs, classes, threshold=thresholdRange[i])
fp <- rs$FP
fn <- rs$FN
if(is.na(result[result[,1] == fp,2]) || result[result[,1] == fp,2] > fn) {
result[result[,1] == fp, 2] = fn
}
}
result <- result[!is.na(result[,2]),]
result[,1] <- result[,1] / nneg
result[,2] <- 1 - result[,2] / npos
plot(result, xlab="False Positive Rate", ylab="True Positive Rate", type="l", col="red", xlim=c(0,1), ylim=c(0,1))
lines(seq(0,1,0.1), seq(0,1,0.1), lty=2)
return(result)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.