#' Randomly splits indices if there are an even number
#'
#' @param n the number of indices
#' @return a list with two disjoint sets of indices from 1 to n.
splitEvenly <- function(n)
{
if ( n %% 2 != 0)
{
stop("Not an even number of samples!")
}
sets <- split(sample.int(n), 1:2)
return(sets)
}
#' One iteration of biclustering maximization. Takes the current values of
#' d, a, b and if successful returns one iteration
#'
#' @param xDf matrix with rows representing genes and columns representing conditions (n x k)
#' @param yDf matrix with rows representing genes and columns representing conditions (n x k)
#' @param d vector of dimension k
#' @param a vector of dimension n
#' @param b vector of dimension n
#' @param lam maximum cluster size
#' @return A list containing maximum values of a, b, d
#' @export
biClustMax.optim <- function(xDf, yDf, d.start, a, b, lam, verbose = TRUE, optim.max = 4, lam.lwr = 3.5,
clustOptions = list())
{
# iterate until we've hit maximum iterations or QP has converged
if (verbose)
cat("Computing tilde matrices\n")
xTilde <- t(xDf %*% diag(d.start))
yTilde <- t(yDf %*% diag(d.start))
if (verbose)
cat("Computing SCCA component\n")
# First maximize a and b using SCCA
pen <- "LASSO"
if (!is.null(clustOptions$penalty))
pen <- clustOptions$penalty
lamx <- c(1,2,3)
if (!is.null(clustOptions$lamx))
lamx <- clustOptions$lamx
cv <- "CV.alt"
if (!is.null(clustOptions$cv))
cv <- clustOptions$cv
if (verbose)
cat("Using penalty: ", pen, "\n")
sccaRes <- scca(as.matrix(xTilde), as.matrix(yTilde),
nc = 1,
penalty = pen,
lamx = lamx,
lamy = lamx,
tuning = cv,
center = TRUE, scale = TRUE)
a <- as.numeric(sccaRes$A)
b <- as.numeric(sccaRes$B)
# XXX: Possibly remove... Unsure if I should scale in this fashion
xDf <- scale(xDf)
yDf <- scale(yDf)
# Now, maximize D using quadratic programming
# q <- - 2 * as.numeric((a %*% xDf) * (b %*% yDf))
q <- -as.numeric((a %*% xDf) * (b %*% yDf))
if (verbose)
cat("Solving optimization of d\n")
# lower bound is zero
constrMat<- diag(length(q))
# upper bound is one
constrMat <- rbind(constrMat, -diag(length(q)))
# regularize (sum(d) <= lam)
constrMat <- rbind(constrMat, rep.int(-1, length(q)))
constrLimit <- c(rep(0, length(q)),
rep(-1, length(q)),
-lam)
# FIXME: temporary until I figure out what is wrong with this function...
# regularize (sum(d) >= lam.lwr)
# constrMat <- rbind(constrMat, rep.int(1, length(q)))
# constrLimit <- c(rep(0, length(q)),
# rep(-1, length(q)),
# -lam, lam.lwr)
objectiveFn <- function(d, qVec)
{
# sum(0.5 * d^2 * qVec)
sum(d^2 * qVec)
}
optimIt <- 0
repeat {
optimRes <- constrOptim(d.start, objectiveFn, grad = NULL,
control = list(maxit = as.integer(1000000000)),
# control = list(maxit = 10000),
ui = constrMat, ci = constrLimit,
qVec = q
)
if (optimRes$convergence != 0)
{
warning("Error with convergence.\n", "\tError code: ",
optimRes$convergence, "\tMessage: ", optimRes$message,
immediate. = TRUE)
if (optimIt >= optim.max)
{
warning("*****Reached maximum number of iterations of constrOptim. Exiting",
immediate. = TRUE)
break
}
optimIt <- optimIt + 1
d.start <- optimRes$par
cat("\t\tRestarting optimization at new point.\n")
cat("\t\tconstrOptim iteration: ", optimIt + 1, "\n")
}
else
{
break
}
}
return(list(a = a, b = b, d = optimRes$par, q, sccaLam = sccaRes$lambda))
}
# TODO: Replace the regular distance in maximizeOneSplit with pDiff and see how
# affects convergence
pDiff <- function(old, new)
{
num <- dist(rbind(old, new))[1]
denom <- sqrt(sum(old^2))
num / denom
}
##' Driver function to find solution for
maximizeOneSplit <- function(geneDf, lam, epsA = 0.1, epsB = 0.1, epsD = 0.1, maxIt = 100, lam.lwr,
clustOptions = list())
{
splitIdx <- splitEvenly(nrow(geneDf))
xDf <- geneDf[splitIdx[[1]], ]
yDf <- geneDf[splitIdx[[2]], ]
# TODO: implement cross validation for lambda
# lam <- round(min(dim(xDf)) * .3)
# lam <- 20 * 2
# randomly initialize d
# FIXME: If there are lots of conditions, possible that won't be able to
# find d that satisfy the constraint (if lam is sufficiently small. Come up
# with different way to randomly assign values
d <- 0
minUnif <- 0.05
maxUnif <- 1
repeat {
d <- runif(ncol(xDf), max = maxUnif)
if (sum(d) <= lam)
break
else
maxUnif <- min(0.05, maxUnif - 0.05)
}
# FIXME: temporarily commenting out until figure out why not working correctly
# UPDATE: This bug appears when force there to be a minimum number of conditions which are being selected
# repeat {
# d <- runif(ncol(xDf), min = minUnif, max = maxUnif)
# sumD <- sum(d)
# if (sumD <= lam & sumD >= lam.lwr)
# break
# else if (sumD < lam.lwr)
# minUnif <- min(minUnif + 0.03, maxUnif - 0.03)
# else
# maxUnif <- max(minUnif + 0.04, maxUnif - 0.04)
# }
# if (sum(d) < lam.lwr)
# {
# cat("ERROR! sum(d) < lam.lwr\n")
# cat("ERROR! sum(d) < lam.lwr\n")
# cat("ERROR! sum(d) < lam.lwr\n")
# cat("ERROR! sum(d) < lam.lwr\n")
# cat("ERROR! sum(d) < lam.lwr\n")
# cat("ERROR! sum(d) < lam.lwr\n")
# }
cat("sum(d)", sum(d), "\n")
# NB: values of a and b are not important currently since evaluated after d is set.
# If ever change the order, fix this.
a <- b <- runif(nrow(xDf), min = -1, max = 1)
it <- 1
curSol <- 0
repeat {
cat("\tOne split iteration: ", it, "\n")
curSol <- biClustMax.optim(xDf, yDf, d, a, b, lam, lam.lwr = lam.lwr, clustOptions = clustOptions)
cat ("\t\tdist: ",
dist(rbind(curSol$a, a))[1], "\t",
dist(rbind(curSol$b, b))[1], "\t",
dist(rbind(curSol$d, d))[1], "\n")
if (dist(rbind(curSol$a, a))[1] < epsA &
dist(rbind(curSol$b, b))[1] < epsB &
dist(rbind(curSol$d, d))[1] < epsD & it > 2) # potentially remove it > 2
{
cat("\tConverged.\n")
break
}
a <- curSol$a
b <- curSol$b
d <- curSol$d
it <- it + 1
if (it > maxIt)
{
cat("Warning: exceed maximum number of iterations (", maxIt, ")\n")
break
}
}
# NB: If there is something funky with the distribution of a and b,
# check here... though this should work correctly
# a <- rep.int(0, length(a))
# b <- rep.int(0, length(b))
ab <- rep.int(0, length(b*2))
ab[splitIdx[[1]]] <- curSol$a
ab[splitIdx[[2]]] <- curSol$b
# a[splitIdx[[1]]] <- curSol$a
# b[splitIdx[[2]]] <- curSol$b
return(list(a = a, b = b, ab = ab, d = round(curSol$d, digits = 4),
splitIdx = splitIdx, sccaLam = curSol$sccaLam))
}
#' Given a set of features, will find the most "dominant" bicluster. Works in
#' parallel using library "multicore." To set the number of cores used, set
#' options(cores = N).
#'
#' @param geneDf data.frame with genes on rows and columns defining conditions
#' @param nSamples integer denoting the number of permutations to perform
#' @param lam regularization parameter for conditions (maximum number of conditions allows in a bicluster)
#' @export
biclusteringPar <- function(geneDf, nSamples = 100, lam, lam.lwr = 3.5, clustOptions = list())
{
mclapply(1:nSamples, function(it) {
cat("Biclustering iteration: ", it, "\n")
curSol <- maximizeOneSplit(geneDf, lam = lam, lam.lwr = lam.lwr, clustOptions = clustOptions)
return(curSol)
})
}
bcSubSampleSerial <- function(geneDf, nSamples = 100, lam, propSample = 0.6, lam.lwr = 3.5,
clustOptions = list())
{
if (propSample > 1 | propSample < 0.01)
stop("Invalid range for propSample")
nRowsSample <- round(propSample * nrow(geneDf) )
if (nRowsSample %% 2)
nRowsSample <- nRowsSample + 1
cat("Sampling ", nRowsSample, " features\n")
lapply(1:nSamples, function(it) {
cat("**Biclustering iteration: ", it, "\n")
sampIdx <- sample.int(nrow(geneDf), size = nRowsSample)
abSol <- rep.int(NA, nrow(geneDf))
curSol <- maximizeOneSplit(geneDf[sampIdx,], lam, lam.lwr = lam.lwr,
clustOptions = clustOptions)
abSol[sampIdx] <- curSol$ab
d <- curSol$d
return(list(ab = abSol, d = d, sccaLam = curSol$sccaLam))
})
}
#' Given a set of features, will find the most "dominant" bicluster. Works in
#' parallel using library "multicore." To set the number of cores used, set
#' options(cores = N).
#'
#' @param geneDf data.frame with genes on rows and columns defining conditions
#' @param nSamples integer denoting the number of permutations to perform
#' @param lam regularization parameter for conditions (maximum number of conditions allows in a bicluster)
#' @export
bcSubSamplePar <- function(geneDf, nSamples = 100, lam, propSample = 0.6, lam.lwr = 3.5,
clustOptions = list())
{
if (propSample > 1 | propSample < 0.01)
stop("Invalid range for propSample")
nRowsSample <- round(propSample * nrow(geneDf) )
if (nRowsSample %% 2)
nRowsSample <- nRowsSample + 1
cat("Sampling ", nRowsSample, " features\n")
mclapply(1:nSamples, function(it) {
cat("**Biclustering iteration: ", it, "\n")
sampIdx <- sample.int(nrow(geneDf), size = nRowsSample)
abSol <- rep.int(NA, nrow(geneDf))
curSol <- maximizeOneSplit(geneDf[sampIdx,], lam, lam.lwr = lam.lwr,
clustOptions = clustOptions)
abSol[sampIdx] <- curSol$ab
d <- curSol$d
return(list(ab = abSol, d = d, sccaLam = curSol$sccaLam))
})
}
# XXX: This is the best performing method
postSubSample.pca <- function(subSampleSol, abThresh = 0.6, dQuant = 0.5)
{
# rows are different permutations,
# columns are the genes or conditions
ab <- t(sapply(subSampleSol, function (x) abs(x$ab)))
d <- t(sapply(subSampleSol, function (x) x$d))
bootStrapNAs <- function(dat, lwr, upr)
{
for (icol in 1:ncol(dat))
{
rng <- quantile(dat[,icol], probs = c(lwr, upr), na.rm = T)
# FIXME: why is this not matching at least some things?
whichValid <- which( (rng[1] <= dat[,icol]) & (dat[,icol] <= rng[2]))
nNa <- sum(is.na(dat[,icol]))
if (length(whichValid) == 0)
{
# cat("len == 0", icol, "\n")
# print(sum(!is.na(dat[,icol])))
whichValid <- which(!is.na(dat[,icol]))
# print(whichValid)
# print("----")
}
samps <- NA
if (length(whichValid) == 1)
samps <- rep.int(whichValid, nNa)
else
samps <- sample(whichValid, nNa, replace = T)
dat[is.na(dat[,icol]), icol] <- dat[samps,icol]
# if (any(is.na(dat[,icol])))
# {
# cat("samples:", icol, "\n")
# print(dat[samps,icol])
# print("whichValid values:")
# print(dat[whichValid,icol])
# print("whichValid:")
# print(length(whichValid))
# print(whichValid)
# }
}
return(dat)
}
ab <- bootStrapNAs(ab, 0.05, 0.95)
# print(sum(is.infinite(ab)))
# print(sum(is.na(ab)))
# a_ply(ab, 2, function(x)
# {
# if(any(is.na(x))) print(x)
# }
# )
# print(summary(ab))
# print(summary(t(ab)))
pcaAB <- prcomp(ab, center = F, scale. = F)
pcaAB$rotation <- as.data.frame(data.matrix(as.data.frame(pcaAB$rotation, stringsAsFactors = F)))
pcaAB$rotation[,"mean"] <- apply(ab, 2, mean, na.rm = T)
pcaAB$rotation[,"sd"] <- apply(ab, 2, sd, na.rm = T)
pcaAB$rotation[,"median"] <- apply(ab, 2, median, na.rm = T)
pcaAB$rotation[,"var"] <- apply(ab, 2, var, na.rm = T)
pcaAB$rotation[,"cov"] <- with(pcaAB$rotation, sd / mean)
pcaD <- prcomp(d, center = F, scale. = F)
dQuant <- apply(d, 2, quantile, probs = dQuant, na.rm = T)
dSd <- apply(d, 2, sd, na.rm = T)
clustKmeans <- function(dat, minK = 2)
{
gap <- clusGap(as.matrix(dat), kmeans, 5)
nk <- maxSE(gap$Tab[,"gap"], gap$Tab[,"SE.sim"])
if (nk == 1)
nk <- minK
cat("Conditions k: ", nk, "\n")
kRes <- kmeans(dat, nk, nstart = 20)
kMax <- which.max(kRes$centers[,1])
which(kRes$cluster == kMax)
}
# colIdx <- clustKmeans(cbind(dQuant, dSd))
# rowIdx <- clustKmeans(with(pcaAB$rotation, cbind(-PC1 * sdev[1], PC2 * sdev[2])))
df <- cbind(-pcaAB$rotation$PC1 * pcaAB$sdev[1],
pcaAB$rotation$PC2 * pcaAB$sdev[2])
rowIdx <- clustKmeans(df)
df2 <- cbind(-pcaD$rotation[,"PC1"] * pcaD$sdev[1])
# pcaD$rotation[,"PC2"] * pcaD$sdev[2])
colIdx <- clustKmeans(df2)
# colIdx <- clustKmeans(cbind(dQuant))
pcaD$rotation <- data.frame(pcaD$rotation)
pcaD$rotation[,"cluster"] <- "background"
pcaD$rotation[colIdx,"cluster"] <- "bicluster"
# return(list(abDat = pcaAB$rotation, dDat = pcaD, cluster = list(rowIdx = rowIdx, colIdx = colIdx)))
return(list(rowIdx = rowIdx, colIdx = colIdx))
}
# TODO: include minimum condition size to be 3
# TODO: run this for 20 - 50
# TODO: For selecting the "optimal" condition, look at average correlation plot
# (should increase to a flat region) and also look at the number of conditions
# selected. Should also flatten out
# TODO: instead of minLam and maxLam take a vector of lams
optConditionSize <- function(df, minLam, maxLam, cutoff = 0.6,
bcMethod = bcSubSamplePar,
postProcess = postSubSample.pca, ...)
{
D <- as.data.frame(matrix(0, nrow = maxLam - minLam + 1, ncol = ncol(df)))
rownames(D) <- minLam:maxLam
sols <- list()
it <- 1
for (l in minLam:maxLam)
{
cat("Doing optimization for lambda = ", l, "\n")
# temporarily supress output
# sink("/dev/null")
compTime <- system.time({
curSol <- bcMethod(df, lam = l, lam.lwr = 3)
sols[[it]] <- curSol
if (!is.null(postProcess))
{
curClust <- postProcess(curSol, ...)
D[it, curClust$colIdx] <- 1
}
it <- it + 1
})
# sink()
print(compTime)
cat(length(curClust$colIdx), "\n")
}
return(list(sol = sols, D = D))
}
postSubSample <- function(subSampleSol, percentile = 0.75)
{
ab <- t(sapply(subSampleSol, function (x) abs(x$ab)))
d <- t(sapply(subSampleSol, function (x) x$d))
abGt <- apply(ab, 2, function(col) {
# median(col, na.rm = TRUE)
mean(col, na.rm = TRUE)
# quantile(col, probs = percentile, na.rm = TRUE)
})
dGt <- apply(d, 2, function(col)
{
median(col, na.rm = TRUE)
})
# dGt <- t(dGt)
return(list(ab = abGt, d = dGt))
}
postSubSample.percent <- function(subSampleSol, percentile = 0.95, eps = 0.05)
{
ab <- t(sapply(subSampleSol, function (x) abs(x$ab)))
d <- sapply(subSampleSol, function (x) x$d)
# want to implement in top 5%, 90% it is chosen
abGt <- apply(ab, 1, function(row) {
# median(col, na.rm = TRUE)
row >= quantile(row, probs = percentile, na.rm = TRUE)
})
rowIdx <- which(apply(abGt, 1, mean, na.rm = T) >= eps)
# dGt <- apply(d, 1, function(row)
# {
# row >= quantile(row, probs = percentile, na.rm = TRUE)
# })
# dGt <- t(dGt)
# colIdx <- which(apply(d, 1, mean, na.rm = T) >= eps)
cutHCluster <- function(mat)
{
hc <- hclust(dist(mat))
hcCuts <- cutree(hc, 2)
cutIdx <- 1
if (mean(mat[which(hcCuts == 1), ]) <
mean(mat[which(hcCuts == 2), ]))
{
cutIdx <- 2
}
cutIdx <- which(hcCuts == cutIdx)
return(cutIdx)
}
colIdx <- cutHCluster(d)
return(list(rowIdx = rowIdx, colIdx = colIdx))
}
postSubSample.percent2 <- function(subSampleSol, percentile1 = 0.95, eps1 = 0.5,
percentile2 = 0.95, eps2 = 0.5)
{
ab <- t(sapply(subSampleSol, function (x) abs(x$ab)))
d <- t(sapply(subSampleSol, function (x) x$d))
# want to implement in top 5%, 90% it is chosen
abGt <- apply(ab, 1, function(row) {
# median(col, na.rm = TRUE)
row >= quantile(row, probs = percentile1, na.rm = TRUE)
})
rowIdx <- which(apply(abGt, 1, mean, na.rm = T) >= eps1)
dGt <- apply(d, 1, function(row) {
row >= quantile(row, probs = percentile2, na.rm = T)
})
colIdx <- which(apply(dGt, 1, mean, na.rm = T) >= eps2)
return(list(rowIdx = rowIdx, colIdx = colIdx))
}
postSubSample.sort <- function(subSampleSol, percentile1 = 0.95, eps1 = 0.5,
percentile2 = 0.95, eps2 = 0.5)
{
ab <- t(sapply(subSampleSol, function (x) abs(x$ab)))
d <- t(sapply(subSampleSol, function (x) x$d))
# want to implement in top 5%, 90% it is chosen
abGt <- apply(ab, 1, function(row) {
# median(col, na.rm = TRUE)
row >= quantile(row, probs = percentile1, na.rm = TRUE)
})
rowIdx <- which(apply(abGt, 1, mean, na.rm = T) >= eps1)
dSort <- t(apply(d, 2, function(col) {sort(col)}))
cutHCluster <- function(mat)
{
hc <- hclust(dist(mat))
hcCuts <- cutree(hc, 2)
cutIdx <- 1
if (mean(mat[which(hcCuts == 1), ]) <
mean(mat[which(hcCuts == 2), ]))
{
cutIdx <- 2
}
cutIdx <- which(hcCuts == cutIdx)
return(cutIdx)
}
colIdx <- cutHCluster(dSort)
return(list(rowIdx = rowIdx, colIdx = colIdx))
}
postSubSample.top <- function(subSampleSol, percentile1 = 0.95, eps1 = 0.5,
nc)
{
ab <- t(sapply(subSampleSol, function (x) abs(x$ab)))
d <- sapply(subSampleSol, function (x) x$d)
# want to implement in top 5%, 90% it is chosen
abGt <- apply(ab, 1, function(row) {
# median(col, na.rm = TRUE)
row >= quantile(row, probs = percentile1, na.rm = TRUE)
})
rowIdx <- which(apply(abGt, 1, mean, na.rm = T) >= eps1)
dMean <- apply(d, 1, mean)
colIdx <- order(dMean, decreasing = T)[1:nc]
return(list(rowIdx = rowIdx, colIdx = colIdx))
}
postSubSample.median <- function(subSampleSol, abQuant = 0.5)
{
# rows are different permutations,
# columns are the genes or conditions
ab <- t(sapply(subSampleSol, function (x) abs(x$ab)))
d <- t(sapply(subSampleSol, function (x) x$d))
abMed <- apply(ab, 2, quantile, probs = abQuant, na.rm = T)
dMed <- apply(d, 2, median)
return(list(ab = abMed, d = dMed))
}
postSubSample.median.kmeans <- function(subSampleSol, abQuant = 0.5, dQuant = 0.5,
seMethod = "firstSEmax")
{
# rows are different permutations,
# columns are the genes or conditions
ab <- t(sapply(subSampleSol, function (x) abs(x$ab)))
d <- t(sapply(subSampleSol, function (x) x$d))
abQuant <- apply(ab, 2, quantile, probs = abQuant, na.rm = T)
abSd <- apply(ab, 2, sd, na.rm = T)
dQuant <- apply(d, 2, quantile, probs = dQuant, na.rm = T)
dSd <- apply(d, 2, sd, na.rm = T)
clustKmeans <- function(dat)
{
gap <- clusGap(as.matrix(dat), kmeans, 5)
nk <- maxSE(gap$Tab[,"gap"], gap$Tab[,"SE.sim"])
if (nk == 1)
nk <- 2
kRes <- kmeans(dat, nk, nstart = 20)
kMax <- which.max(kRes$centers[,1])
which(kRes$cluster == kMax)
}
rowIdx <- clustKmeans(cbind(abQuant, abSd, abQuant / abSd))
colIdx <- clustKmeans(cbind(dQuant, dSd))
return(list(rowIdx = rowIdx, colIdx = colIdx))
}
# credit to stackoverflow:
# http://stackoverflow.com/questions/2547402/standard-library-function-in-r-for-finding-the-mode
findMode <- function(x, na.rm = T)
{
if (na.rm)
x <- x[!is.na(x)]
ux <- unique(x)
if (length(ux) == length(x))
print("ERROR: No mode")
ux[which.max(tabulate(match(x, ux)))]
}
postSubSample.median.kmeans2 <- function(subSampleSol, abQuant = 0.5, dQuant = 0.5,
seMethod = "firstSEmax")
{
# rows are different permutations,
# columns are the genes or conditions
ab <- t(sapply(subSampleSol, function (x) abs(x$ab)))
d <- t(sapply(subSampleSol, function (x) x$d))
abStats <- data.frame(med = apply(ab, 2, quantile, probs = abQuant, na.rm = T),
upr = apply(ab, 2, quantile, probs = 0.75, na.rm = T),
mode = apply(round(ab, digits = 3), 2, findMode, na.rm = T),
sd = apply(ab, 2, sd, na.rm = T),
min = apply(ab, 2, min, na.rm = T))
abStats$medRate <- with(abStats, med / sd)
abStats$mean <- apply(ab, 2, mean, na.rm = T)
dQuant <- apply(d, 2, quantile, probs = dQuant, na.rm = T)
dSd <- apply(d, 2, sd, na.rm = T)
# abMode <- apply(round(ab, digits = 3), 2, findMode, na.rm = T)
clustKmeans <- function(dat, minK = 2)
{
gap <- clusGap(as.matrix(dat), kmeans, 5)
nk <- maxSE(gap$Tab[,"gap"], gap$Tab[,"SE.sim"])
if (nk == 1)
nk <- minK
kRes <- kmeans(dat, nk, nstart = 20)
kMax <- which.max(kRes$centers[,1])
which(kRes$cluster == kMax)
}
# rowIdx <- clustKmeans(abStats, 3)
rowIdx <- clustKmeans(with(abStats, cbind(medRate, sd)), 3)
# clustKmeans(with(abStats, cbind(med, sd)), 2)
# clustKmeans(with(abStats, cbind(mode, medRate)), 3)
# clustKmeans(with(abStats, cbind(mean, medRate)), 3)
# clustKmeans(with(abStats, cbind(mean, sd, medRate)), 3)
# clustKmeans(with(abStats, cbind(medRate, sd)), 4)
# mat <- with(abStats, as.matrix(cbind(medRate, sd)))
colIdx <- clustKmeans(cbind(dQuant, dSd))
return(list(rowIdx = rowIdx, colIdx = colIdx))
}
vScore <- function(set1, set2)
{
length(intersect(set1, set2)) / length(union(set1, set2))
}
vScore2 <- function(...)
{
length(intersect(...)) / length(union(...))
}
postSubSample.tight <- function(subSampleSol, abThresh = 0.6, dQuant = 0.5)
{
# rows are different permutations,
# columns are the genes or conditions
ab <- t(sapply(subSampleSol, function (x) abs(x$ab)))
d <- t(sapply(subSampleSol, function (x) x$d))
abStats <- data.frame(med = apply(ab, 2, median, na.rm = T),
upr = apply(ab, 2, quantile, probs = 0.75, na.rm = T),
mode = apply(round(ab, digits = 3), 2, findMode, na.rm = T),
sd = apply(ab, 2, sd, na.rm = T),
min = apply(ab, 2, min, na.rm = T))
abStats$medRate <- with(abStats, med / sd)
abStats$mean <- apply(ab, 2, mean, na.rm = T)
dQuant <- apply(d, 2, quantile, probs = dQuant, na.rm = T)
dSd <- apply(d, 2, sd, na.rm = T)
tight <- function(dat, minK = 2, maxK = 7, thresh = abThresh)
{
getBestClust <- function(df, theK)
{
kRes <- kmeans(df, theK, nstart = 20)
kMax <- which.max(kRes$centers[,1])
which(kRes$cluster == kMax)
}
clusts <- lapply(minK:maxK, function(curK)
{
getBestClust(dat, curK)
})
vs <- sapply((minK+1):maxK, function(curK)
{
vScore( clusts[[curK - minK]], clusts[[curK - minK + 1]] )
})
vsGt <- min(which(vs >= thresh))
bestK <- minK + vsGt
cat("Choosing tight cluster: ", bestK, "\n")
getBestClust(dat, bestK)
}
# rowIdx <- with(abStats, tight(cbind(medRate, sd)))
# rowIdx <- with(abStats, tight(cbind(medRate)))
# rowIdx <- with(abStats, tight(cbind(med, sd)))
# rowIdx <- with(abStats, tight(cbind(med, sd)))
clustKmeans <- function(dat, minK = 2)
{
gap <- clusGap(as.matrix(dat), kmeans, 5)
nk <- maxSE(gap$Tab[,"gap"], gap$Tab[,"SE.sim"])
if (nk == 1)
nk <- minK
cat("Conditions k: ", nk, "\n")
kRes <- kmeans(dat, nk, nstart = 20)
kMax <- which.max(kRes$centers[,1])
which(kRes$cluster == kMax)
}
# colIdx <- clustKmeans(cbind(dQuant, dSd))
colIdx <- clustKmeans(cbind(dQuant))
return(list(abDat = abStats, cluster = list(rowIdx = rowIdx, colIdx = colIdx)))
}
postSubSample.tightPCA <- function(subSampleSol, abThresh = 0.6, dQuant = 0.5)
{
# rows are different permutations,
# columns are the genes or conditions
ab <- t(sapply(subSampleSol, function (x) abs(x$ab)))
d <- t(sapply(subSampleSol, function (x) x$d))
bootStrapNAs <- function(dat, lwr, upr)
{
for (col in 1:ncol(dat))
{
rng <- quantile(dat[,col], probs = c(lwr, upr), na.rm = T)
whichValid <- which(rng[1] <= dat[,col] & dat[,col] <= rng[2])
nNa <- sum(is.na(dat[,col]))
dat[is.na(dat[,col]), col] <- dat[sample(whichValid, nNa, replace = T), col]
}
return(dat)
}
# replace NAs w/ a bootstrap
ab <- bootStrapNAs(ab, 0.05, 0.95)
pcaAB <- prcomp(ab, center = F, scale. = F)
pcaAB$rotation <- as.data.frame(data.matrix(as.data.frame(pcaAB$rotation, stringsAsFactors = F)))
pcaAB$rotation[,"median"] <- apply(ab, 2, median, na.rm = T)
pcaAB$rotation[,"mean"] <- apply(ab, 2, mean, na.rm = T)
pcaAB$rotation[,"var"] <- apply(ab, 2, var, na.rm = T)
pcaAB$rotation[,"sd"] <- apply(ab, 2, sd, na.rm = T)
pcaAB$rotation[,"cov"] <- pcaAB$rotation[,"sd"] / pcaAB$rotation[,"mean"]
dQuant <- apply(d, 2, quantile, probs = dQuant, na.rm = T)
dSd <- apply(d, 2, sd, na.rm = T)
# pcaAB$rotation[,c("PC2")] <- 0
pcaAB$rotation[,"zeros"] <- 0
# df <- pcaAB$rotation[,c("PC1", "PC2", "cov")]
df <- pcaAB$rotation[,c("PC1", "zeros")]
# df <- pcaAB$rotation[,c("PC1", "cov")]
# df <- pcaAB$rotation[,c("PC1", "PC2")]
# what if you remove the things that are so far away in distn
# toInc <- which(df[,1] <= quantile(df[,1], probs = 0.5) )
toInc <- which(df[,1] <= quantile(df[,1], probs = 1) )
tc <- tight.clust(df[toInc,], 5, 10, standardize.gene = F)
# tc <- tight.clust(df[toInc,], 2, 7, standardize.gene = F)
pcaAB$rotation[,"cluster"] <- -1
pcaAB$rotation[toInc,"cluster"] <- tc$cluster
pcaAB$rotation[,"cluster"] <- as.factor(pcaAB$rotation[,"cluster"])
clustCenter <- pcaAB$rotation %.% group_by(cluster) %.% summarize(center = mean(PC1))
minClust <- which.min(clustCenter$center)
# minClust <- 1
# rowIdx <- which(1 == pcaAB$rotation[,"cluster"])
rowIdx <- which(minClust == pcaAB$rotation[,"cluster"])
clustKmeans <- function(dat, minK = 2)
{
gap <- clusGap(as.matrix(dat), kmeans, 5)
nk <- maxSE(gap$Tab[,"gap"], gap$Tab[,"SE.sim"])
if (nk == 1)
nk <- minK
cat("Conditions k: ", nk, "\n")
kRes <- kmeans(dat, nk, nstart = 20)
kMax <- which.max(kRes$centers[,1])
which(kRes$cluster == kMax)
}
# colIdx <- clustKmeans(cbind(dQuant, dSd))
colIdx <- clustKmeans(cbind(dQuant))
return(list(abDat = pcaAB$rotation, tc = tc$cluster, cluster = list(rowIdx = rowIdx, colIdx = colIdx)))
return(list(rowIdx = rowIdx, colIdx = colIdx))
}
# Each row is a gene, each column is a different iteration
getA <- function(bcSol, takeAbs = TRUE)
{
if (takeAbs)
return( abs(sapply(bcSol, function (x) x$ab)) )
sapply(bcSol, function (x) x$ab)
}
# Each row is a condition, each column is a different iteration
getD <- function(bcSol)
{
sapply(bcSol, function (x) x$d)
}
postSubSample.mean.kmeans <- function(subSampleSol, abQuant = 0.5, dQuant = 0.5,
seMethod = "firstSEmax")
{
# rows are different permutations,
# columns are the genes or conditions
ab <- t(sapply(subSampleSol, function (x) abs(x$ab)))
d <- t(sapply(subSampleSol, function (x) x$d))
abQuant <- apply(ab, 2, mean, na.rm = T)
dQuant <- apply(d, 2, mean,na.rm = T)
clustKmeans <- function(dat)
{
gap <- clusGap(as.matrix(dat), kmeans, 5)
nk <- maxSE(gap$Tab[,"gap"], gap$Tab[,"SE.sim"], method = seMethod)
if (nk == 1)
nk <- 2
kRes <- kmeans(dat, nk, nstart = 20)
kMax <- which.max(kRes$centers)
which(kRes$cluster == kMax)
}
rowIdx <- clustKmeans(abQuant)
colIdx <- clustKmeans(dQuant)
return(list(rowIdx = rowIdx, colIdx = colIdx))
}
post.hclust <- function(postSample, nAB = 3, nD = 2)
{
clustAB <- cutree(hclust(dist(postSample$ab)), nAB)
clustABCenter <- sapply(unique(clustAB),
function(it) median(postSample$ab[which(clustAB == it)]))
clustABMax <- which.max(clustABCenter)
rowIdx <- which(clustAB == clustABMax)
clustD <- cutree(hclust(dist(postSample$d)), nD)
clustDCenter <- sapply(unique(clustD),
function(it) median(postSample$d[which(clustD == it)]))
clustDMax <- which.max(clustDCenter)
colIdx <- which(clustD == clustDMax)
list(list(rowIdx = rowIdx, colIdx = colIdx))
}
post.kmeans <- function(postSample, nAB = 3, nD = 2)
{
kClustAB <- kmeans(postSample$ab, nAB)
kClustABMax <- which.max(kClustAB$centers)
rowIdx <- which(kClustAB$cluster == kClustABMax)
clustD <- kmeans(postSubSample$d, nD)
clustDMax <- which.max(clustD$center)
colIdx <- which(clustD$cluster == clustDMax)
list(list(rowIdx = rowIdx, colIdx = colIdx))
}
bcMultipleClusters.subSample <- function(geneDf, lam, nClusters, nSamples = 100,
propSample = 0.6)
{
allBcSol <- list()
clusters <- list()
for (clust in 1:nClusters)
{
bcSol <- bcSubSamplePar(geneDf, nSamples, lam, propSample)
allBcSol <- append(allBcSol, list(bcSol))
postSol <- postSubSample.percent(bcSol, 0.9, 0.5)
rowIdx <- postSol$rowIdx
colIdx <- postSol$colIdx
clusters <- append(clusters, list(list(rowIdx = rowIdx, colIdx = colIdx)))
# given a list of rows and columns, converts pairwise combinations into
# 1D index for matrix
get1DIdx <- function(rows, cols)
{
allPairs <- expand.grid(rows, cols)
nrow(geneDf) * (allPairs[, 2] - 1) + allPairs[, 1]
}
# mask each gene with random values from the rest of the matrix also
# considered doing this with JUST the other values of the gene... need
# to experiment with that
clustIdx <- get1DIdx(rowIdx, colIdx)
geneDf[clustIdx] <- sample(geneDf[-clustIdx], length(clustIdx))
# temporarily sample rnorm to debug FP issue
# geneDf[clustIdx] <- rnorm(length(clustIdx))
}
return(list(allBcSol = allBcSol, clusters = clusters))
}
bcMultipleClusters <- function(geneDf, lam, nClusters, nSamples = 100)
{
allBcSol <- list()
clusters <- list()
for (clust in 1:nClusters)
{
bcSol <- biclusteringPar(geneDf, nSamples, lam)
allBcSol <- append(allBcSol, list(bcSol))
# Take abs() of A since depending on partition, correlation could be in
# opposite direction
A <- abs(sapply(bcSol, function(x) x$ab))
D <- sapply(bcSol, function(x) x$d)
# find features/conditions which are clustered
# to find the cluster, find cut with fewest elements...
# XXX: this is not necessarily correct. For example, it is *possible*
# that a cluster might contain more than half of the genes and only
# a few conditions. Think about a "better" way to do this... i.e.
# perhaps a matrix multiplication
# returns which indices correspond to a cluster
# TODO: fix this to get the cluster with the most non-zero entries
cutHCluster <- function(mat)
{
hc <- hclust(dist(mat))
hcCuts <- cutree(hc, 2)
# cutIdx <- 1
# if (mean(hcCuts == 1) > 0.5)
# {
# cutIdx <- 2
# }
# cutIdx <- which(hcCuts == cutIdx)
cutIdx <- 1
if (mean(mat[which(hcCuts == 1), ]) <
mean(mat[which(hcCuts == 2), ]))
{
cutIdx <- 2
}
cutIdx <- which(hcCuts == cutIdx)
return(cutIdx)
}
rowIdx <- cutHCluster(A)
colIdx <- cutHCluster(D)
clusters <- append(clusters, list(list(rowIdx = rowIdx, colIdx = colIdx)))
# given a list of rows and columns, converts pairwise combinations into
# 1D index for matrix
get1DIdx <- function(rows, cols)
{
allPairs <- expand.grid(rows, cols)
nrow(geneDf) * (allPairs[, 2] - 1) + allPairs[, 1]
}
# mask each gene with random values from the rest of the matrix also
# considered doing this with JUST the other values of the gene... need
# to experiment with that
clustIdx <- get1DIdx(rowIdx, colIdx)
geneDf[clustIdx] <- sample(geneDf[-clustIdx], length(clustIdx))
# temporarily sample rnorm to debug FP issue
# geneDf[clustIdx] <- rnorm(length(clustIdx))
}
return(list(allBcSol = allBcSol, clusters = clusters))
}
bcMultipleClusters.kmeans <- function(geneDf, lam, nClusters, nSamples = 100)
{
allBcSol <- list()
clusters <- list()
for (clust in 1:nClusters)
{
bcSol <- biclusteringPar(geneDf, nSamples, lam)
allBcSol <- append(allBcSol, list(bcSol))
# Take abs() of A since depending on partition, correlation could be in
# opposite direction
A <- abs(sapply(bcSol, function(x) x$ab))
D <- sapply(bcSol, function(x) x$d)
# find features/conditions which are clustered
# to find the cluster, find cut with fewest elements...
# XXX: this is not necessarily correct. For example, it is *possible*
# that a cluster might contain more than half of the genes and only
# a few conditions. Think about a "better" way to do this... i.e.
# perhaps a matrix multiplication
# returns which indices correspond to a cluster
# TODO: fix this to get the cluster with the most non-zero entries
cutKcluster <- function(mat)
{
cuts <- kmeans(mat, 2)$cluster
cutIdx <- 1
if (mean(mat[which(cuts == 1), ]) <
mean(mat[which(cuts == 2), ]))
{
cutIdx <- 2
}
cutIdx <- which(cuts == cutIdx)
return(cutIdx)
}
rowIdx <- cutKcluster(A)
colIdx <- cutKcluster(D)
clusters <- append(clusters, list(list(rowIdx = rowIdx, colIdx = colIdx)))
# given a list of rows and columns, converts pairwise combinations into
# 1D index for matrix
get1DIdx <- function(rows, cols)
{
allPairs <- expand.grid(rows, cols)
nrow(geneDf) * (allPairs[, 2] - 1) + allPairs[, 1]
}
# mask each gene with random values from the rest of the matrix also
# considered doing this with JUST the other values of the gene... need
# to experiment with that
clustIdx <- get1DIdx(rowIdx, colIdx)
# geneDf[clustIdx] <- sample(geneDf[-clustIdx], length(clustIdx))
# temporarily sample rnorm to debug FP issue
geneDf[clustIdx] <- rnorm(length(clustIdx))
}
return(list(allBcSol = allBcSol, clusters = clusters))
}
# XXX: OBSOLETE! This is mostly still here for testing...
biclustering <- function(geneDf, nSamples = 100)
{
it <- 1
aVal <- list()
bVal <- list()
dVal <- list()
abVal <- list()
splitIdx <- list()
while (it <= nSamples)
{
cat("Biclustering iteration: ", it, "\n")
curSol <- maximizeOneSplit(geneDf)
aVal <- append(aVal, list(curSol$a))
bVal <- append(bVal, list(curSol$b))
dVal <- append(dVal, list(curSol$d))
abVal <- append(abVal, list(curSol$ab))
splitIdx <- append(splitIdx, list(curSol$splitIdx))
it <- it + 1
}
return(list(a = aVal, b = bVal, ab = abVal, d = dVal, splitIdx = splitIdx))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.