# ======================================================================================================
#
# Aligned svd: svd plus aligning the result along the average expression of the input data.
#
# ======================================================================================================
# CAUTION: this function assumes normalized x and no missing values.
# First principal component of x (columns = variables), sign-aligned with a
# weighted average of the columns so the returned vector points "along" the
# overall expression profile rather than in an arbitrary svd sign.
# CAUTION: this function assumes normalized x and no missing values.
.alignedFirstPC <- function(x, power = 6, verbose = 2, indent = 0) {
  x <- as.matrix(x)
  pc <- try(svd(x, nu = 1, nv = 0)$u[, 1], silent = TRUE)
  if (inherits(pc, "try-error")) {
    # svd occasionally fails to converge; fall back to a weighted row mean.
    if (verbose > 0) {
      spaces <- indentSpaces(indent)
      printFlush(paste(
        spaces, ".alignedFirstPC: FYI: svd failed, using a weighted mean instead.\n",
        spaces, " ...svd reported:", pc
      ))
    }
    # Seed with the plain row mean, then re-weight columns by how strongly
    # they correlate with that seed (raised to `power`).
    seed <- rowMeans(x, na.rm = TRUE)
    colWeights <- abs(cor(x, seed, use = "p"))^power
    weight <- matrix(colWeights, nrow(x), ncol(x), byrow = TRUE)
    pc <- scale(rowMeans(x * weight, na.rm = TRUE))
  } else {
    # Align the sign of the PC with a weighted mean of the columns.
    weight <- abs(cor(x, pc))^power
    alignTarget <- rowWeightedMeans(x, weight, na.rm = TRUE)
    alignment <- cov(pc, alignTarget)
    if (!is.finite(alignment)) alignment <- 0
    if (alignment < 0) pc <- -pc
  }
  pc
}
# ==========================================================================================================
#
# Branch eigengene split (dissimilarity) calculation
#
# ==========================================================================================================
# Assumes correct input: multiExpr is scaled to mean 0 and variance 1, branch1 and branch2 are numeric
# indices that have no overlap.
# Eigengene dissimilarity of two branches: 1 - cor of the branches' aligned
# first principal components (1 - |cor| when signed = FALSE).
# Assumes expr is scaled to mean 0 and variance 1 and that branch1 and
# branch2 are non-overlapping numeric column indices.
branchEigengeneDissim <- function(expr, branch1, branch2,
                                  corFnc = cor, corOptions = list(use = "p"),
                                  signed = TRUE, ...) {
  # Feed the two branch eigengenes into the correlation function via its
  # option list.
  callArgs <- corOptions
  callArgs$x <- .alignedFirstPC(expr[, branch1], verbose = 0)
  callArgs$y <- .alignedFirstPC(expr[, branch2], verbose = 0)
  corValue <- as.numeric(do.call(match.fun(corFnc), callArgs))
  if (length(corValue) != 1) {
    stop("Internal error in branchEigengeneDissim: cor has length ", length(corValue))
  }
  if (signed) 1 - corValue else 1 - abs(corValue)
}
# Consensus eigengene dissimilarity across the sets of a multiData structure:
# computes branchEigengeneDissim in each set and takes the consensus
# (1 - consensusQuantile) quantile across sets.
# reproduceQuantileError = TRUE reproduces a historical bug in which the
# quantile at consensusQuantile (instead of 1 - consensusQuantile) was used.
mtd.branchEigengeneDissim <- function(multiExpr, branch1, branch2,
                                      corFnc = cor, corOptions = list(use = "p"),
                                      consensusQuantile = 0, signed = TRUE,
                                      reproduceQuantileError = FALSE, ...) {
  perSetDissim <- mtd.apply(multiExpr, branchEigengeneDissim,
    branch1 = branch1, branch2 = branch2,
    corFnc = corFnc, corOptions = corOptions,
    signed = signed,
    returnList = TRUE
  )
  dissimValues <- unlist(perSetDissim)
  targetProb <- if (reproduceQuantileError) consensusQuantile else 1 - consensusQuantile
  quantile(dissimValues, prob = targetProb, na.rm = TRUE, names = FALSE)
}
# Eigengene similarity (or dissimilarity) of two branches using the
# correlation function and options stored in networkOptions.
# expr: scaled expression data; branch1, branch2: column index vectors.
# returnDissim: if TRUE return 1 - similarity, otherwise the similarity.
branchEigengeneSimilarity <- function(expr, branch1, branch2,
                                      networkOptions,
                                      returnDissim = TRUE,
                                      ...) {
  corFnc <- match.fun(networkOptions$corFnc)
  cor0 <- as.numeric(do.call(
    corFnc,
    c(
      list(
        x = .alignedFirstPC(expr[, branch1], verbose = 0),
        y = .alignedFirstPC(expr[, branch2], verbose = 0)
      ),
      networkOptions$corOptions
    )
  ))
  # Bug fix: error message previously named branchEigengeneDissim.
  if (length(cor0) != 1) stop("Internal error in branchEigengeneSimilarity: cor has length ", length(cor0))
  # NOTE(review): grepl("signed", ...) also matches "unsigned", so abs() is
  # applied for all standard network types — confirm this is intended.
  if (grepl("signed", networkOptions$networkType)) cor0 <- abs(cor0)
  if (returnDissim) 1 - cor0 else cor0
}
# Hierarchical-consensus eigengene dissimilarity: per-set branch eigengene
# similarities are combined via simpleHierarchicalConsensusCalculation and
# the result is converted to a dissimilarity (1 - consensus similarity).
hierarchicalBranchEigengeneDissim <- function(multiExpr,
                                              branch1, branch2,
                                              networkOptions,
                                              consensusTree, ...) {
  perSetSimilarity <- mtd.mapply(branchEigengeneSimilarity,
    expr = multiExpr, networkOptions = networkOptions,
    MoreArgs = list(
      branch1 = branch1, branch2 = branch2,
      returnDissim = FALSE
    ), returnList = TRUE
  )
  consensusSimilarity <- simpleHierarchicalConsensusCalculation(
    individualData = perSetSimilarity,
    consensusTree = consensusTree
  )
  1 - consensusSimilarity
}
# ==========================================================================================================
#
# Branch split calculation
#
# ==========================================================================================================
# Caution: highly experimental!
# Define a function that's supposed to decide whether two given groups of expression data are part of a
# single module, or truly two independent modules.
# assumes no missing values for now.
# assumes all data is scaled to mean zero and equal variance.
# return: criterion is zero or near zero if it looks like a single module, and is near 1 if looks like two
# modules.
# Careful: in the interest of speedy execution, the function doesn't check arguments for validity. For
# example, it assumes that expr is already scaled to the same mean and variance, branch1 and branch2 are valid
# indices, nConsideredPCs does not exceed any of the dimensions of expr etc.
# Per-group histograms of `data` over one shared set of break points so the
# group counts are directly comparable.
# data: numeric vector (with `groups` giving group labels 1..nGroups), or a
#   list of per-group vectors (then `groups` is ignored and rebuilt; empty
#   groups are dropped).
# discardProp: up to this proportion of either group may be trimmed as
#   outliers, judged on the combined data.
# Returns the list of `hist` objects produced by tapply, one per group.
.histogramsWithCommonBreaks <- function(data, groups, discardProp = 0.08) {
  if (is.list(data)) {
    # Flatten the list form: drop empty groups, rebuild the group vector.
    groupLengths <- sapply(data, length)
    data <- data[groupLengths > 0]
    groupLengths <- sapply(data, length)
    nGroups <- length(groupLengths)
    groups <- rep(c(1:nGroups), groupLengths)
    data <- unlist(data)
  }
  if (discardProp > 0) {
    # Trim outliers defined with respect to the combined data, but never more
    # than discardProp of any single group.
    sizes <- table(groups)
    nAll <- length(data)
    ord <- order(data)
    sortedGroups <- groups[ord]
    withinGroupQuantile <- rep(0, nAll)
    nGroups <- length(sizes)
    for (g in 1:nGroups) {
      withinGroupQuantile[sortedGroups == g] <- ((1:sizes[g]) - 0.5) / sizes[g]
    }
    # Lowest and highest data values retained after trimming
    first <- data[ord[min(which(withinGroupQuantile > discardProp))]]
    last <- data[ord[max(which(withinGroupQuantile < 1 - discardProp))]]
    keep <- ((data >= first) & (data <= last))
    data <- data[keep]
    groups <- groups[keep]
  } else {
    last <- max(data, na.rm = TRUE)
    first <- min(data, na.rm = TRUE)
  }
  # Derive the bin width from the smaller group so it does not get splintered
  # into too many near-empty bins.
  sizes <- table(groups)
  smallerInd <- which.min(sizes)
  smallerSize <- sizes[smallerInd]
  nBins <- ceiling(5 + ifelse(smallerSize > 25, sqrt(smallerSize) - 4, 1))
  smaller <- data[groups == smallerInd]
  binSize <- (max(smaller) - min(smaller)) / nBins
  # Extend that bin width over the full retained range
  nAllBins <- ceiling((last - first) / binSize)
  breaks <- first + c(0:nAllBins) * (last - first) / nAllBins
  tapply(data, groups, hist, breaks = breaks, plot = FALSE)
}
# Experimental criterion for deciding whether two branches look like a single
# module or two separate modules. Genes of both branches are projected onto
# the leading right singular vectors; the linear combination that best
# predicts branch membership is found by regression; the histogram of that
# projection should be bimodal with a deep trough if the branches are truly
# two modules.
# Returns (if getDetails) a list with middleCounts, criterion, split and the
# histograms; otherwise just the split score. split is -1 when no separation
# is detectable, near 0 for one module and near 1 for two modules.
# Caution: highly experimental; assumes expr is scaled to mean 0 and equal
# variance, no missing values, valid indices — no argument checking is done.
branchSplit <- function(expr, branch1, branch2, discardProp = 0.05, minCentralProp = 0.75,
                        nConsideredPCs = 3, signed = FALSE, getDetails = TRUE, ...) {
  nGenes <- c(length(branch1), length(branch2))
  # Scale each branch by 1/sqrt(size) so both branches carry equal total
  # weight in the svd regardless of their sizes.
  combinedScaled <- cbind(expr[, branch1] / sqrt(length(branch1)), expr[, branch2] / sqrt(length(branch2)))
  # Branch membership indicator: 1 for branch1 genes, 2 for branch2 genes
  groups <- c(rep(1, nGenes[1]), rep(2, nGenes[2]))
  # get the combination of PCs that best approximates the groups vector
  svd <- svd(combinedScaled, nu = 0, nv = nConsideredPCs)
  # Undo the per-branch scaling on the gene-side singular vectors
  v2 <- svd$v * c(rep(sqrt(length(branch1)), length(branch1)), rep(sqrt(length(branch2)), length(branch2)))
  # In unsigned networks the sign of a loading is not meaningful; align all
  # genes with the first PC.
  if (!signed) v2 <- v2 * sign(v2[, 1])
  # Fitted values of membership regressed on the PC loadings: the projection
  # along which the two branches are best separated.
  cor2 <- predict(lm(groups ~ ., data = as.data.frame(v2)))
  # Get the histograms of the projections in both groups with common binning;
  # up to discardProp of either group is discarded as outliers.
  h <- .histogramsWithCommonBreaks(cor2, groups, discardProp)
  # Normalize counts jointly so the two histograms remain comparable.
  maxAll <- max(c(h[[1]]$counts, h[[2]]$counts))
  h[[1]]$counts <- h[[1]]$counts / maxAll
  h[[2]]$counts <- h[[2]]$counts / maxAll
  max1 <- max(h[[1]]$counts)
  max2 <- max(h[[2]]$counts)
  minMax <- min(max1, max2)
  if (FALSE) {
    # Debugging plots (intentionally disabled)
    plot(h[[1]]$mids, h[[1]]$counts, type = "l")
    lines(h[[2]]$mids, h[[2]]$counts, type = "l", col = "red")
    lines(h[[2]]$mids, h[[1]]$counts + h[[2]]$counts, type = "l", col = "blue")
  }
  # Locate "central" bins: those whose scaled counts exceed a threshold.
  central <- list()
  central[[1]] <- h[[1]]$counts > minCentralProp * minMax
  central[[2]] <- h[[2]]$counts > minCentralProp * minMax
  # Do central bins overlap? If so, the branches cannot be separated.
  overlap <- (min(h[[1]]$mids[central[[1]]]) <= max(h[[2]]$mids[central[[2]]])) &
    (min(h[[2]]$mids[central[[2]]]) <= max(h[[1]]$mids[central[[1]]]))
  if (overlap) {
    # split = -1 flags "no detectable split"
    result <- list(middleCounts = NULL, criterion = minCentralProp, split = -1, histograms = h)
  } else {
    # Locate the region between the two central regions and check whether the gap is deep enough.
    if (min(h[[1]]$mids[central[[1]]]) > max(h[[2]]$mids[central[[2]]])) {
      left <- 2
      right <- 1
    } else {
      left <- 1
      right <- 2
    }
    leftEdge <- max(h[[left]]$mids[central[[left]]])
    rightEdge <- min(h[[right]]$mids[central[[right]]])
    middle <- ((h[[left]]$mids > leftEdge) & (h[[left]]$mids < rightEdge))
    nMiddle <- sum(middle)
    if (nMiddle == 0) {
      # Central regions are adjacent with no trough bins in between
      result <- list(middleCounts = NULL, criterion = minCentralProp, split = -1, histograms = h)
    } else {
      # Reference peak level: mean count inside each central region; use the
      # smaller of the two so a weak peak limits the score.
      refLevel1 <- mean(h[[1]]$counts [central[[1]]], na.rm = TRUE)
      refLevel2 <- mean(h[[2]]$counts [central[[2]]], na.rm = TRUE)
      peakRefLevel <- min(refLevel1, refLevel2)
      # Combined counts in the trough between the two central regions
      middleCounts <- h[[left]]$counts[middle] + h[[right]]$counts[middle]
      troughRefLevel <- mean(middleCounts, na.rm = TRUE)
      # Correction for very narrow troughs:
      meanCorrFactor <- sqrt(min(nMiddle + 1, 3) / min(nMiddle, 3))
      # =sqrt(2, 3/2, 1), for nMiddle=1,2,3,..
      result <- list(
        middleCounts = middleCounts,
        criterion = troughRefLevel * meanCorrFactor,
        # Relative trough depth: near 1 when the trough is (almost) empty
        split = (peakRefLevel - troughRefLevel * meanCorrFactor) / peakRefLevel,
        histograms = h
      )
    }
  }
  if (getDetails) result else result$split
}
# ==========================================================================================================
#
# Dissimilarity-based branch split
#
# ==========================================================================================================
# Column-wise mean of the entries of `mat` that fall inside the closed
# interval given by the corresponding row of `rangeMat`.
# mat: numeric matrix; rangeMat: matrix with one row per column of `mat`,
#   column 1 = lower bound, column 2 = upper bound.
# Returns a numeric vector with one mean per column of `mat` (NaN when no
# entry of a column falls inside its range).
# Fix: the original `for (c in 1:nc)` loop failed for zero-column input
# (1:0 yields c(1, 0)); vapply over seq_len handles nc == 0 correctly.
.meanInRange <- function(mat, rangeMat) {
  vapply(seq_len(ncol(mat)), function(j) {
    col <- mat[, j]
    mean(col[col >= rangeMat[j, 1] & col <= rangeMat[j, 2]], na.rm = TRUE)
  }, numeric(1))
}
# Inflate a quantile probability p for small groups so that at least
# `minNumber` elements fall below the quantile: groups smaller than
# minNumber / p get p scaled up proportionally, capped at 1.
# p: scalar probability; sizes: vector of group sizes.
# Returns a vector of adjusted probabilities, one per group.
.sizeDependentQuantile <- function(p, sizes, minNumber = 5) {
  referenceSize <- minNumber / p
  shrinkFactor <- pmin(rep(1, length(sizes)), sizes / referenceSize)
  adjusted <- p / shrinkFactor
  pmin(rep(1, length(sizes)), adjusted)
}
# Dissimilarity-based branch split. For each branch, the points of the OTHER
# branch that lie nearest to it are selected (via quantiles of cross-branch
# dissimilarities); the split score is the separation between within-branch
# and cross-branch mean distances for those border points.
# dissimMat: full dissimilarity matrix; branch1, branch2: disjoint index
#   vectors into its rows/columns.
# upperP: upper quantile defining "near" points (inflated for small branches
#   so at least minNumberInSplit points are used).
# Returns a non-negative separation score, or (if getDetails) a list with the
# score and the four sets of mean distances.
branchSplit.dissim <- function(dissimMat, branch1, branch2, upperP,
                               minNumberInSplit = 5, getDetails = FALSE, ...) {
  lowerP <- 0 # lower quantile bound; currently fixed at the minimum
  sizes <- c(length(branch1), length(branch2))
  # Size-adjusted upper quantile, one value per branch
  upperP <- .sizeDependentQuantile(upperP, sizes, minNumber = minNumberInSplit)
  # One column of probabilities (0, upperP) per branch
  multiP <- as.data.frame(rbind(rep(0, 2), upperP))
  # Cross-branch dissimilarities: [[1]] = branch2 rows vs branch1 columns
  # (distances of branch1 points from branch2), [[2]] the reverse.
  outDissim <- list(
    list(data = dissimMat[branch2, branch1]),
    list(data = dissimMat[branch1, branch2])
  )
  # Per-point quantile range of distances to the other branch...
  quantiles <- mtd.mapply(colQuantiles, outDissim, probs = multiP, MoreArgs = list(drop = FALSE))
  # ...and the mean distance within that range, per point
  averages <- mtd.mapply(.meanInRange, outDissim, quantiles)
  # Select the points whose mean cross-branch distance lies in the lowest
  # upperP fraction: these are the border points nearest the other branch.
  averageQuantiles <- mtd.mapply(quantile, averages, prob = multiP, MoreArgs = list(drop = FALSE))
  betweenQuantiles <- mtd.mapply(
    function(x, quantiles) {
      x >= quantiles[1] & x <= quantiles[2]
    },
    averages, averageQuantiles
  )
  # Distances of the selected border points: within their own branch
  # ([[1]], [[3]]) and to the other branch ([[2]], [[4]]).
  selectedDissim <- list(
    list(data = dissimMat[branch1, branch1[betweenQuantiles[[1]]$data]]),
    list(data = dissimMat[branch2, branch1[betweenQuantiles[[1]]$data]]),
    list(data = dissimMat[branch2, branch2[betweenQuantiles[[2]]$data]]),
    list(data = dissimMat[branch1, branch2[betweenQuantiles[[2]]$data]])
  )
  # Remove the points nearest to branch 2 from the distances in branch 1
  selectedDissim[[1]]$data[betweenQuantiles[[1]]$data, ] <- NA
  # ...and the points nearest to branch 1 from the distances in branch 2
  selectedDissim[[3]]$data[betweenQuantiles[[2]]$data, ] <- NA
  # Probability columns for the four matrices above (within, cross, within, cross)
  multiP.ext <- cbind(multiP, multiP[, c(2, 1)])
  selectedDissimQuantiles <- mtd.mapply(colQuantiles, selectedDissim,
    probs = multiP.ext,
    MoreArgs = list(drop = FALSE, na.rm = TRUE)
  )
  # Mean within-range distance per selected border point
  selectedAverages <- mtd.mapply(.meanInRange, selectedDissim, selectedDissimQuantiles)
  if (FALSE) {
    # Debugging plots (intentionally disabled)
    par(mfrow = c(1, 2))
    verboseBoxplot(c(selectedAverages[[1]]$data, selectedAverages[[2]]$data),
      c(
        rep("in", length(selectedAverages[[1]]$data)),
        rep("out", length(selectedAverages[[2]]$data))
      ),
      main = "branch 1",
      xlab = "", ylab = "mean distance"
    )
    verboseBoxplot(c(selectedAverages[[3]]$data, selectedAverages[[4]]$data),
      c(
        rep("in", length(selectedAverages[[3]]$data)),
        rep("out", length(selectedAverages[[4]]$data))
      ),
      main = "branch 2",
      xlab = "", ylab = "mean distance"
    )
  }
  # Separation of two samples: |difference of means| / (sum of sd's);
  # 0 when either sample is empty/degenerate, Inf when both have zero
  # variance but different means.
  separation <- function(x, y) {
    nx <- length(x)
    ny <- length(y)
    if (nx * ny == 0) {
      return(0)
    }
    mx <- mean(x, na.rm = TRUE)
    my <- mean(y, na.rm = TRUE)
    if (!is.finite(mx) | !is.finite(my)) {
      return(0)
    }
    # NOTE(review): asymmetric guards (nx > 1 vs ny > 0) — the ny > 0 branch
    # can yield NA for a single point, which the is.na check below absorbs;
    # likely a typo for ny > 1 but harmless. Confirm before changing.
    if (nx > 1) varx <- var(x, na.rm = TRUE) else varx <- 0
    if (ny > 0) vary <- var(y, na.rm = TRUE) else vary <- 0
    if (is.na(varx)) varx <- 0
    if (is.na(vary)) vary <- 0
    if (varx + vary == 0) {
      if (my == mx) {
        return(0)
      } else {
        return(Inf)
      }
    }
    out <- abs(my - mx) / (sqrt(varx) + sqrt(vary))
    if (is.na(out)) out <- 0
    out
  }
  # Separation of within vs. cross distances, evaluated from each branch
  separations <- c(
    separation(selectedAverages[[1]]$data, selectedAverages[[2]]$data),
    separation(selectedAverages[[3]]$data, selectedAverages[[4]]$data)
  )
  # The better-separated side decides; clamp NA/negative values to 0.
  out <- max(separations, na.rm = TRUE)
  if (is.na(out)) out <- 0
  if (out < 0) out <- 0
  if (getDetails) {
    return(list(
      split = out,
      distances = list(
        within1 = selectedAverages[[1]]$data,
        from1to2 = selectedAverages[[2]]$data,
        within2 = selectedAverages[[3]]$data,
        from2to1 = selectedAverages[[4]]$data
      )
    ))
  }
  out
}
# ========================================================================================================
#
# Branch dissimilarity based on a series of alternate branch labels
#
# ========================================================================================================
# this function measures the split of branch1 and branch2 based on alternate labels, typically derived from
# resampled or otherwise perturbed data (but could also be derived from an independent data set).
# Basic idea: if two branches are separate, their membership should be predictable from the alternate
# labels.
# This function takes the l-th stability labels, finds ones that overlap with both branches, and for each
# label calculates the contribution to similarity as
# r1 = sum(lab1==cl)/n1;
# r2 = sum(lab2==cl)/n2;
# sim = sim + min(r1, r2)
# This will penalize similarity of a small and large module if the large module is a composite of several
# branches, only few of which overlap with the small module.
# This method is invariant under splitting of alternate module as long as the branch to which the modules are
# assigned does not change. So in this sense the splitting settings in the resampling study shouldn't
# matter too much but to some degree they still do.
# stabilityLabels: a matrix of dimensions (nGenes) x (number of alternate labels)
# Branch dissimilarity from alternate (stability) labelings. For every
# labeling, each label occurring in both branches contributes
# min(fraction of branch1 with that label, fraction of branch2 with it) to
# the similarity; the result is 1 - averaged similarity. This penalizes
# similarity of a small and a large branch when the large branch is a
# composite of several stability modules, only few of which touch the small
# one.
# branch1, branch2: row index vectors into stabilityLabels.
# stabilityLabels: matrix of dimensions (nGenes) x (number of labelings).
# ignoreLabels: label values (e.g. 0 = unassigned) excluded from the overlap.
branchSplitFromStabilityLabels <- function(
    branch1, branch2,
    stabilityLabels, ignoreLabels = 0, ...) {
  nLabels <- ncol(stabilityLabels)
  size1 <- length(branch1)
  size2 <- length(branch2)
  similarity <- 0
  for (col in seq_len(nLabels)) {
    labels1 <- stabilityLabels[branch1, col]
    labels2 <- stabilityLabels[branch2, col]
    # Labels present in both branches, excluding the ignored ones
    shared <- setdiff(intersect(unique(labels1), unique(labels2)), ignoreLabels)
    for (level in shared) {
      frac1 <- sum(labels1 == level) / size1
      frac2 <- sum(labels2 == level) / size2
      similarity <- similarity + min(frac1, frac2)
    }
  }
  1 - similarity / nLabels
}
# Resurrect the old idea of prediction
# accuracy. For each overlap module, count the genes in the branch with which the module has smaller overlap
# and add it to the score for that branch. The final counts divided by number of genes on branch give a
# "indistinctness" score; take the larger of the the two indistinctness scores and call this the similarity.
# This method is still more or less invariant under splitting of the stability modules, as long as the
# splitting is random with respect to the two branches.
## Note that one could in principle run a chisq.test on the table of labels corresponding to branch1 and
## branch2 vs. stabilityLabels restricted to branch1 and branch2,
# The problem here is that small changes in stability labels could make a big difference in final
# (dis)similarity when one module is large and the other small. Assume a few of the stability labels cover
# small and a part of the large module; other stability labels cover the rest of the large module. If the
# common stability labels cover a bit more of large than small module, similarity will be high; if they
# switch more to the smaller module, similarity could be near zero.
# In summary, this function may be used for experiments but should not be used in production setting.
# Prediction-accuracy variant of the stability-label branch split. For each
# stability label shared by both branches, the genes in the branch with the
# SMALLER overlap are counted against that branch; the per-branch counts,
# normalized by branch size and number of labelings, give "indistinctness"
# scores. Similarity = min(1, 2 * larger indistinctness); returns 1 - sim.
# Experimental: sensitive to small label changes when branch sizes differ a
# lot — not recommended for production use (see comments above).
branchSplitFromStabilityLabels.prediction <- function(
    branch1, branch2,
    stabilityLabels, ignoreLabels = 0, ...) {
  nLabels <- ncol(stabilityLabels)
  size1 <- length(branch1)
  size2 <- length(branch2)
  score1 <- 0
  score2 <- 0
  for (col in seq_len(nLabels)) {
    labels1 <- stabilityLabels[branch1, col]
    labels2 <- stabilityLabels[branch2, col]
    shared <- setdiff(intersect(labels1, labels2), ignoreLabels)
    for (level in shared) {
      overlap1 <- sum(labels1 == level)
      overlap2 <- sum(labels2 == level)
      # Charge the branch with the smaller overlap
      if (overlap1 > overlap2) {
        score2 <- score2 + overlap2
      } else {
        score1 <- score1 + overlap1
      }
    }
  }
  indistinctness1 <- score1 / (size1 * nLabels)
  indistinctness2 <- score2 / (size2 * nLabels)
  1 - min(1, 2 * max(indistinctness1, indistinctness2))
}
# Third idea: for each branch, for each gene sum the fraction of the stability label (restricted to the two
# branches) that belongs to the branch. If this is relatively low, around 0.5, it means most elements are in
# non-discriminative stability labels.
# Individual-fraction variant of the stability-label branch split. For each
# gene, sum the fraction of its stability label (restricted to the two
# branches) that belongs to the gene's own branch (floored at 0.5); genes in
# labels unique to their branch count fully. The per-branch sums are mapped
# to a "distinctness" in [0, 1]; the returned dissimilarity is the smaller of
# the two distinctness values.
# Fixes: misspelled local variable `distinctenss2`; the leftover debug
# printFlush is now commented out, consistent with the sibling functions.
branchSplitFromStabilityLabels.individualFraction <- function(
    branch1, branch2,
    stabilityLabels, ignoreLabels = 0, ...) {
  nLabels <- ncol(stabilityLabels)
  n1 <- length(branch1)
  n2 <- length(branch2)
  s1 <- s2 <- 0
  for (l in seq_len(nLabels)) {
    lab1 <- stabilityLabels[branch1, l]
    lab2 <- stabilityLabels[branch2, l]
    commonLevels <- intersect(lab1, lab2)
    commonLevels <- setdiff(commonLevels, ignoreLabels)
    # Genes in labels NOT shared between the branches contribute 1 each;
    # start from the full branch size and subtract shared-label genes.
    s1.all <- n1
    s2.all <- n2
    for (cl in commonLevels) {
      o1 <- sum(lab1 == cl, na.rm = TRUE)
      o2 <- sum(lab2 == cl, na.rm = TRUE)
      o12 <- o1 + o2
      # Own-branch fraction of the shared label, floored at 0.5
      coef1 <- max(0.5, o1 / o12)
      coef2 <- max(0.5, o2 / o12)
      s2 <- s2 + o2 * coef2
      s1 <- s1 + o1 * coef1
      s1.all <- s1.all - o1
      s2.all <- s2.all - o2
    }
    s1 <- s1 + s1.all
    s2 <- s2 + s2.all
  }
  # Map mean per-gene fraction (range [0.5, 1]) to distinctness in [0, 1]
  distinctness1 <- 2 * s1 / (n1 * nLabels) - 1
  distinctness2 <- 2 * s2 / (n2 * nLabels) - 1
  dissim <- min(distinctness1, distinctness2)
  # printFlush(spaste("branchSplitFromStabilityLabels.individualFraction: returning ", dissim))
  dissim
}
# (Removed stray web-page embed boilerplate that was not valid R code:
# "Add the following code to your website. For more information on
# customizing the embed code, read Embedding Snippets.")