#======================================================================================================
#
# Aligned svd: svd plus aligning the result along the average expression of the input data.
#
#======================================================================================================
# CAUTION: this function assumes normalized x and no missing values.
# First left singular vector ("eigengene") of x, with its sign aligned so that
# it correlates positively with a weighted mean expression of the columns.
# If svd fails to converge, falls back to a correlation-weighted row mean.
# CAUTION: this function assumes normalized x and no missing values.
.alignedFirstPC = function(x, power = 6, verbose = 2, indent = 0)
{
  x = as.matrix(x);
  firstPC = try(svd(x, nu = 1, nv = 0)$u[, 1], silent = TRUE);
  if (!inherits(firstPC, 'try-error'))
  {
    # Align the sign of the PC with a weighted mean of the columns; columns
    # more strongly correlated with the PC get (much) higher weight.
    colWeights = abs(cor(x, firstPC))^power;
    weightedMean = rowWeightedMeans(x, colWeights, na.rm = TRUE);
    alignCov = cov(firstPC, weightedMean);
    if (!is.finite(alignCov)) alignCov = 0;
    if (alignCov < 0) firstPC = -firstPC;
  } else {
    # svd did not converge; substitute a correlation-weighted row mean.
    if (verbose > 0)
    {
      spaces = indentSpaces(indent);
      printFlush(paste(spaces, ".alignedFirstPC: FYI: svd failed, using a weighted mean instead.\n",
                       spaces, " ...svd reported:", firstPC))
    }
    roughMean = rowMeans(x, na.rm = TRUE);
    weightMat = matrix(abs(cor(x, roughMean, use = 'p'))^power, nrow(x), ncol(x), byrow = TRUE);
    # NOTE: scale() returns a 1-column matrix here (with attributes), whereas
    # the svd branch returns a plain vector — kept as in the original.
    firstPC = scale(rowMeans(x * weightMat, na.rm = TRUE));
  }
  firstPC;
}
#==========================================================================================================
#
# Branch eigengene split (dissimilarity) calculation
#
#==========================================================================================================
# Assumes correct input: expr is scaled to mean 0 and variance 1, branch1 and branch2 are numeric
# indices that have no overlap.
branchEigengeneDissim = function(expr, branch1, branch2,
                                 corFnc = cor, corOptions = list(use = 'p'),
                                 signed = TRUE, ...)
{
  # Represent each branch by the aligned first PC of its columns and measure
  # the dissimilarity of the two PCs: 1 - cor if signed, 1 - |cor| otherwise.
  corFnc = match.fun(corFnc);
  corOptions$x = .alignedFirstPC(expr[, branch1], verbose = 0);
  corOptions$y = .alignedFirstPC(expr[, branch2], verbose = 0);
  eigengeneCor = as.numeric(do.call(corFnc, corOptions));
  if (length(eigengeneCor) != 1)
    stop ("Internal error in branchEigengeneDissim: cor has length ", length(eigengeneCor));
  if (signed) 1-eigengeneCor else 1-abs(eigengeneCor);
}
# Consensus eigengene dissimilarity of two branches across multiple data sets:
# computes branchEigengeneDissim in each set of multiExpr and returns the
# (1 - consensusQuantile) quantile of the per-set dissimilarities.
# reproduceQuantileError = TRUE reproduces a historical bug in which the
# quantile was not flipped (consensusQuantile used directly).
mtd.branchEigengeneDissim = function(multiExpr, branch1, branch2,
                                     corFnc = cor, corOptions = list(use = 'p'),
                                     consensusQuantile = 0, signed = TRUE,
                                     reproduceQuantileError = FALSE, ...)
{
  # Per-set dissimilarities between the two branches
  setSplits.list = mtd.apply(multiExpr, branchEigengeneDissim,
                             branch1 = branch1, branch2 = branch2,
                             corFnc = corFnc, corOptions = corOptions,
                             signed = signed,
                             returnList = TRUE);
  setSplits = unlist(setSplits.list);
  # 'probs' spelled out in full (was 'prob', relying on partial argument matching)
  quantile(setSplits,
           probs = if (reproduceQuantileError) consensusQuantile else 1-consensusQuantile,
           na.rm = TRUE, names = FALSE);
}
# Similarity (correlation) of the aligned first PCs of two branches, using the
# correlation function and options stored in networkOptions.
# Returns 1 - similarity when returnDissim is TRUE, otherwise the similarity.
branchEigengeneSimilarity = function(expr, branch1, branch2,
                                     networkOptions,
                                     returnDissim = TRUE,
                                     ...)
{
  corFnc = match.fun(networkOptions$corFnc);
  cor0 = as.numeric(do.call(corFnc,
             c(list(x = .alignedFirstPC(expr[, branch1], verbose = 0),
                    y = .alignedFirstPC(expr[, branch2], verbose = 0)),
               networkOptions$corOptions)));
  # Fixed: error message previously named the wrong function (branchEigengeneDissim).
  if (length(cor0) != 1)
    stop ("Internal error in branchEigengeneSimilarity: cor has length ", length(cor0));
  # NOTE(review): grepl("signed", ...) also matches networkType "unsigned"
  # (substring match), so abs() is applied for every standard network type.
  # If signed networks are meant to retain the sign of the correlation, this
  # condition needs review; behavior intentionally left unchanged here.
  if (grepl("signed", networkOptions$networkType)) cor0 = abs(cor0);
  if (returnDissim) 1-cor0 else cor0;
}
# Hierarchical-consensus version of the branch eigengene dissimilarity:
# per-set eigengene similarities are combined via the supplied consensus tree
# and the consensus similarity is converted to a dissimilarity.
hierarchicalBranchEigengeneDissim = function(
    multiExpr,
    branch1, branch2,
    networkOptions,
    consensusTree, ...)
{
  # Similarity (not dissimilarity) in each data set
  perSetSimilarities = mtd.mapply(branchEigengeneSimilarity,
                                  expr = multiExpr,
                                  networkOptions = networkOptions,
                                  MoreArgs = list(branch1 = branch1,
                                                  branch2 = branch2,
                                                  returnDissim = FALSE),
                                  returnList = TRUE)
  # Consensus similarity -> dissimilarity
  1 - simpleHierarchicalConsensusCalculation(individualData = perSetSimilarities,
                                             consensusTree = consensusTree)
}
#==========================================================================================================
#
# Branch split calculation
#
#==========================================================================================================
# Caution: highly experimental!
# Define a function that's supposed to decide whether two given groups of expression data are part of a
# single module, or truly two independent modules.
# assumes no missing values for now.
# assumes all data is scaled to mean zero and equal variance.
# return: criterion is zero or near zero if it looks like a single module, and is near 1 if looks like two
# modules.
# Careful: in the interest of speedy execution, the function doesn't check arguments for validity. For
# example, it assumes that expr is already scaled to the same mean and variance, branch1 and branch2 are valid
# indices, nConsideredPCs does not exceed any of the dimensions of expr etc.
# Histograms of 'data' split by 'groups', computed on a single common set of
# bin breaks so the per-group counts are directly comparable.
# data: numeric vector, or a list of per-group numeric vectors (in which case
#       'groups' is rebuilt and the input 'groups' is ignored).
# groups: group membership for each element of 'data'; callers in this file
#       use labels 1 and 2. NOTE(review): the which.min-based "smaller group"
#       lookup below assumes labels are 1:nGroups — verify for other callers.
# discardProp: proportion of extreme values to drop from each tail, limited
#       per group (no more than discardProp of either group is discarded).
# Returns: list (one entry per group, from tapply) of 'hist' objects.
.histogramsWithCommonBreaks = function(data, groups, discardProp = 0.08)
{
# List input: drop empty groups, flatten, and rebuild the groups vector.
if (is.list(data))
{
lengths = sapply(data, length);
data = data[lengths>0];
lengths = sapply(data, length);
nGroups = length(lengths)
groups = rep( c(1:nGroups), lengths);
data = unlist(data);
}
if (discardProp > 0)
{
# get rid of outliers on either side - those are most likely not interesting.
# The code is somewhat involved because I want to get rid of outliers that are defined with respect to
# the combined data, but no more than a certain proportion of either of the groups.
sizes = table(groups);
nAll = length(data);
order = order(data)
ordGrp = groups[order];
# cs: within-group quantile rank of each point, laid out along the global
# sort order; a point survives if its rank lies in (discardProp, 1-discardProp).
cs = rep(0, nAll);
nGroups = length(sizes);
for (g in 1:nGroups)
cs[ordGrp==g] = ((1:sizes[g])-0.5)/sizes[g];
# Lowest kept value: first position (in sorted order) past the lower cut.
firstKeep = min(which(cs > discardProp));
first = data[order[firstKeep]];
# Analogous upper quantile
lastKeep = max(which(cs < 1-discardProp));
last = data[order[lastKeep]];
keep = ( (data >= first) & (data <= last) );
data = data[keep];
groups = groups[keep];
} else {
last = max(data, na.rm = TRUE);
first = min(data, na.rm = TRUE);
}
# Find the smaller of the two groups and define histogram bin size from the number of elements in that
# group; the aim is to prevent the group getting splintered into too many bins of the histogram.
sizes = table(groups);
smallerInd = which.min(sizes);
smallerSize = sizes[smallerInd];
# Bin count grows roughly as sqrt of the smaller group's size, with a floor of 6.
nBins = ceiling(5 + ifelse(smallerSize > 25, sqrt(smallerSize)-4, 1 ));
smaller = data[groups==smallerInd]
binSize = (max(smaller) - min(smaller))/nBins;
# Extend bins of (approximately) that size across the full retained range.
nAllBins = ceiling((last-first)/binSize);
breaks = first + c(0:nAllBins) * (last - first)/nAllBins
# One hist object per group, all sharing 'breaks'.
tapply(data, groups, hist, breaks = breaks, plot = FALSE);
}
# Decide whether two branches look like one module or two separate modules.
# Projects the combined (per-branch-size-scaled) expression data onto the
# combination of the first nConsideredPCs principal components that best
# predicts branch membership, then inspects the histogram of the projection:
# two well-separated peaks with a deep trough between them indicate two modules.
# Assumes expr is scaled (mean 0, equal variance) and branches do not overlap.
# Returns a list of details when getDetails is TRUE, otherwise just the 'split'
# statistic (near 0/negative for one module, near 1 for two modules).
branchSplit = function(expr, branch1, branch2, discardProp = 0.05, minCentralProp = 0.75,
nConsideredPCs = 3, signed = FALSE, getDetails = TRUE, ...)
{
nGenes = c(length(branch1), length(branch2));
#combined = cbind(expr[, branch1], expr[, branch2]);
# Scale each branch by 1/sqrt(size) so both branches contribute comparably to the svd.
combinedScaled = cbind(expr[, branch1]/sqrt(length(branch1)), expr[, branch2]/sqrt(length(branch2)));
groups = c(rep(1, nGenes[1]), rep(2, nGenes[2]) );
# get the combination of PCs that best approximates the groups vector
svd = svd(combinedScaled, nu = 0, nv = nConsideredPCs);
# Undo the 1/sqrt(size) column scaling on the right singular vectors.
v2 = svd$v * c( rep(sqrt(length(branch1)), length(branch1)), rep(sqrt(length(branch2)), length(branch2)));
#svd = svd(combinedScaled, nu = nConsideredPCs, nv = 0);
#v2 = cor(combinedScaled, svd$u);
# In unsigned networks the sign of a gene's loading is not meaningful; fold
# all loadings to agree in sign with the first PC.
if (!signed) v2 = v2 * sign(v2[, 1]);
# Fitted values of (group ~ PC loadings): the 1-D projection that best
# discriminates branch membership.
cor2 = predict(lm(groups~., data = as.data.frame(v2)));
# get the histograms of the projections in both groups, but make sure the binning is the same for both.
# get rid of outliers on either side - those are most likely not interesting.
# The code is somewhat involved because I want to get rid of outliers that are defined with respect to
# the combined data, but no more than a certain proportion of either of the groups.
h = .histogramsWithCommonBreaks(cor2, groups, discardProp);
# Normalize both histograms by the overall maximum count.
maxAll = max(c(h[[1]]$counts, h[[2]]$counts));
h[[1]]$counts = h[[1]]$counts/maxAll
h[[2]]$counts = h[[2]]$counts/maxAll;
max1 = max(h[[1]]$counts)
max2 = max(h[[2]]$counts)
minMax = min(max1, max2)
# Diagnostic plotting, disabled.
if (FALSE) {
plot(h[[1]]$mids, h[[1]]$counts, type = "l");
lines(h[[2]]$mids, h[[2]]$counts, type = "l", col = "red")
lines(h[[2]]$mids, h[[1]]$counts + h[[2]]$counts, type = "l", col = "blue")
}
# Locate "central" bins: those whose scaled counts exceed a threshold.
central = list();
central[[1]] = h[[1]]$counts > minCentralProp * minMax;
central[[2]] = h[[2]]$counts > minCentralProp * minMax;
# Do central bins overlap?
overlap = (min(h[[1]]$mids[central[[1]]]) <= max(h[[2]]$mids[central[[2]]])) &
(min(h[[2]]$mids[central[[2]]]) <= max(h[[1]]$mids[central[[1]]]));
if (overlap)
{
# Central regions overlap: looks like a single module; split = -1 flags "no split".
result = list(middleCounts = NULL, criterion = minCentralProp, split = -1, histograms = h);
} else {
# Locate the region between the two central regions and check whether the gap is deep enough.
# 'left'/'right' identify which branch's central region lies lower on the projection axis.
if (min(h[[1]]$mids[central[[1]]]) > max(h[[2]]$mids[central[[2]]]))
{
left = 2; right = 1;
} else {
left = 1; right = 2;
}
leftEdge = max(h[[left]]$mids[central[[left]]]);
rightEdge = min(h[[right]]$mids[central[[right]]]);
# Bins strictly between the two central regions.
middle = ( (h[[left]]$mids > leftEdge) & (h[[left]]$mids < rightEdge) );
nMiddle = sum(middle);
if (nMiddle==0)
{
# No gap bins at all: treat as unsplittable.
result = list(middleCounts = NULL, criterion = minCentralProp, split = -1, histograms = h);
} else {
# Reference level: 75th percentile of the central region of the smaller branch
#refLevel1 = quantile(h[[1]]$counts [ central[[1]] ], prob = 0.75);
#refLevel2 = quantile(h[[2]]$counts [ central[[2]] ], prob = 0.75)
# Peak reference: the lower of the two branches' mean central-bin heights.
refLevel1 = mean(h[[1]]$counts [ central[[1]] ], na.rm = TRUE);
refLevel2 = mean(h[[2]]$counts [ central[[2]] ], na.rm = TRUE)
peakRefLevel = min(refLevel1, refLevel2);
# Trough reference: mean combined height of the gap bins.
middleCounts = h[[left]]$counts[middle] + h[[right]]$counts[middle];
#troughRefLevel = quantile(middleCounts, prob = 0.25)
troughRefLevel = mean(middleCounts, na.rm = TRUE)
# Correction for the noisier trough estimate when very few gap bins exist.
meanCorrFactor = sqrt(min(nMiddle + 1, 3) / min(nMiddle, 3))
# =sqrt(2, 3/2, 1), for nMiddle=1,2,3,..
# split: relative depth of the trough below the peaks (near 1 = clean split).
result = list(middleCounts = middleCounts,
criterion = troughRefLevel * meanCorrFactor,
split = (peakRefLevel - troughRefLevel * meanCorrFactor)/peakRefLevel,
histograms = h);
}
}
if (getDetails) result else result$split
}
#==========================================================================================================
#
# Dissimilarity-based branch split
#
#==========================================================================================================
# For each column c of 'mat', the mean of the entries that fall within the
# closed interval [rangeMat[c, 1], rangeMat[c, 2]] (NAs removed).
# mat: numeric matrix; rangeMat: ncol(mat) x 2 matrix of per-column bounds.
# Returns a numeric vector with one mean per column.
.meanInRange = function(mat, rangeMat)
{
  vapply(seq_len(ncol(mat)), function(ci)
  {
    vals = mat[, ci];
    inRange = vals >= rangeMat[ci, 1] & vals <= rangeMat[ci, 2];
    mean(vals[inRange], na.rm = TRUE)
  }, numeric(1));
}
# Inflate the quantile probability p for small groups so that the selected
# tail always contains roughly at least 'minNumber' elements; the result is
# capped at 1. Returns one probability per entry of 'sizes'.
.sizeDependentQuantile = function(p, sizes, minNumber = 5)
{
  # Smallest group size for which the plain p already yields minNumber elements.
  referenceSize = minNumber/p;
  # Shrinkage factor < 1 for groups below the reference size.
  shrink = pmin(rep(1, length(sizes)), sizes/referenceSize);
  pmin(rep(1, length(sizes)), p/shrink);
}
# Dissimilarity-based branch split: measures how well-separated two branches
# are in a dissimilarity matrix. For each branch it selects the elements whose
# mean lower-tail dissimilarity to the *other* branch is lowest (the "border"
# elements), then compares their mean within-branch vs. cross-branch distances
# with a t-statistic-like separation measure. Returns the larger of the two
# branches' separations (>= 0), or a detail list when getDetails is TRUE.
# dissimMat: symmetric dissimilarity matrix; branch1/branch2: non-overlapping
# index vectors; upperP: tail proportion used for the lower-quantile averaging.
branchSplit.dissim = function(dissimMat, branch1, branch2, upperP,
minNumberInSplit = 5, getDetails = FALSE, ...)
{
lowerP = 0;
sizes = c(length(branch1), length(branch2));
# Enlarge upperP for small branches so at least ~minNumberInSplit elements are used.
upperP = .sizeDependentQuantile(upperP, sizes, minNumber = minNumberInSplit);
# Per-branch probability ranges [0, upperP] as columns of a data frame.
multiP = as.data.frame(rbind(rep(0, 2), upperP));
# Cross-branch dissimilarities: [[1]] = branch2 rows vs branch1 columns, [[2]] vice versa.
outDissim = list(list(data = dissimMat[branch2, branch1]),
list(data = dissimMat[branch1, branch2]));
quantiles = mtd.mapply(colQuantiles, outDissim, probs = multiP, MoreArgs = list(drop = FALSE));
# Mean of each element's lowest cross-branch dissimilarities (within the quantile range).
averages = mtd.mapply(.meanInRange, outDissim, quantiles);
averageQuantiles = mtd.mapply(quantile, averages, prob = multiP, MoreArgs = list(drop = FALSE));
# Border elements: those whose cross-branch average lies in the lower quantile range.
betweenQuantiles = mtd.mapply(function(x, quantiles) { x>=quantiles[1] & x <=quantiles[2]},
averages, averageQuantiles);
# Distances from each branch to its own / the other branch's border elements:
# [[1]] within-1, [[2]] 1->2, [[3]] within-2, [[4]] 2->1.
selectedDissim = list(list(data = dissimMat[branch1, branch1[betweenQuantiles[[1]]$data] ]),
list(data = dissimMat[branch2, branch1[ betweenQuantiles[[1]]$data] ]),
list(data = dissimMat[branch2, branch2[betweenQuantiles[[2]]$data]]),
list(data = dissimMat[branch1, branch2[betweenQuantiles[[2]]$data]]));
#n1 = length(branch1);
#m1 = sum(betweenQuantiles[[1]]$data);
#indexMat = cbind((1:n1)[betweenQuantiles[[1]]$data], 1:m1);
# Remove the points nearest to branch 2 from the distances in branch 1
selectedDissim[[1]]$data[ betweenQuantiles[[1]]$data, ] = NA;
#n2 = length(branch2);
#m2 = sum(betweenQuantiles[[2]]$data);
#indexMat = cbind((1:n2)[betweenQuantiles[[2]]$data], 1:m2);
selectedDissim[[3]]$data[ betweenQuantiles[[2]]$data, ] = NA;
# Quantile ranges for the 4 selected matrices (within-branch use multiP, cross use flipped columns).
multiP.ext = cbind(multiP, multiP[, c(2,1)]);
selectedDissimQuantiles = mtd.mapply(colQuantiles, selectedDissim, probs = multiP.ext,
MoreArgs = list( drop = FALSE, na.rm = TRUE));
# Mean lower-tail distance of each border element to / within each branch.
selectedAverages = mtd.mapply(.meanInRange, selectedDissim, selectedDissimQuantiles);
# Diagnostic plotting, disabled.
if (FALSE)
{
par(mfrow = c(1,2))
verboseBoxplot(c(selectedAverages[[1]]$data, selectedAverages[[2]]$data),
c( rep("in", length(selectedAverages[[1]]$data)),
rep("out", length(selectedAverages[[2]]$data))), main = "branch 1",
xlab = "", ylab = "mean distance")
verboseBoxplot(c(selectedAverages[[3]]$data, selectedAverages[[4]]$data),
c( rep("in", length(selectedAverages[[3]]$data)),
rep("out", length(selectedAverages[[4]]$data))), main = "branch 2",
xlab = "", ylab = "mean distance")
}
# Separation of two samples: |mean difference| / (sd(x) + sd(y)); 0 on degenerate input.
separation = function(x, y)
{
nx = length(x);
ny = length(y);
if (nx*ny==0) return(0);
mx = mean(x, na.rm = TRUE);
my = mean(y, na.rm = TRUE);
if (!is.finite(mx) | !is.finite(my)) return(0);
if (nx > 1) varx = var(x, na.rm = TRUE) else varx = 0;
# NOTE(review): the guard here is ny > 0 (not ny > 1, unlike varx above);
# for ny == 1 var() returns NA, which the is.na check below resets to 0,
# so the asymmetry appears harmless — confirm before changing.
if (ny > 0) vary = var(y, na.rm = TRUE) else vary = 0;
if (is.na(varx)) varx = 0;
if (is.na(vary)) vary = 0;
if (varx + vary == 0)
{
# Zero variance: identical means -> 0, otherwise infinitely separated.
if (my==mx) return(0) else return(Inf);
}
out = abs(my-mx)/(sqrt(varx) + sqrt(vary));
if (is.na(out)) out = 0;
out
}
# Separation of within- vs. cross-branch distances for each branch; take the larger.
separations = c(separation(selectedAverages[[1]]$data, selectedAverages[[2]]$data),
separation(selectedAverages[[3]]$data, selectedAverages[[4]]$data));
out = max(separations, na.rm = TRUE);
if (is.na(out)) out = 0;
if (out < 0) out = 0;
if (getDetails)
{
return(list(split = out,
distances = list(within1 = selectedAverages[[1]]$data,
from1to2 = selectedAverages[[2]]$data,
within2 = selectedAverages[[3]]$data,
from2to1 = selectedAverages[[4]]$data)));
}
out
}
#========================================================================================================
#
# Branch dissimilarity based on a series of alternate branch labels
#
#========================================================================================================
# this function measures the split of branch1 and branch2 based on alternate labels, typically derived from
# resampled or otherwise perturbed data (but could also be derived from an independent data set).
# Basic idea: if two branches are separate, their membership should be predictable from the alternate
# labels.
# This function takes the l-th stability labels, finds ones that overlap with both branches, and for each
# label calculates the contribution to similarity as
# r1 = sum(lab1==cl)/n1;
# r2 = sum(lab2==cl)/n2;
# sim = sim + min(r1, r2)
# This will penalize similarity of a small and large module if the large module is a composite of several
# branches, only few of which overlap with the small module.
# This method is invariant under splitting of alternate module as long as the branch to which the modules are
# assigned does not change. So in this sense the splitting settings in the resampling study shouldn't
# matter too much but to some degree they still do.
# stabilityLabels: a matrix of dimensions (nGenes) x (number of alternate labels)
# Branch split measured from alternate ("stability") labelings: for each
# clustering (column of stabilityLabels) and each label present in both
# branches, the similarity gains min(fraction of branch1 with that label,
# fraction of branch2 with that label). Returns 1 - meanSimilarity, so 1
# means the alternate labelings never mix the two branches.
# stabilityLabels: (nGenes) x (number of alternate labelings) matrix;
# ignoreLabels: label values (e.g. 0 = unassigned) excluded from the overlap.
branchSplitFromStabilityLabels = function(
   branch1, branch2,
   stabilityLabels, ignoreLabels = 0, ...)
{
  nLabels = ncol(stabilityLabels);
  n1 = length(branch1);
  n2 = length(branch2);
  similarity = 0;
  for (col in seq_len(nLabels))
  {
    labels1 = stabilityLabels[branch1, col];
    labels2 = stabilityLabels[branch2, col];
    # Labels appearing in both branches, excluding the ignored ones
    shared = setdiff(intersect(labels1, labels2), ignoreLabels);
    for (lab in shared)
    {
      frac1 = sum(labels1==lab)/n1;
      frac2 = sum(labels2==lab)/n2;
      # The smaller fractional overlap limits how much this label ties the branches together.
      similarity = similarity + min(frac1, frac2);
    }
  }
  1 - similarity/nLabels
}
# Resurrect the old idea of prediction
# accuracy. For each overlap module, count the genes in the branch with which the module has smaller overlap
# and add it to the score for that branch. The final counts divided by number of genes on branch give an
# "indistinctness" score; take the larger of the two indistinctness scores and call this the similarity.
# This method is still more or less invariant under splitting of the stability modules, as long as the
# splitting is random with respect to the two branches.
## Note that one could in principle run a chisq.test on the table of labels corresponding to branch1 and
## branch2 vs. stabilityLabels restricted to branch1 and branch2,
# The problem here is that small changes in stability labels could make a big difference in final
# (dis)similarity when one module is large and the other small. Assume a few of the stability labels cover
# small and a part of the large module; other stability labels cover the rest of the large module. If the
# common stability labels cover a bit more of large than small module, similarity will be high; if they
# switch more to the smaller module, similarity could be near zero.
# In summary, this function may be used for experiments but should not be used in production setting.
# Prediction-accuracy flavor of the stability-label split: each label shared
# by both branches contributes its overlap count to the branch it overlaps
# *less* (ties go to branch1). The per-branch totals, normalized by branch
# size and number of labelings, give "indistinctness" scores; the similarity
# is min(1, 2 * larger score) and the returned value is 1 - similarity.
# See the header comment above: experimental; not recommended for production.
branchSplitFromStabilityLabels.prediction = function(
   branch1, branch2,
   stabilityLabels, ignoreLabels = 0, ...)
{
  nLabels = ncol(stabilityLabels);
  n1 = length(branch1);
  n2 = length(branch2);
  score1 = 0; score2 = 0;
  for (col in seq_len(nLabels))
  {
    labels1 = stabilityLabels[branch1, col];
    labels2 = stabilityLabels[branch2, col];
    shared = setdiff(intersect(labels1, labels2), ignoreLabels);
    for (lab in shared)
    {
      overlap1 = sum(labels1==lab);
      overlap2 = sum(labels2==lab);
      # Credit the smaller overlap to that branch's indistinctness score.
      if (overlap1 > overlap2)
      {
        score2 = score2 + overlap2;
      } else {
        score1 = score1 + overlap1;
      }
    }
  }
  indistinct1 = score1/(n1 * nLabels);
  indistinct2 = score2/(n2 * nLabels);
  1 - min(1, 2*max(indistinct1, indistinct2))
}
# Third idea: for each branch, for each gene sum the fraction of the stability label (restricted to the two
# branches) that belongs to the branch. If this is relatively low, around 0.5, it means most elements are in
# non-discriminative stability labels.
# Individual-fraction flavor of the stability-label split: for each labeling,
# each element contributes the fraction of its (branch-restricted) stability
# label that lies in its own branch, floored at 0.5; elements whose label is
# not shared between branches contribute 1. Per-branch sums are rescaled to
# a "distinctness" in [0, 1] (0 = labels do not discriminate the branches);
# the returned dissimilarity is the smaller of the two distinctness values.
branchSplitFromStabilityLabels.individualFraction = function(
   branch1, branch2,
   stabilityLabels, ignoreLabels = 0, ...)
{
  nLabels = ncol(stabilityLabels);
  n1 = length(branch1);
  n2 = length(branch2);
  s1 = s2 = 0;
  for (l in 1:nLabels)
  {
    lab1 = stabilityLabels[branch1, l];
    lab2 = stabilityLabels[branch2, l];
    commonLevels = intersect(lab1, lab2);
    commonLevels = setdiff(commonLevels, ignoreLabels);
    # Elements in labels unique to one branch will contribute 1 each;
    # start from full counts and subtract the shared-label elements below.
    s1.all = n1; s2.all = n2;
    for (cl in commonLevels)
    {
      o1 = sum(lab1==cl);
      o2 = sum(lab2==cl);
      o12 = o1 + o2;
      # Own-branch fraction of the shared label, floored at 0.5.
      coef1 = max(0.5, o1/o12);
      coef2 = max(0.5, o2/o12);
      s2 = s2 + o2*coef2;
      s1 = s1 + o1*coef1;
      s1.all = s1.all - o1;
      s2.all = s2.all - o2;
    }
    s1 = s1 + s1.all;
    s2 = s2 + s2.all;
  }
  # Mean contribution per element is in [0.5, 1]; rescale to [0, 1].
  distinctness1 = 2*s1/(n1 * nLabels) - 1;
  distinctness2 = 2*s2/(n2 * nLabels) - 1;   # fixed typo: was 'distinctenss2'
  dissim = min(distinctness1, distinctness2)
  # Debug output disabled for consistency with the sibling functions above:
  #printFlush(spaste("branchSplitFromStabilityLabels.individualFraction: returning ", dissim))
  dissim
}
# (Removed stray website-embed boilerplate text that was not valid R code.)