# New multilevel specification of consensus: a hierarchical list.
# The consensus calculation needs to be general enough so it can be used for module merging (consensus
# eigengene network) as well as for calculation of consensus kMEs, and possibly for other stuff as well.
# Consensus specification for a single operation:
# - inputs: a list specifying the input of the consensus. This should be general enough to handle
# blockwise data but also not specific to adjacencies.
# The input should be a multiData structure, with each
# - calibrationMethod: currently one of "none", "simple quantile", "full quantile"
# - consensusQuantile
# - saveCalibratedIndividualTOMs (= FALSE)
# - calibratedIndividualTOMFilePattern (= "calibratedIndividualTOM-Set%s-Block%b.RData")
# The consensus calculation itself does not need information about correlation type etc; the consensus
# calculation is very general.
# For network analysis applications of the consensus: will also have to keep information about how the
# individual network adjacencies (TOMs) are to be created or were created - correlation type, network type,
# TOM type, correlation options etc.
# So we will keep 2 different types of information around:
# 1. network construction options. A simple list giving the necessary network construction options.
# 2. ConsensusOptions: This will also be a list giving the options but not holding the actual consensus
# inputs or outputs.
# outputs of network construction and consensus construction should be held in separate lists.
# Output of individualTOMs:
# - adjacency data, a list of blockwiseData instances
# - block information
# - network construction options, separately for each adjacency (network construction could be different)
# For consensusTOM:
# Inputs:
# - adjacency data: either from individual TOMs or from other consensus TOMs
# . note that constructing a complicated consensus manually (i.e., using consensus TOMs
# as inputs to higher-level consensus) is of limited use
# since consensus modules would need the full consensus tree anyway.
# - consensus tree
# Not really needed: block information
# outputs:
# - consensus adjacency data
# - copy of consensus options
# - other (diagnostic etc) information
# For consensus modules
# Inputs:
# - underlying expression data
# - optional consensus TOM
# - block information
# - network options for each expression data set
# - consensus tree
# Validate soft-thresholding power(s): every entry must be finite and lie in
# [1, 50]. Stops with an error otherwise; returns NULL invisibly on success.
# Uses short-circuiting || with any() instead of the original vectorized
# `|` / sum(x) > 0 idiom.
.checkPower <- function(power) {
  if (any(!is.finite(power)) || any(power < 1) || any(power > 50)) {
    stop("power must be between 1 and 50.")
  }
}
# ==========================================================================================================
#
# Defining a single consensus operation
#
# ==========================================================================================================
# Create a ConsensusTree object: a consensus specification consisting of
# consensus options, a list of inputs (which may themselves be ConsensusTree
# objects, giving a hierarchical specification), and an optional analysis name.
newConsensusTree <- function(consensusOptions = newConsensusOptions(),
                             inputs,
                             analysisName = NULL) {
  if (!inherits(consensusOptions, "ConsensusOptions")) {
    stop("'consensusOptions' must be of class ConsensusOptions.")
  }
  structure(
    list(
      consensusOptions = consensusOptions,
      inputs = inputs,
      analysisName = analysisName
    ),
    class = c("ConsensusTree", "list")
  )
}
# Collect the inputs of a consensus tree, descending recursively into nested
# ConsensusTree inputs.
# consensusTree: an object of class ConsensusTree whose 'inputs' is a list of
#   either plain inputs (e.g. set identifiers) or nested ConsensusTree objects.
# flatten: if TRUE (default), the result is flattened into a simple vector via
#   unlist(); if FALSE, the nesting structure of the tree is preserved.
# Bug fix: the recursive call previously ignored 'flatten', so nested trees
# were always flattened internally even when flatten = FALSE.
consensusTreeInputs <- function(consensusTree, flatten = TRUE) {
  out <- lapply(consensusTree$inputs, function(inp) {
    if (inherits(inp, "ConsensusTree")) {
      consensusTreeInputs(inp, flatten = flatten)
    } else {
      inp
    }
  })
  if (flatten) out <- unlist(out)
  out
}
# Create a ConsensusOptions object holding the settings for a single consensus
# calculation: calibration method, calibration sampling options, the consensus
# definition (quantile or weighted mean), and an analysis name used to keep
# file names of different analyses apart.
newConsensusOptions <- function(
    calibration = c("full quantile", "single quantile", "none"),
    # Simple quantile scaling options
    calibrationQuantile = 0.95,
    sampleForCalibration = TRUE,
    sampleForCalibrationFactor = 1000,
    # Consensus definition
    consensusQuantile = 0,
    useMean = FALSE,
    setWeights = NULL,
    # Name to prevent clashing of files
    analysisName = "") {
  calibration <- match.arg(calibration)
  # NULL setWeights pass this check: any() on a zero-length logical is FALSE.
  if (any(!is.finite(setWeights))) stop("Entries of 'setWeights' must all be finite.")
  if ((consensusQuantile < 0) | (consensusQuantile > 1)) {
    stop("'consensusQuantile' must be between 0 and 1.")
  }
  structure(
    list(
      calibration = calibration,
      calibrationQuantile = calibrationQuantile,
      sampleForCalibration = sampleForCalibration,
      sampleForCalibrationFactor = sampleForCalibrationFactor,
      consensusQuantile = consensusQuantile,
      useMean = useMean,
      setWeights = setWeights,
      analysisName = analysisName
    ),
    class = c("ConsensusOptions", "list")
  )
}
# Create a CorrelationOptions object describing how correlations are to be
# calculated.
# corType: "pearson" or "bicor" (biweight midcorrelation).
# maxPOutliers, pearsonFallback: bicor-specific options.
# quickCor: fraction of data used for a "quick" correlation approximation.
# cosineCorrelation: use the cosine variant of the correlation?
# nThreads: number of threads requested for correlation calculations.
# corFnc, corOptions: the correlation function name and the extra arguments
#   passed to it; defaults are derived from the options above (lazily, after
#   corType has been resolved by match.arg).
# Fixes relative to the previous version: removed duplicated validation of
# maxPOutliers/quickCor; pearsonFallback.code now uses the pmatch() result, so
# an abbreviated (but valid) 'pearsonFallback' no longer yields an NA code.
newCorrelationOptions <- function(
    corType = c("pearson", "bicor"),
    maxPOutliers = 0.05,
    quickCor = 0,
    pearsonFallback = "individual",
    cosineCorrelation = FALSE,
    nThreads = 0,
    corFnc = if (corType == "bicor") "bicor" else "cor",
    corOptions = c(
      list(
        use = "p",
        cosine = cosineCorrelation,
        quick = quickCor,
        nThreads = nThreads
      ),
      if (corType == "bicor") {
        list(
          maxPOutliers = maxPOutliers,
          pearsonFallback = pearsonFallback
        )
      } else {
        NULL
      }
    )) {
  if ((maxPOutliers < 0) | (maxPOutliers > 1)) stop("maxPOutliers must be between 0 and 1.")
  if (quickCor < 0) stop("quickCor must be positive.")
  if (nThreads < 0) stop("nThreads must be positive.")
  # Resolve corType before corFnc/corOptions defaults are (lazily) evaluated.
  corType <- match.arg(corType)
  fallback <- pmatch(pearsonFallback, .pearsonFallbacks)
  if (is.na(fallback)) {
    stop(paste(
      "Unrecognized 'pearsonFallback'. Recognized values are (unique abbreviations of)\n",
      paste(.pearsonFallbacks, collapse = ", ")
    ))
  }
  out <- list(
    corType = corType,
    maxPOutliers = maxPOutliers,
    quickCor = quickCor,
    pearsonFallback = pearsonFallback,
    # Use the pmatch() result; the previous exact match() returned NA for
    # abbreviated input that had just been validated above.
    pearsonFallback.code = fallback,
    cosineCorrelation = cosineCorrelation,
    corFnc = corFnc,
    corOptions = corOptions,
    corType.code = match(corType, .corTypes),
    nThreads = nThreads
  )
  class(out) <- c("CorrelationOptions", class(out))
  out
}
# Create a NetworkOptions object: correlation options plus adjacency and
# topological overlap (TOM) settings.
# correlationOptions: an object of class CorrelationOptions (see
#   newCorrelationOptions); its components are copied into the output.
# replaceMissingAdjacencies: should missing adjacencies be replaced?
# power: soft-thresholding power(s), checked to lie in [1, 50] when checkPower.
# networkType, TOMType, TOMDenom: network/TOM variants, resolved via match.arg.
# suppressTOMForZeroAdjacencies, useInternalMatrixAlgebra: flags passed through
#   to the compiled TOM calculation (semantics defined by the C code).
# Returns a list of class c("NetworkOptions", <class of correlationOptions>),
# so the result also inherits from CorrelationOptions.
# Fix: the class of 'correlationOptions' is now validated; previously an
# arbitrary list was silently accepted and produced a malformed object.
newNetworkOptions <- function(
    correlationOptions = newCorrelationOptions(),
    # Adjacency options
    replaceMissingAdjacencies = TRUE,
    power = 6,
    networkType = c("signed hybrid", "signed", "unsigned"),
    checkPower = TRUE,
    # Topological overlap options
    TOMType = c("signed", "unsigned", "none"),
    TOMDenom = c("mean", "min"),
    suppressTOMForZeroAdjacencies = FALSE,
    # Internal behavior options
    useInternalMatrixAlgebra = FALSE) {
  if (!inherits(correlationOptions, "CorrelationOptions")) {
    stop("'correlationOptions' must be of class CorrelationOptions.")
  }
  if (checkPower) .checkPower(power)
  networkType <- match.arg(networkType)
  TOMType <- match.arg(TOMType)
  TOMDenom <- match.arg(TOMDenom)
  out <- c(
    correlationOptions,
    list(
      replaceMissingAdjacencies = replaceMissingAdjacencies,
      power = power,
      networkType = networkType,
      TOMType = TOMType,
      TOMDenom = TOMDenom,
      # Numeric codes used by the compiled routines.
      networkType.code = match(networkType, .networkTypes),
      TOMType.code = match(TOMType, .TOMTypes),
      TOMDenom.code = match(TOMDenom, .TOMDenoms),
      suppressTOMForZeroAdjacencies = suppressTOMForZeroAdjacencies,
      useInternalMatrixAlgebra = useInternalMatrixAlgebra
    )
  )
  # c() drops attributes, so set the class explicitly; inheriting the class of
  # correlationOptions makes the result a CorrelationOptions object as well.
  class(out) <- c("NetworkOptions", class(correlationOptions))
  out
}
# ====================================================================================================
#
# cor, network, and consensus calculations
#
# ====================================================================================================
# Calculate a correlation using the function and extra arguments recorded in
# 'correlationOptions' (class CorrelationOptions). x, y and the optional
# weights are forwarded verbatim to the correlation function named by
# correlationOptions$corFnc, together with correlationOptions$corOptions.
.corCalculation <- function(x, y = NULL, weights.x = NULL, weights.y = NULL, correlationOptions) {
  if (!inherits(correlationOptions, "CorrelationOptions")) {
    stop(".corCalculation: 'correlationOptions' does not have correct type.")
  }
  corArgs <- c(
    list(x = x, y = y, weights.x = weights.x, weights.y = weights.y),
    correlationOptions$corOptions
  )
  do.call(match.fun(correlationOptions$corFnc), corArgs)
}
# network calculation. Returns the resulting topological overlap or adjacency matrix.
# Network (TOM or adjacency) calculation for a single data set.
# data: numeric matrix-like object; data frames are converted to matrices.
# networkOptions: an object of class NetworkOptions (see newNetworkOptions).
# weights: optional observation weights for 'data'; validated (not scaled)
#   before being passed on to the compiled code.
# verbose, indent: diagnostics; the compiled routine is called with verbosity
#   reduced by 1 and indentation increased by 2.
# Returns the matrix produced by the compiled WGCNA routine 'tomSimilarity_call'.
.networkCalculation <- function(data, networkOptions, weights = NULL,
                                verbose = 1, indent = 0) {
  if (is.data.frame(data)) data <- as.matrix(data)
  if (!inherits(networkOptions, "NetworkOptions")) {
    stop(".networkCalculation: 'networkOptions' does not have correct type.")
  }
  # Validate the weights against the data; the return value is not used here.
  .checkAndScaleWeights(weights, data, scaleByMax = FALSE)
  callVerb <- max(0, verbose - 1)
  callInd <- indent + 2
  # The *.code components are 1-based R indices; the C code expects 0-based.
  CcorType <- networkOptions$corType.code - 1
  CnetworkType <- networkOptions$networkType.code - 1
  CTOMType <- networkOptions$TOMType.code - 1
  CTOMDenom <- networkOptions$TOMDenom.code - 1
  warn <- as.integer(0)
  # For now return the full matrix; eventually we may return just the dissimilarity.
  # To make full use of the lower triangle space saving we'd have to modify the underlying C code
  # dramatically, otherwise it will still need to allocate the full matrix for the matrix multiplication.
  # NOTE(review): min(1, networkOptions$nThreads) caps the thread count passed
  # to the C routine at 1 (and passes 0 when nThreads is 0); if multi-threading
  # was intended here this looks like it should be max(1, nThreads) -- confirm
  # against the C routine's expectations.
  .Call("tomSimilarity_call", data, weights,
    as.integer(CcorType), as.integer(CnetworkType),
    as.double(networkOptions$power), as.integer(CTOMType),
    as.integer(CTOMDenom),
    as.double(networkOptions$maxPOutliers),
    as.double(networkOptions$quickCor),
    as.integer(networkOptions$pearsonFallback.code),
    as.integer(networkOptions$cosineCorrelation),
    as.integer(networkOptions$replaceMissingAdjacencies),
    as.integer(networkOptions$suppressTOMForZeroAdjacencies),
    as.integer(networkOptions$useInternalMatrixAlgebra),
    warn, as.integer(min(1, networkOptions$nThreads)),
    as.integer(callVerb), as.integer(callInd),
    PACKAGE = "WGCNA"
  )
}
# the following is contained in consensusOptions:
# out = list( calibration = calibration,
# calibrationQuantile = calibrationQuantile,
# sampleForCalibration = sampleForCalibration,
# sampleForCalibrationFactor = sampleForCalibrationFactor,
# consensusQuantile = consensusQuantile,
# useMean = useMean,
# setWeights = setWeights);
# ==============================================================================================
#
# general utility functions
#
# ==============================================================================================
# Try to guess whether disk cache should be used
# this should work for both multiData as well as for simple lists of arrays.
# Heuristic for whether the disk cache should be used: TRUE when the largest
# block does not fit into a single chunk. Works for multiData structures as
# well as simple lists of arrays; nSets/nGenes defaults are only evaluated
# when actually needed (chunkSize NULL, or no blocks given, respectively).
.useDiskCache <- function(multiExpr, blocks = NULL, chunkSize = NULL,
                          nSets = if (isMultiData(multiExpr)) length(multiExpr) else 1,
                          nGenes = checkSets(multiExpr)$nGenes) {
  if (is.null(chunkSize)) chunkSize <- as.integer(.largestBlockSize / (2 * nSets))
  blockLengths <- if (length(blocks) == 0) nGenes else as.numeric(table(blocks))
  max(blockLengths) > chunkSize
}
# Return dim(x) for arrays/matrices, or length(x) for dimensionless vectors.
.dimensions <- function(x) {
  d <- dim(x)
  if (is.null(d)) length(x) else d
}
# Prepend element 'c0' to list 'lst', returning an unnamed list.
# Replaces the original element-by-element copy loop with a single c() call;
# unname() keeps the historical behavior of dropping any element names.
.shiftList <- function(c0, lst) {
  if (length(lst) == 0) {
    return(list(c0))
  }
  c(list(c0), unname(lst))
}
# Check that all components of 'dataList' are numeric, non-empty, and have
# identical dimensions; return the dimnames of the first component that has
# any (NULL if none do).
# Fixes: the original `for (p in 1:nPars)` iterated over c(1, 0) for an empty
# list, causing a spurious subscript error (seq_along handles this correctly);
# the local variable `dim` shadowed base::dim.
.checkListDimConsistencyAndGetDimnames <- function(dataList) {
  dn <- NULL
  refDim <- NULL
  for (p in seq_along(dataList)) {
    comp <- dataList[[p]]
    if (mode(comp) != "numeric") {
      stop(paste("Argument number", p, " is not numeric."))
    }
    if (p == 1) {
      # Dimensions of the first component are the reference.
      refDim <- .dimensions(comp)
    } else if (!isTRUE(all.equal(.dimensions(comp), refDim))) {
      stop("Argument dimensions are not consistent.")
    }
    if (prod(refDim) == 0) stop(paste("Argument has zero dimension."))
    if (is.null(dn)) dn <- dimnames(comp)
  }
  dn
}
# multiData variant of .checkListDimConsistencyAndGetDimnames: each component
# of 'mtd' is a list with a 'data' element. Checks that all data elements are
# numeric, non-empty, and have identical dimensions; returns the dimnames of
# the first data element that has any (NULL if none do).
# Fixes: `for (p in 1:nPars)` iterated over c(1, 0) for an empty structure
# (seq_along handles this correctly); local `dim` shadowed base::dim.
.mtd.checkDimConsistencyAndGetDimnames <- function(mtd) {
  dn <- NULL
  refDim <- NULL
  for (p in seq_along(mtd)) {
    comp <- mtd[[p]]$data
    if (mode(comp) != "numeric") {
      stop(paste("Argument number", p, " is not numeric."))
    }
    if (p == 1) {
      # Dimensions of the first data element are the reference.
      refDim <- .dimensions(comp)
    } else if (!isTRUE(all.equal(.dimensions(comp), refDim))) {
      stop("Argument dimensions are not consistent.")
    }
    if (prod(refDim) == 0) stop(paste("Argument has zero dimension."))
    if (is.null(dn)) dn <- dimnames(comp)
  }
  dn
}
# ==============================================================================================
#
# general utility functions for working with disk cache
#
# ==============================================================================================
# Split 'data' (a vector) into chunks of at most 'chunkSize' elements and save
# each chunk (as an object named 'temp') into its own file.
# fileBase: pattern for tempfile() when fileNames is NULL (one base, length 1).
# cacheDir: directory in which chunk files are created.
# fileNames: optional pre-determined file names, one per chunk.
# Returns list(files = <chunk file names>, chunkLengths = <elements per chunk>).
# Fixes: `for (c in 1:nChunks)` iterated over c(1, 0) when data was empty
# (seq_len handles nChunks == 0 correctly); rm(temp) errored when no chunk was
# written; the loop variable `c` shadowed base::c.
.saveChunks <- function(data, chunkSize, fileBase, cacheDir, fileNames = NULL) {
  ld <- length(data)
  nChunks <- ceiling(ld / chunkSize)
  if (is.null(fileNames)) {
    if (length(fileBase) != 1) {
      stop("Internal error: length of 'fileBase' must be 1.")
    }
    fileNames <- rep("", nChunks)
    x <- 1
    for (chunk in seq_len(nChunks)) {
      fileNames[chunk] <- tempfile(pattern = fileBase, tmpdir = cacheDir)
      # Saving a placeholder effectively reserves the file name.
      save(x, file = fileNames[chunk])
    }
  } else if (length(fileNames) != nChunks) {
    stop("Internal error: length of 'fileNames' must equal the number of chunks.")
  }
  chunkLengths <- rep(0, nChunks)
  start <- 1
  for (chunk in seq_len(nChunks)) {
    end <- min(start + chunkSize - 1, ld)
    chunkLengths[chunk] <- end - start + 1
    temp <- data[start:end]
    save(temp, file = fileNames[chunk])
    start <- end + 1
  }
  if (nChunks > 0) rm(temp) # gc();
  list(files = fileNames, chunkLengths = chunkLengths)
}
# Load an .RData file into a fresh environment and return its contents as a
# named list, avoiding any pollution of the caller's workspace.
.loadAsList <- function(file) {
  sandbox <- new.env()
  load(file = file, envir = sandbox)
  as.list(sandbox)
}
# Load the first object stored in 'file', optionally verifying the name under
# which it was saved ('name') and its length ('size'). Stops on a mismatch.
.loadObject <- function(file, name = NULL, size = NULL) {
  contents <- .loadAsList(file)
  if (!is.null(name) && (names(contents)[1] != name)) {
    stop("File ", file, " does not contain object '", name, "'.")
  }
  obj <- contents[[1]]
  if (!is.null(size) && (length(obj) != size)) {
    stop("Object '", name, "' does not have the correct length.")
  }
  obj
}
# Interpret a plain numeric vector as the lower triangle of a 'dist' object.
# A vector of length n corresponds to n1 objects where n = n1 * (n1 - 1) / 2;
# solving the quadratic gives n1 = (1 + sqrt(1 + 8 * n)) / 2, which must be a
# whole number for the input to be a valid distance structure.
.vector2dist <- function(x) {
  len <- length(x)
  nObjects <- (1 + sqrt(1 + 8 * len)) / 2
  if (floor(nObjects) != nObjects) stop("Input length not consistent with a distance structure.")
  attributes(x) <- list(Size = as.integer(nObjects), Diag = FALSE, Upper = FALSE)
  class(x) <- "dist"
  x
}
# Create a 'dist' structure for 'nObjects' objects with every entry set to
# 'fill' (default 0).
.emptyDist <- function(nObjects, fill = 0) {
  nEntries <- nObjects * (nObjects - 1) / 2
  .vector2dist(rep(fill, nEntries))
}
# Delete each existing file among 'files'; entries that do not exist are
# silently skipped. Always returns NULL.
.checkAndDelete <- function(files) {
  for (f in as.character(files)) {
    if (file.exists(f)) file.remove(f)
  }
  NULL
}
# Ordering permutation of a numeric vector, computed by the compiled WGCNA
# routine 'qorder'. Input is coerced to numeric before the call.
.qorder <- function(data) {
  data <- as.numeric(data)
  .Call("qorder", data, PACKAGE = "WGCNA")
}
# Actual consensus calculation distilled into one function. data is assumed to have sets in columns
# and samples/observations/whatever in rows. setWeightMat should be a matrix of dimensions (nSets, 1)
# and be normalized to sum=1.
# Actual consensus calculation distilled into one function. data is assumed to have sets in columns
# and samples/observations/whatever in rows. setWeightMat should be a matrix of dimensions (nSets, 1)
# and be normalized to sum=1.
# useMean: if TRUE, the consensus is the weighted mean across sets; otherwise
#   it is the parallel minimum (consensusQuantile == 0) or the given quantile
#   across sets.
# Returns a list with components:
#   consensus   - consensus values, one per row of 'data';
#   originCount - for each set, the number of rows in which that set's value
#                 lies at or below the consensus (diagnostic of which set
#                 "drives" the consensus).
.consensusCalculation.base <- function(data, useMean, setWeightMat, consensusQuantile) {
  nSets <- ncol(data)
  if (nSets == 1) {
    # Single set: the consensus is the data itself.
    out.list <- list(consensus = as.vector(data))
  } else {
    if (useMean) {
      if (any(is.na(data))) {
        # Weighted mean over non-missing entries only: zero out NAs in the
        # numerator and divide by the weight mass of the finite entries.
        # (%*% binds tighter than /, so this is (data %*% w) / (finite %*% w).)
        finiteMat <- 1 - is.na(data)
        data[is.na(data)] <- 0
        out <- data %*% setWeightMat / finiteMat %*% setWeightMat
      } else {
        out <- data %*% setWeightMat
      }
      out.list <- list(consensus = out)
    } else if (consensusQuantile == 0) {
      # Quantile 0 is the row-wise minimum; the compiled routine also reports
      # which set attains it (unused here).
      whichmin <- .Call("minWhich_call", data, 1L, PACKAGE = "WGCNA")
      out.list <- list(consensus = whichmin$min)
    } else {
      # rowQuantileC: compiled row-wise quantile (WGCNA helper).
      out.list <- list(consensus = rowQuantileC(data, p = consensusQuantile))
    }
  }
  if (is.null(out.list$originCount)) {
    out.list$originCount <- apply(data, 2, function(x) sum(x <= out.list$consensus, na.rm = TRUE))
    names(out.list$originCount) <- 1:nSets
  }
  out.list
}
# List-based variant of .consensusCalculation.base: 'dataList' holds one
# numeric vector per set (all of the same length).
# useMean: if TRUE, consensus is the weighted mean across sets (via the WGCNA
#   helper pmean with 'setWeights'); otherwise the parallel minimum
#   (consensusQuantile == 0) or the given quantile across sets.
# Returns a list with 'consensus' and the per-set 'originCount' diagnostic
# (number of entries at or below the consensus), as in the matrix version.
.consensusCalculation.base.FromList <- function(dataList, useMean, setWeights, consensusQuantile) {
  nSets <- length(dataList)
  if (nSets == 1) {
    # Single set: the consensus is the data itself.
    out.list <- list(consensus = dataList[[1]])
  } else {
    if (useMean) {
      out.list <- list(consensus = pmean(dataList, setWeights))
    } else if (consensusQuantile == 0) {
      # Quantile 0 is the parallel minimum; pminWhich.fromList also reports
      # which set attains it (unused here).
      whichmin <- pminWhich.fromList(dataList)
      out.list <- list(consensus = whichmin$min)
    } else {
      out.list <- list(consensus = pquantile.fromList(dataList, prob = consensusQuantile))
    }
  }
  if (is.null(out.list$originCount)) {
    out.list$originCount <- sapply(dataList, function(x) sum(x <= out.list$consensus, na.rm = TRUE))
    names(out.list$originCount) <- 1:nSets
  }
  out.list
}
# ==============================================================================================
#
# utility functions for working with multiple blockwise adjacencies.
#
# ==============================================================================================
# Package the block assignment and the good-samples/good-genes filtering into a
# BlockInformation object.
# blocks: block label for every gene (including "bad" genes).
# goodSamplesAndGenes: result of the goodSamplesGenes-style check; its
#   'goodGenes' component indicates which genes passed filtering.
# Returns a list with the original block labels, per-block gene indices, the
# filtering information, the number of good genes, and the block labels
# restricted to good genes.
newBlockInformation <- function(blocks,
                                goodSamplesAndGenes) {
  blockGenes <- tapply(seq_along(blocks), blocks, identity)
  names(blockGenes) <- sort(unique(blocks))
  goodGenes <- goodSamplesAndGenes$goodGenes
  structure(
    list(
      blocks = blocks,
      blockGenes = blockGenes,
      goodSamplesAndGenes = goodSamplesAndGenes,
      nGGenes = sum(goodGenes),
      gBlocks = blocks[goodGenes]
    ),
    class = c("BlockInformation", "list")
  )
}
# =======================================================================================================
#
# individualTOMs
# This is essentially a re-badged blockwiseIndividualTOMs with a different input and output format.
#
# =======================================================================================================
# The following is contained in networkOptions:
# corType = corType,
# maxPOutliers = maxPOutliers,
# quickCor = quickCor,
# pearsonFallback = pearsonFallback,
# cosineCorrelation = cosineCorrelation,
# corFnc = corFnc,
# corOptions = corOptions,
# corType.code = match(corType, .corTypes),
# Adjacency options
# replaceMissingAdjacencies = TRUE,
# power = 6,
# networkType = c("signed hybrid", "signed", "unsigned"),
# Topological overlap options
# TOMType = c("signed", "unsigned"),
# TOMDenom = c("mean", "min"))
# Calculate individual (per data set) topological overlap matrices, blockwise.
# multiExpr: expression data, either in multiData format or as a single
#   data frame/matrix (which is wrapped into multiData internally).
# multiWeights: optional observation weights matching multiExpr.
# multiExpr.imputed: optional imputed copy of multiExpr, used only for the
#   pre-clustering that determines blocks.
# checkMissingData: filter out genes/samples with too many missing values?
# blocks: optional pre-defined block assignment (one label per gene); if NULL,
#   blocks are determined by pre-clustering when there are more than
#   maxBlockSize good genes.
# networkOptions: a single NetworkOptions object (replicated for all sets) or
#   a multiData structure of NetworkOptions, one per set.
# saveTOMs, individualTOMFileNames: whether/where to save the per-set,
#   per-block TOMs (file names must be unique across set/block combinations).
# Returns a list with the blockwise adjacencies, set names, number of sets,
# block information (class BlockInformation) and the per-set network options.
individualTOMs <- function(
    multiExpr,
    multiWeights = NULL,
    multiExpr.imputed = NULL, ## Optional, useful for pre-clustering if preclustering is needed.
    # Data checking options
    checkMissingData = TRUE,
    # Blocking options
    blocks = NULL,
    maxBlockSize = 5000,
    blockSizePenaltyPower = 5,
    nPreclusteringCenters = NULL,
    randomSeed = 12345,
    # Network construction options. This can be a single object of class NetworkOptions, or a multiData
    # structure of NetworkOptions objects, one per element of multiExpr.
    networkOptions,
    # Save individual TOMs? This is equivalent to using external = TRUE in blockwiseData
    saveTOMs = TRUE,
    individualTOMFileNames = "individualTOM-Set%s-Block%b.RData",
    # Behaviour options
    collectGarbage = TRUE,
    verbose = 2, indent = 0) {
  spaces <- indentSpaces(indent)
  # Determine whether the input is already a valid multiData structure; if
  # not, wrap it (and remember to unwrap the result at the end).
  dataSize <- checkSets(multiExpr, checkStructure = TRUE)
  if (dataSize$structureOK) {
    nSets <- dataSize$nSets
    nGenes <- dataSize$nGenes
    multiFormat <- TRUE
  } else {
    multiExpr <- multiData(multiExpr)
    if (!is.null(multiWeights)) multiWeights <- multiData(multiWeights)
    nSets <- dataSize$nSets
    nGenes <- dataSize$nGenes
    multiFormat <- FALSE
  }
  # Validate weights against the expression data (return value not needed).
  .checkAndScaleMultiWeights(multiWeights, multiExpr, scaleByMax = FALSE)
  # A single NetworkOptions object is replicated for every set; otherwise each
  # element of the multiData structure must be a NetworkOptions object.
  if (inherits(networkOptions, "NetworkOptions")) {
    networkOptions <- list2multiData(.listRep(networkOptions, nSets))
  } else {
    if (!all(mtd.apply(networkOptions, inherits, "NetworkOptions", mdaSimplify = TRUE))) {
      stop(
        "'networkOptions' must be of class 'NetworkOptions' or a multiData structure\n",
        " of objects of the class.\n",
        " See newNetworkOptions for creating valid network options."
      )
    }
  }
  if (is.null(names(multiExpr))) {
    names(multiExpr) <- spaste("Set", 1:nSets)
  }
  # Make the calculation reproducible, restoring any pre-existing seed on exit.
  if (!is.null(randomSeed)) {
    if (exists(".Random.seed")) {
      savedSeed <- .Random.seed
      on.exit(.Random.seed <<- savedSeed)
    }
    set.seed(randomSeed)
  }
  # if (maxBlockSize >= floor(sqrt(2^31)) )
  # stop("'maxBlockSize must be less than ", floor(sqrt(2^31)), ". Please decrease it and try again.")
  if (!is.null(blocks) && (length(blocks) != nGenes)) {
    stop("Input error: length of 'blocks' must equal number of genes in 'multiExpr'.")
  }
  if (verbose > 0) {
    printFlush(paste(spaces, "Calculating topological overlaps block-wise from all genes"))
  }
  nSamples <- dataSize$nSamples
  # Check data for genes and samples that have too many missing values
  # Check that multiExpr has valid (mtd)column names. If column names are missing, generate them.
  colIDs <- mtd.colnames(multiExpr)
  if (is.null(colIDs)) colIDs <- c(1:dataSize$nGenes)
  if (checkMissingData) {
    gsg <- goodSamplesGenesMS(multiExpr,
      multiWeights = multiWeights,
      verbose = verbose - 1, indent = indent + 1
    )
    if (!gsg$allOK) {
      # Restrict all inputs (and the column IDs) to good samples/genes.
      multiExpr <- mtd.subset(multiExpr, gsg$goodSamples, gsg$goodGenes)
      if (!is.null(multiWeights)) {
        multiWeights <- mtd.subset(multiWeights, gsg$goodSamples, gsg$goodGenes)
      }
      if (!is.null(multiExpr.imputed)) {
        multiExpr.imputed <- mtd.subset(multiExpr.imputed, gsg$goodSamples, gsg$goodGenes)
      }
      colIDs <- colIDs[gsg$goodGenes]
    }
  } else {
    # No filtering requested: mark everything as good.
    gsg <- list(goodGenes = rep(TRUE, nGenes), goodSamples = lapply(nSamples, function(n) rep(TRUE, n)))
    gsg$allOK <- TRUE
  }
  nGGenes <- sum(gsg$goodGenes)
  nGSamples <- rep(0, nSets)
  for (set in 1:nSets) nGSamples[set] <- sum(gsg$goodSamples[[set]])
  # Determine blocks: either by pre-clustering (when too many good genes for a
  # single block) or trivially (one block), unless supplied by the caller.
  if (is.null(blocks)) {
    if (nGGenes > maxBlockSize) {
      if (verbose > 1) printFlush(paste(spaces, "....pre-clustering genes to determine blocks.."))
      clustering <- consensusProjectiveKMeans(
        if (!is.null(multiExpr.imputed)) multiExpr.imputed else multiExpr,
        preferredSize = maxBlockSize,
        sizePenaltyPower = blockSizePenaltyPower, checkData = FALSE,
        nCenters = nPreclusteringCenters,
        verbose = verbose - 2, indent = indent + 1
      )
      gBlocks <- .orderLabelsBySize(clustering$clusters)
    } else {
      gBlocks <- rep(1, nGGenes)
    }
    # Expand good-gene block labels to all genes; bad genes get NA.
    blocks <- rep(NA, nGenes)
    blocks[gsg$goodGenes] <- gBlocks
  } else {
    gBlocks <- blocks[gsg$goodGenes]
  }
  blockLevels <- as.numeric(levels(factor(gBlocks)))
  blockSizes <- table(gBlocks)
  nBlocks <- length(blockLevels)
  if (any(blockSizes > sqrt(2^31) - 1)) {
    printFlush(spaste(
      spaces,
      "Found block(s) with size(s) larger than limit of 'int' indexing.\n",
      spaces, " Support for such large blocks is experimental; please report\n",
      spaces, " any issues to Peter.Langfelder@gmail.com."
    ))
  }
  # check file names for uniqueness
  actualFileNames <- matrix("", nSets, nBlocks)
  if (saveTOMs) {
    for (set in 1:nSets) {
      for (b in 1:nBlocks) {
        actualFileNames[set, b] <- .processFileName(individualTOMFileNames, set, names(multiExpr), b)
      }
    }
    rownames(actualFileNames) <- spaste("Set.", c(1:nSets))
    colnames(actualFileNames) <- spaste("Block.", c(1:nBlocks))
    if (length(unique(as.vector(actualFileNames))) < nSets * nBlocks) {
      printFlush("Error: File names for (some) set/block combinations are not unique:")
      print(actualFileNames)
      stop("File names must be unique.")
    }
  }
  # Initialize various variables
  blockGenes <- list()
  blockNo <- 1
  setTomDS <- replicate(nSets, list())
  # Here's where the analysis starts
  for (blockNo in 1:nBlocks)
  {
    if (verbose > 1 && nBlocks > 1) printFlush(paste(spaces, "..Working on block", blockNo, "."))
    # Select the block genes
    block <- c(1:nGGenes)[gBlocks == blockLevels[blockNo]]
    # nBlockGenes = length(block);
    # blockGenes[[blockNo]] = c(1:nGenes)[gsg$goodGenes][gBlocks==blockLevels[blockNo]];
    # errorOccurred = FALSE;
    # For each set: calculate and save TOM
    for (set in 1:nSets)
    {
      if (verbose > 2) printFlush(paste(spaces, "....Working on set", set))
      selExpr <- as.matrix(multiExpr[[set]]$data[, block])
      if (!is.null(multiWeights)) {
        selWeights <- as.matrix(multiWeights[[set]]$data[, block])
      } else {
        selWeights <- NULL
      }
      # TOM for this set/block, stored as a distance structure.
      tomDS <- as.dist(.networkCalculation(selExpr, networkOptions[[set]]$data,
        weights = selWeights,
        verbose = verbose - 2, indent = indent + 2
      ))
      # Append this block to the set's blockwiseData, saving to file when
      # saveTOMs is TRUE (external storage).
      setTomDS[[set]]$data <- addBlockToBlockwiseData(if (blockNo == 1) NULL else setTomDS[[set]]$data,
        external = saveTOMs,
        blockData = tomDS,
        blockFile = actualFileNames[set, blockNo],
        recordAttributes = TRUE,
        metaData = list(IDs = colIDs[block])
      )
    }
    if (collectGarbage) {
      rm(tomDS)
      gc()
    }
  }
  names(setTomDS) <- names(multiExpr)
  # If the input was not multiData, unwrap the single-set results.
  if (!multiFormat) {
    gsg$goodSamples <- gsg$goodSamples[[1]]
    setTomDS <- setTomDS[[1]]$data
  }
  blockInformation <- newBlockInformation(blocks, gsg)
  list(
    blockwiseAdjacencies = setTomDS,
    setNames = names(multiExpr),
    nSets = length(multiExpr),
    blockInfo = blockInformation,
    networkOptions = networkOptions
  )
}
# =====================================================================================================
#
# hierarchical consensus TOM
#
# =====================================================================================================
# Hierarchical consensus TOM calculation.
# Supply either the expression data ('multiExpr', with network construction
# options) from which individual TOMs are calculated, or a pre-computed
# 'individualTOMInfo' (output of individualTOMs).
# consensusTree: a ConsensusTree object specifying the (possibly nested)
#   consensus calculation.
# Returns the result of hierarchicalConsensusCalculation with the
# individualTOMInfo and consensusTree appended.
# Fix: corrected the typo in the user-facing progress message
# ("Timimg" -> "Timing").
hierarchicalConsensusTOM <- function(
    # Supply either ...
    # ... information needed to calculate individual TOMs
    multiExpr,
    multiWeights = NULL,
    # Data checking options
    checkMissingData = TRUE,
    # Blocking options
    blocks = NULL,
    maxBlockSize = 20000,
    blockSizePenaltyPower = 5,
    nPreclusteringCenters = NULL,
    randomSeed = 12345,
    # Network construction options. This can be a single object of class NetworkOptions, or a multiData
    # structure of NetworkOptions objects, one per element of multiExpr.
    networkOptions,
    # Save individual TOMs?
    keepIndividualTOMs = TRUE,
    individualTOMFileNames = "individualTOM-Set%s-Block%b.RData",
    # ... or information about individual (more precisely, input) TOMs
    individualTOMInfo = NULL,
    # Consensus calculation options
    consensusTree,
    useBlocks = NULL,
    # Save calibrated TOMs?
    saveCalibratedIndividualTOMs = FALSE,
    calibratedIndividualTOMFilePattern = "calibratedIndividualTOM-Set%s-Block%b.RData",
    # Return options
    saveConsensusTOM = TRUE,
    consensusTOMFilePattern = "consensusTOM-%a-Block%b.RData",
    getCalibrationSamples = FALSE,
    # Return the intermediate results as well?
    keepIntermediateResults = saveConsensusTOM,
    # Internal handling of TOMs
    useDiskCache = NULL, chunkSize = NULL,
    cacheDir = ".",
    cacheBase = ".blockConsModsCache",
    # Behavior
    collectGarbage = TRUE,
    verbose = 1,
    indent = 0) {
  # Make the calculation reproducible, restoring any pre-existing seed on exit.
  if (!is.null(randomSeed)) {
    if (exists(".Random.seed")) {
      savedSeed <- .Random.seed
      on.exit(.Random.seed <<- savedSeed)
    }
    set.seed(randomSeed)
  }
  # Remember whether the individual TOMs are calculated here; only then may
  # they be deleted below when keepIndividualTOMs is FALSE.
  localIndividualTOMCalculation <- is.null(individualTOMInfo)
  if (is.null(individualTOMInfo)) {
    if (missing(multiExpr)) stop("Either 'individualTOMInfo' or 'multiExpr' must be given.")
    if (is.null(useDiskCache)) useDiskCache <- .useDiskCache(multiExpr, blocks, chunkSize)
    time <- system.time({
      individualTOMInfo <- individualTOMs(
        multiExpr = multiExpr,
        multiWeights = multiWeights,
        checkMissingData = checkMissingData,
        blocks = blocks,
        maxBlockSize = maxBlockSize,
        blockSizePenaltyPower = blockSizePenaltyPower,
        nPreclusteringCenters = nPreclusteringCenters,
        randomSeed = NULL, # the seed was already set above
        networkOptions = networkOptions,
        saveTOMs = useDiskCache | keepIndividualTOMs,
        individualTOMFileNames = individualTOMFileNames,
        collectGarbage = collectGarbage,
        verbose = verbose, indent = indent
      )
    })
    if (verbose > 1) {
      printFlush("Timing for individual TOMs:")
      print(time)
    }
  }
  consensus <- hierarchicalConsensusCalculation(individualTOMInfo$blockwiseAdjacencies,
    consensusTree,
    level = 1,
    useBlocks = useBlocks,
    randomSeed = NULL,
    saveCalibratedIndividualData = saveCalibratedIndividualTOMs,
    calibratedIndividualDataFilePattern = calibratedIndividualTOMFilePattern,
    saveConsensusData = saveConsensusTOM,
    consensusDataFileNames = consensusTOMFilePattern,
    getCalibrationSamples = getCalibrationSamples,
    keepIntermediateResults = keepIntermediateResults,
    useDiskCache = useDiskCache,
    chunkSize = chunkSize,
    cacheDir = cacheDir,
    cacheBase = cacheBase,
    collectGarbage = collectGarbage,
    verbose = verbose,
    indent = indent
  )
  if (localIndividualTOMCalculation) {
    if (!keepIndividualTOMs) {
      # individual TOMs are contained in the individualTOMInfo list; remove them.
      mtd.apply(individualTOMInfo$blockwiseAdjacencies, BD.checkAndDeleteFiles)
      individualTOMInfo$blockwiseAdjacencies <- NULL
    }
  }
  c(
    consensus,
    list(
      individualTOMInfo = individualTOMInfo,
      consensusTree = consensusTree
    )
  )
}
# ========================================================================================================
#
# Merge consensusTOMInfo lists
#
# ========================================================================================================
#
# Caution: at present the function does not check that the inputs are compatible and compatible with the
# supplied blocks.
# Merge a list of per-block consensusTOMInfo lists into a single multi-block
# consensusTOMInfo list.
# blockInformation: a BlockInformation object describing the merged blocks.
# consensusTOMInfoList: one consensusTOMInfo entry per block; the first entry
#   supplies all components that are not explicitly merged below.
# Caution: at present the function does not check that the inputs are compatible and compatible with the
# supplied blocks.
.mergeConsensusTOMInformationLists <- function(blockInformation, consensusTOMInfoList) {
  blocks <- blockInformation$blocks
  # NOTE(review): blockLevels and nBlocks are computed but currently unused.
  blockLevels <- unique(blocks)
  nBlocks <- length(blockLevels)
  # Start from the first block's info; merged components overwrite it below.
  out <- consensusTOMInfoList[[1]]
  # Merging consensus information
  out$consensusData <- do.call(mergeBlockwiseData, lapply(consensusTOMInfoList, getElement, "consensusData"))
  if (out$saveCalibratedIndividualData) {
    # Merge the calibrated data set-by-set across blocks.
    out$calibratedIndividualData <- lapply(1:out$nSets, function(set) {
      do.call(
        mergeBlockwiseData,
        lapply(consensusTOMInfoList, function(ct) ct$calibratedIndividualData[[set]])
      )
    })
  }
  if (!is.null(out$calibrationSamples)) {
    out$calibrationSamples <- do.call(c, lapply(consensusTOMInfoList, getElement, "calibrationSamples"))
  }
  # Origin counts are additive across blocks.
  out$originCount <- rowSums(do.call(cbind, lapply(consensusTOMInfoList, getElement, "originCount")))
  # Merging information in individualTOMInfo
  out$individualTOMInfo$blockwiseAdjacencies <- lapply(1:out$nSets, function(set) {
    list(data = do.call(mergeBlockwiseData, lapply(consensusTOMInfoList, function(ct) {
      ct$individualTOMInfo$blockwiseAdjacencies[[set]]$data
    })))
  })
  out$individualTOMInfo$blockInfo <- blockInformation
  out
}
# ========================================================================================================
#
# consensusTOM: old, single-layer consensus.
#
# ========================================================================================================
consensusTOM <- function(
# Supply either ...
# ... information needed to calculate individual TOMs
multiExpr,
# Data checking options
checkMissingData = TRUE,
# Blocking options
blocks = NULL,
maxBlockSize = 5000,
blockSizePenaltyPower = 5,
nPreclusteringCenters = NULL,
randomSeed = 12345,
# Network construction arguments: correlation options
corType = "pearson",
maxPOutliers = 1,
quickCor = 0,
pearsonFallback = "individual",
cosineCorrelation = FALSE,
replaceMissingAdjacencies = FALSE,
# Adjacency function options
power = 6,
networkType = "unsigned",
checkPower = TRUE,
# Topological overlap options
TOMType = "unsigned",
TOMDenom = "min",
# Save individual TOMs?
saveIndividualTOMs = TRUE,
individualTOMFileNames = "individualTOM-Set%s-Block%b.RData",
# ... or individual TOM information
individualTOMInfo = NULL,
useIndivTOMSubset = NULL,
##### Consensus calculation options
useBlocks = NULL,
networkCalibration = c("single quantile", "full quantile", "none"),
# Save calibrated TOMs?
saveCalibratedIndividualTOMs = FALSE,
calibratedIndividualTOMFilePattern = "calibratedIndividualTOM-Set%s-Block%b.RData",
# Simple quantile scaling options
calibrationQuantile = 0.95,
sampleForCalibration = TRUE, sampleForCalibrationFactor = 1000,
getNetworkCalibrationSamples = FALSE,
# Consensus definition
consensusQuantile = 0,
useMean = FALSE,
setWeights = NULL,
# Return options
saveConsensusTOMs = TRUE,
consensusTOMFilePattern = "consensusTOM-Block%b.RData",
returnTOMs = FALSE,
# Internal handling of TOMs
useDiskCache = NULL, chunkSize = NULL,
cacheDir = ".",
cacheBase = ".blockConsModsCache",
nThreads = 1,
# Diagnostic messages
verbose = 1,
indent = 0) {
spaces <- indentSpaces(indent)
networkCalibration <- match.arg(networkCalibration)
seedSaved <- FALSE
if (!is.null(randomSeed)) {
if (exists(".Random.seed")) {
savedSeed <- .Random.seed
on.exit(.Random.seed <<- savedSeed)
}
set.seed(randomSeed)
}
if (any(!is.finite(setWeights))) stop("Entries of 'setWeights' must all be finite.")
localIndividualTOMCalculation <- is.null(individualTOMInfo)
if (is.null(individualTOMInfo)) {
if (missing(multiExpr)) stop("Either 'individualTOMInfo' or 'multiExpr' must be given.")
dataSize <- checkSets(multiExpr)
nSets.all <- dataSize$nSets
nGenes <- dataSize$nGenes
if (is.null(useDiskCache)) useDiskCache <- .useDiskCache(multiExpr, blocks, chunkSize)
if (length(power) != 1) {
if (length(power) != nSets.all) {
stop("Invalid arguments: Length of 'power' must equal number of sets given in 'multiExpr'.")
}
} else {
power <- rep(power, nSets.all)
}
if ((consensusQuantile < 0) | (consensusQuantile > 1)) {
stop("'consensusQuantile' must be between 0 and 1.")
}
time <- system.time({
individualTOMInfo <- blockwiseIndividualTOMs(
multiExpr = multiExpr,
checkMissingData = checkMissingData,
blocks = blocks,
maxBlockSize = maxBlockSize,
blockSizePenaltyPower = blockSizePenaltyPower,
nPreclusteringCenters = nPreclusteringCenters,
randomSeed = NULL,
corType = corType,
maxPOutliers = maxPOutliers,
quickCor = quickCor,
pearsonFallback = pearsonFallback,
cosineCorrelation = cosineCorrelation,
replaceMissingAdjacencies = replaceMissingAdjacencies,
power = power,
networkType = networkType,
TOMType = TOMType,
TOMDenom = TOMDenom,
saveTOMs = useDiskCache | saveIndividualTOMs,
individualTOMFileNames = individualTOMFileNames,
nThreads = nThreads,
verbose = verbose, indent = indent
)
})
if (verbose > 1) {
printFlush("Timimg for individual TOMs:")
print(time)
}
# If the individual TOM files exist only because of the disk cache (the caller
# did not ask to keep them), schedule their deletion on exit.
if (!saveIndividualTOMs & useDiskCache) {
on.exit(.checkAndDelete(individualTOMInfo$actualTOMFileNames), add = TRUE)
}
} else {
# Precomputed TOMs were supplied: derive the set and gene counts from the
# supplied structure (file-name matrix rows if saved to disk, otherwise the
# columns of the in-memory similarity matrix).
nSets.all <- if (individualTOMInfo$saveTOMs) {
nrow(individualTOMInfo$actualTOMFileNames)
} else {
ncol(individualTOMInfo$TOMSimilarities[[1]])
}
nGenes <- length(individualTOMInfo$blocks)
# Decide on disk caching from counts alone, since no expression data is
# available here.
if (is.null(useDiskCache)) {
useDiskCache <- .useDiskCache(NULL, blocks, chunkSize,
nSets = nSets.all, nGenes = nGenes
)
}
}
# Use the set names carried in individualTOMInfo; fall back to "Set.1",
# "Set.2", ... when none are present.
setNames <- individualTOMInfo$setNames
if (is.null(setNames)) setNames <- spaste("Set.", 1:individualTOMInfo$nSets)
nGoodGenes <- length(individualTOMInfo$gBlocks)
# Default to equal weights; normalize so the weights sum to 1 for the weighted
# consensus calculation below.
if (is.null(setWeights)) setWeights <- rep(1, nSets.all)
if (length(setWeights) != nSets.all) {
stop("Length of 'setWeights' must equal the number of sets.")
}
setWeightMat <- as.matrix(setWeights / sum(setWeights))
# By default use all sets in individualTOMInfo; the counts must then agree
# with the data. A subset of sets can be selected via 'useIndivTOMSubset'.
if (is.null(useIndivTOMSubset)) {
if (individualTOMInfo$nSets != nSets.all) {
stop(paste(
"Number of sets in individualTOMInfo and in multiExpr do not agree.\n",
" To use a subset of individualTOMInfo, set useIndivTOMSubset appropriately."
))
}
useIndivTOMSubset <- c(1:nSets.all)
}
nSets <- length(useIndivTOMSubset)
# The subset must consist of unique, in-range set indices.
if (length(unique(useIndivTOMSubset)) != nSets) {
stop("Entries of 'useIndivTOMSubset' must be unique")
}
if (any(useIndivTOMSubset < 1) | any(useIndivTOMSubset > individualTOMInfo$nSets)) {
stop("All entries of 'useIndivTOMSubset' must be between 1 and the number of sets in individualTOMInfo")
}
# if ( (minKMEtoJoin >1) | (minKMEtoJoin <0) ) stop("minKMEtoJoin must be between 0 and 1.");
# Restrict the good-samples bookkeeping to the sets actually used.
gsg <- individualTOMInfo$goodSamplesAndGenes
# Restrict gsg to used sets
gsg$goodSamples <- gsg$goodSamples[useIndivTOMSubset]
if (is.null(chunkSize)) chunkSize <- as.integer(.largestBlockSize / (2 * nSets))
# Initialize various variables
# Returning the calibration samples only makes sense when sampling is actually
# performed, so the two options must be consistent.
if (getNetworkCalibrationSamples) {
if (!sampleForCalibration) {
stop(paste(
"Incompatible input options: networkCalibrationSamples can only be returned",
"if sampleForCalibration is TRUE."
))
}
networkCalibrationSamples <- list()
}
# Determine the blocks present in the data and which of them to process.
blockLevels <- sort(unique(individualTOMInfo$gBlocks))
nBlocks <- length(blockLevels)
if (is.null(useBlocks)) useBlocks <- blockLevels
useBlockIndex <- match(useBlocks, blockLevels)
if (!all(useBlocks %in% blockLevels)) {
stop("All entries of 'useBlocks' must be valid block levels.")
}
if (any(duplicated(useBlocks))) {
stop("Entries of 'useBlocks' must be unique.")
}
nUseBlocks <- length(useBlocks)
if (nUseBlocks == 0) {
stop("'useBlocks' cannot be non-NULL and empty at the same time.")
}
# Per-block outputs: consensus TOMs (if returned), their file names (if
# saved), and a per-set count of how often each set contributed the consensus
# value.
consensusTOM.out <- list()
TOMFiles <- rep("", nUseBlocks)
originCount <- rep(0, nSets)
# Pre-generate file names for calibrated individual TOMs, one per (set,
# block), by substituting set and block numbers into the supplied pattern.
calibratedIndividualTOMFileNames <- NULL
if (saveCalibratedIndividualTOMs) {
calibratedIndividualTOMFileNames <- matrix("", nSets, nBlocks)
for (set in 1:nSets) {
for (b in 1:nBlocks) {
calibratedIndividualTOMFileNames[set, b] <- .processFileName(calibratedIndividualTOMFilePattern,
setNumber = set, blockNumber = b
)
}
}
}
gc()
# Here's where the analysis starts
# Main loop: process each requested block independently.
for (blockIndex in 1:nUseBlocks)
{
blockNo <- useBlockIndex[blockIndex]
if (verbose > 1) printFlush(paste(spaces, "..Working on block", blockNo, "."))
# Select block genes
block <- c(1:nGoodGenes)[individualTOMInfo$gBlocks == blockLevels[blockNo]]
nBlockGenes <- length(block)
# blockGenes[[blockNo]] = c(1:nGenes)[gsg$goodGenes][gBlocks==blockLevels[blockNo]];
# Per-set calibration quantiles and the powers derived from them (filled in
# below when single-quantile calibration is used).
scaleQuant <- rep(1, nSets)
scalePowers <- rep(1, nSets)
# Set up file names or memory space to hold the set TOMs
if (useDiskCache) {
# The block's TOM is a flattened lower triangle of length
# nBlockGenes*(nBlockGenes-1)/2, split into chunks of at most chunkSize.
nChunks <- ceiling(nBlockGenes * (nBlockGenes - 1) / 2 / chunkSize)
chunkFileNames <- array("", dim = c(nChunks, nSets))
on.exit(.checkAndDelete(chunkFileNames), add = TRUE)
} else {
nChunks <- 1
}
# A single chunk fits in memory, so the disk cache would only add overhead.
if (nChunks == 1) useDiskCache <- FALSE
if (!useDiskCache) {
# Note: setTomDS will contained the scaled set TOM matrices.
setTomDS <- matrix(0, nBlockGenes * (nBlockGenes - 1) / 2, nSets)
colnames(setTomDS) <- setNames
}
# create an empty consTomDS distance structure.
consTomDS <- .emptyDist(nBlockGenes)
# sample entry indices from the distance structure for TOM scaling, if requested
# The sample size is scaled by 1/qx so that the tail around the calibration
# quantile is adequately represented, capped at the number of TOM entries.
if (networkCalibration == "single quantile" && sampleForCalibration) {
qx <- min(calibrationQuantile, 1 - calibrationQuantile)
nScGenes <- min(sampleForCalibrationFactor * 1 / qx, length(consTomDS))
nTOMEntries <- length(consTomDS)
scaleSample <- sample(nTOMEntries, nScGenes)
if (getNetworkCalibrationSamples) {
networkCalibrationSamples[[blockIndex]] <- list(
sampleIndex = scaleSample,
TOMSamples = matrix(NA, nScGenes, nSets)
)
}
}
# Calibration branch 1: "single quantile" (scale each set's TOM so a chosen
# quantile matches that of set 1) or "none" (load TOMs unchanged).
if (networkCalibration %in% c("single quantile", "none")) {
for (set in 1:nSets)
{
if (verbose > 2) printFlush(paste(spaces, "....Working on set", useIndivTOMSubset[set]))
# Load this set's TOM for the current block, either from its saved file or
# from the in-memory similarity list.
if (individualTOMInfo$saveTOMs) {
tomDS <- .loadObject(individualTOMInfo$ actualTOMFileNames[useIndivTOMSubset[set], blockNo],
name = "tomDS", size = nBlockGenes * (nBlockGenes - 1) / 2
)
} else {
tomDS <- consTomDS
tomDS[] <- individualTOMInfo$TOMSimilarities[[blockNo]] [, useIndivTOMSubset[set]]
}
if (networkCalibration == "single quantile") {
# Scale TOMs so that calibrationQuantile agree in each set
# The quantile is estimated either from the pre-drawn sample (optionally
# stored for the caller) or from the full TOM.
if (sampleForCalibration) {
if (getNetworkCalibrationSamples) {
networkCalibrationSamples[[blockIndex]]$TOMSamples[, set] <- tomDS[scaleSample]
scaleQuant[set] <- quantile(networkCalibrationSamples[[blockIndex]]$TOMSamples[, set],
probs = calibrationQuantile, type = 8
)
} else {
scaleQuant[set] <- quantile(tomDS[scaleSample], probs = calibrationQuantile, type = 8)
}
} else {
scaleQuant[set] <- quantile(x = tomDS, probs = calibrationQuantile, type = 8)
}
# Set 1 is the reference; raise every other set's TOM to the power that
# maps its calibration quantile onto set 1's.
if (set > 1) {
scalePowers[set] <- log(scaleQuant[1]) / log(scaleQuant[set])
tomDS <- tomDS^scalePowers[set]
}
if (saveCalibratedIndividualTOMs) {
save(tomDS, file = calibratedIndividualTOMFileNames[set, blockNo])
}
}
# Save the calculated TOM either to disk in chunks or to memory.
if (useDiskCache) {
if (verbose > 3) printFlush(paste(spaces, "......saving TOM similarity to disk cache.."))
sc <- .saveChunks(tomDS, chunkSize, cacheBase, cacheDir = cacheDir)
chunkFileNames[, set] <- sc$files
chunkLengths <- sc$chunkLengths
} else {
setTomDS[, set] <- tomDS[]
}
rm(tomDS)
gc()
}
# Calibration branch 2: "full quantile" normalization across sets
# (normalize.quantiles). With the disk cache this is done out-of-core:
# order each TOM, chunk it in rank order, normalize chunk-by-chunk across
# sets, then scatter values back into original order.
} else if (networkCalibration == "full quantile") {
# Step 1: load each TOM, get order, split TOM into chunks according to order, and save.
if (verbose > 1) printFlush(spaste(spaces, "..working on quantile normalization"))
if (useDiskCache) {
orderFiles <- rep("", nSets)
on.exit(.checkAndDelete(orderFiles), add = TRUE)
}
for (set in 1:nSets)
{
if (verbose > 2) printFlush(paste(spaces, "....Working on set", useIndivTOMSubset[set]))
# Load this set's TOM for the current block (file or memory, as above).
if (individualTOMInfo$saveTOMs) {
tomDS <- .loadObject(individualTOMInfo$ actualTOMFileNames[useIndivTOMSubset[set], blockNo],
name = "tomDS", size = nBlockGenes * (nBlockGenes - 1) / 2
)
} else {
tomDS <- consTomDS
tomDS[] <- individualTOMInfo$TOMSimilarities[[blockNo]] [, useIndivTOMSubset[set]]
}
if (useDiskCache) {
# Order TOM (this may take a long time...)
if (verbose > 3) printFlush(spaste(spaces, "......ordering TOM"))
time <- system.time({
order1 <- .qorder(tomDS)
})
if (verbose > 3) {
printFlush("Time to order TOM:")
print(time)
}
# save the order
orderFiles[set] <- tempfile(pattern = spaste(".orderForSet", set), tmpdir = cacheDir)
if (verbose > 3) printFlush(spaste(spaces, "......saving order and ordered TOM"))
save(order1, file = orderFiles[set])
# Save ordered tomDS into chunks
tomDS.ordered <- tomDS[order1]
sc <- .saveChunks(tomDS.ordered, chunkSize, cacheBase, cacheDir = cacheDir)
chunkFileNames[, set] <- sc$files
chunkLengths <- sc$chunkLengths
} else {
setTomDS[, set] <- tomDS[]
}
}
if (useDiskCache) {
# Step 2: Load chunks one by one and quantile normalize
# Because all sets' chunks are in the same (sorted) order, row c of the
# chunk matrix holds values of equal rank across sets.
if (verbose > 2) printFlush(spaste(spaces, "....quantile normalizing chunks"))
for (c in 1:nChunks)
{
if (verbose > 3) printFlush(spaste(spaces, "......QN for chunk ", c, " of ", nChunks))
chunkData <- matrix(NA, chunkLengths[c], nSets)
for (set in 1:nSets) {
chunkData[, set] <- .loadObject(chunkFileNames[c, set])
}
time <- system.time({
chunk.norm <- normalize.quantiles(chunkData, copy = FALSE)
})
if (verbose > 1) {
printFlush("Time to QN chunk:")
print(time)
}
# Save quantile normalized chunks
for (set in 1:nSets)
{
temp <- chunk.norm[, set]
save(temp, file = chunkFileNames[c, set])
}
}
if (verbose > 2) printFlush(spaste(spaces, "....putting together full QN'ed TOMs"))
# Put together full TOMs
# Step 3: restore each set's normalized TOM to its original entry order
# using the saved permutation, then re-chunk it (in original order) for the
# consensus step.
for (set in 1:nSets)
{
load(orderFiles[set])
start <- 1
for (c in 1:nChunks)
{
end <- start + chunkLengths[c] - 1
tomDS[order1[start:end]] <- .loadObject(chunkFileNames[c, set], size = chunkLengths[c])
start <- start + chunkLengths[c]
}
if (saveCalibratedIndividualTOMs) {
save(tomDS, file = calibratedIndividualTOMFileNames[set, blockNo])
}
.saveChunks(tomDS, chunkSize, fileNames = chunkFileNames[, set])
unlink(orderFiles[set])
}
} else {
# If disk cache is not being used, simply call normalize.quantiles on the full set.
setTomDS <- normalize.quantiles(setTomDS)
if (saveCalibratedIndividualTOMs) {
for (set in 1:nSets)
{
tomDS <- .vector2dist(setTomDS[, set])
save(tomDS, file = calibratedIndividualTOMFileNames[set, blockNo])
}
}
}
} else {
# Defensive: match.arg above should prevent this, but guard anyway.
stop("Unrecognized value of 'networkCalibration': ", networkCalibration)
}
# Calculate consensus network
if (verbose > 2) {
printFlush(paste(spaces, "....Calculating consensus network"))
}
if (useDiskCache) {
# Out-of-core consensus: process one chunk at a time, loading the
# corresponding chunk from every set, computing the consensus, and deleting
# the chunk files as they are consumed.
start <- 1
for (chunk in 1:nChunks)
{
if (verbose > 3) printFlush(paste(spaces, "......working on chunk", chunk))
end <- start + chunkLengths[chunk] - 1
setChunks <- array(0, dim = c(chunkLengths[chunk], nSets))
for (set in 1:nSets)
{
# 'load' restores the object saved under the name 'temp' above.
load(file = chunkFileNames[chunk, set])
setChunks[, set] <- temp
file.remove(chunkFileNames[chunk, set])
}
colnames(setChunks) <- setNames
tmp <- .consensusCalculation.base(setChunks,
useMean = useMean, setWeightMat = setWeightMat,
consensusQuantile = consensusQuantile
)
consTomDS[start:end] <- tmp$consensus
# Accumulate, per set, how often that set supplied the consensus value;
# originCount names are set indices.
countIndex <- as.numeric(names(tmp$originCount))
originCount[countIndex] <- originCount[countIndex] + tmp$originCount
rm(tmp)
start <- end + 1
}
} else {
# In-memory consensus over the full per-set TOM matrix.
tmp <- .consensusCalculation.base(setTomDS,
useMean = useMean, setWeightMat = setWeightMat,
consensusQuantile = consensusQuantile
)
consTomDS[] <- tmp$consensus
countIndex <- as.numeric(names(tmp$originCount))
originCount[countIndex] <- originCount[countIndex] + tmp$originCount
rm(tmp)
}
# Save the consensus TOM if requested
# The file-name pattern must contain the %b tag so distinct blocks do not
# overwrite each other's files.
if (saveConsensusTOMs) {
TOMFiles[blockIndex] <- .substituteTags(consensusTOMFilePattern, "%b", blockNo)
if (TOMFiles[blockIndex] == consensusTOMFilePattern) {
stop(paste(
"File name for consensus TOM must contain the tag %b somewhere in the file name -\n",
" - this tag will be replaced by the block number. "
))
}
save(consTomDS, file = TOMFiles[blockIndex])
}
if (returnTOMs) consensusTOM.out[[blockIndex]] <- consTomDS
gc()
}
# Normalize the outputs: drop placeholders for results the caller did not
# request.
if (!saveConsensusTOMs) TOMFiles <- NULL
if (!returnTOMs) consensusTOM.out <- NULL
# When the individual TOMs were computed locally and kept only in memory,
# remove them from the returned structure to avoid returning large objects
# the caller did not ask for.
if (localIndividualTOMCalculation) {
if (!individualTOMInfo$saveTOMs) {
# individual TOMs are contained in the individualTOMInfo list; remove them.
individualTOMInfo$TOMSimilarities <- NULL
}
}
# Assemble the return value: consensus TOMs and/or their file names, the
# individual-TOM information, bookkeeping about sets/genes used, calibration
# artifacts, and the per-set origin counts.
list(
consensusTOM = consensusTOM.out,
TOMFiles = TOMFiles,
saveConsensusTOMs = saveConsensusTOMs,
individualTOMInfo = individualTOMInfo,
useIndivTOMSubset = useIndivTOMSubset,
goodSamplesAndGenes = gsg,
nGGenes = nGoodGenes,
nSets = nSets,
saveCalibratedIndividualTOMs = saveCalibratedIndividualTOMs,
calibratedIndividualTOMFileNames = calibratedIndividualTOMFileNames,
networkCalibrationSamples = if (getNetworkCalibrationSamples) networkCalibrationSamples else NULL,
consensusQuantile = consensusQuantile,
originCount = originCount
)
}
# NOTE(review): the two lines below are stray website-embed boilerplate that was
# accidentally pasted into this source file (they are not R code and would break
# parsing); commented out pending removal.
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.