# ============================================================================================================
#
# mdx version of collapseRows
#
# ============================================================================================================
.cr.absMaxMean <- function(x, robust) {
if (robust) {
colMedians(abs(x), na.rm = TRUE)
} else {
colMeans(abs(x), na.rm = TRUE)
}
}
.cr.absMinMean <- function(x, robust) {
if (robust) {
-colMedians(abs(x), na.rm = TRUE)
} else {
-colMeans(abs(x), na.rm = TRUE)
}
}
.cr.MaxMean <- function(x, robust) {
if (robust) {
colMedians(x, na.rm = TRUE)
} else {
colMeans(x, na.rm = TRUE)
}
}
.cr.MinMean <- function(x, robust) {
if (robust) {
-colMedians(x, na.rm = TRUE)
} else {
-colMeans(x, na.rm = TRUE)
}
}
.cr.maxVariance <- function(x, robust) {
if (robust) {
colMads(x, na.rm = TRUE)
} else {
colSds(x, na.rm = TRUE)
}
}
.checkConsistencyOfGroupAndColID <- function(mdx, colID, group) {
colID <- as.character(colID)
group <- as.character(group)
if (length(colID) != length(group)) {
stop("'group' and 'colID' must have the same length.")
}
if (any(duplicated(colID))) {
stop("'colID' contains duplicate entries.")
}
rnDat <- mtd.colnames(mdx)
if (sum(is.na(colID)) > 0) {
warning(spaste(
"The argument colID contains missing data. It is recommend you choose non-missing,\n",
"unique values for colID, e.g. character strings."
))
}
if (sum(group == "", na.rm = TRUE) > 0) {
warning(paste(
"group contains blanks. It is strongly recommended that you remove",
"these rows before calling the function.\n",
" But for your convenience, the collapseRow function will remove these rows"
))
group[group == ""] <- NA
}
if (sum(is.na(group)) > 0) {
warning(paste(
"The argument group contains missing data. It is strongly recommended\n",
" that you remove these rows before calling the function. Or redefine group\n",
" so that it has no missing data. But for convenience, we remove these data."
))
}
if ((is.null(rnDat)) & (checkSets(mdx)$nGenes == length(colID))) {
write("Warning: mdx does not have column names. Using 'colID' as column names.", "")
rnDat <- colID
mdx <- mtd.setColnames(mdx, colID)
}
if (is.null(rnDat)) {
stop(
"'mdx' does not have row names and \n",
"length of 'colID' is not the same as # variables in mdx."
)
}
keepProbes <- rep(TRUE, checkSets(mdx)$nGenes)
if (sum(is.element(rnDat, colID)) != length(colID)) {
write("Warning: row names of input data and probes not identical...", "")
write("... Attempting to proceed anyway. Check results carefully.", "")
keepProbes <- is.element(colID, rnDat)
colID <- colID[keepProbes]
mdx <- mtd.subset(mdx, , colID)
group <- group[colID]
}
restCols <- (group != "" & !is.na(group))
if (any(!restCols)) {
keepProbes[keepProbes] <- restCols
mdx <- mtd.subset(mdx, , restCols)
group <- group[restCols]
colID <- colID[restCols]
rnDat <- rnDat[restCols]
}
list(mdx = mdx, group = group, colID = colID, keepProbes = keepProbes)
}
selectFewestConsensusMissing <- function(mdx, colID, group,
minProportionPresent = 1,
consensusQuantile = 0,
verbose = 0, ...) {
## For each gene, select the gene with the fewest missing probes, and return the results.
# If there is a tie, keep all probes involved in the tie.
# The main part of this function is run only if omitGroups=TRUE
otherArgs <- list(...)
nVars <- checkSets(mdx)$nGenes
nSamples <- checkSets(mdx)$nSamples
nSets <- length(mdx)
if ((!"checkConsistency" %in% names(otherArgs)) || otherArgs$checkConsistency) {
cd <- .checkConsistencyOfGroupAndColID(mdx, colID, group)
mdx <- cd$mdx
group <- cd$group
colID <- cd$colID
keep <- cd$keepProbes
} else {
keep <- rep(TRUE, nVars)
}
# First, return datET if there is no missing data, otherwise run the function
if (sum(mtd.apply(mdx, function(x) sum(is.na(x)), mdaSimplify = TRUE)) == 0) {
return(rep(TRUE, nVars))
}
# Set up the variables.
names(group) <- colID
probes <- mtd.colnames(mdx)
genes <- group[probes]
keepGenes <- rep(TRUE, nVars)
tGenes <- table(genes)
checkGenes <- sort(names(tGenes)[tGenes > 1])
presentData <- as.matrix(mtd.apply(mdx, function(x) colSums(is.finite(x)), mdaSimplify = TRUE))
presentFrac <- presentData / matrix(nSamples, nVars, nSets, byrow = TRUE)
consensusPresentFrac <- .consensusCalculation.base(
data = presentFrac, useMean = FALSE,
setWeightMat = NULL,
consensusQuantile = consensusQuantile
)$consensus
# Omit all probes with at least omitFrac genes missing
# keep = consensusPresentFrac > omitFraction
minProportionPresent <- as.numeric(minProportionPresent)
# Omit relevant genes and return results
if (minProportionPresent > 0) {
if (verbose) pind <- initProgInd()
for (gi in 1:length(checkGenes))
{
g <- checkGenes[gi]
gn <- which(genes == g)
keepGenes[gn] <- (consensusPresentFrac[gn] >= minProportionPresent * max(consensusPresentFrac[gn]))
if (verbose) pind <- updateProgInd(gi / length(checkGenes), pind)
}
if (verbose) printFlush("")
}
keep[keep] <- keepGenes
return(keep)
}
# ----------------- Main Function ------------------- #
consensusRepresentatives <- function(mdx,
group, colID,
consensusQuantile = 0,
method = "MaxMean",
useGroupHubs = TRUE,
calibration = c("none", "full quantile"),
selectionStatisticFnc = NULL,
connectivityPower = 1,
minProportionPresent = 1,
getRepresentativeData = TRUE,
statisticFncArguments = list(),
adjacencyArguments = list(),
verbose = 2, indent = 0)
# Change in methodFunction: if the methodFunction picks a single representative, it should return it in
# attribute "selectedRepresentative".
# minProportionPresent now gives the fraction of the maximum of present values that will still be included.
# minProportionPresent=1 corresponds to minProportionPresent=TRUE in original collapseRows.
# In connectivity-based collapsing, use simple connectivity, do not normalize. This way the connectivities
# retain a larger spread which should prevent quantile normalization from making big changes and potentially
# suprious changes.
{
if (!is.null(dim(mdx))) {
warning("consensusRepresentatives: wrapping matrix-like input into a mdx structure.")
mdx <- multiData(mdx)
}
spaces <- indentSpaces(indent)
nSamples <- checkSets(mdx)$nSamples
nSets <- length(mdx)
colnames.in <- mtd.colnames(mdx)
calibration <- match.arg(calibration)
## Test to make sure the variables are the right length.
# if not, fix it if possible, or stop.
cd <- .checkConsistencyOfGroupAndColID(mdx, colID, group)
colID <- cd$colID
group <- cd$group
mdx <- cd$mdx
keepVars <- cd$keepProbes
rnDat <- mtd.colnames(mdx)
## For each gene, select the gene with the fewest missing probes (if minProportionPresent==TRUE)
## Also, remove all probes with more than 90% missing data
if (verbose > 0) {
printFlush(spaste(spaces, "..selecting variables with lowest numbers of missing data.."))
}
keep <- selectFewestConsensusMissing(mdx, colID, group, minProportionPresent,
consensusQuantile = consensusQuantile, verbose = verbose - 1
)
mdx <- mtd.subset(mdx, , keep)
keepVars[keepVars] <- keep
group <- group[keep]
colID <- colID[keep]
rnDat <- mtd.colnames(mdx)
## If method="function", use the function "methodFunction" as a way of combining genes
# Alternatively, use one of the built-in functions
# Note: methodFunction must be a function that takes a vector of numbers as input and
# outputs a single number. This function will return(0) or crash otherwise.
recMethods <- c("function", "MaxMean", "maxVariance", "MinMean", "absMinMean", "absMaxMean")
imethod <- pmatch(method, recMethods)
if (is.na(imethod)) {
stop(
"Error: entered method is not a legal option. Recognized options are\n",
" *maxVariance*, *MaxMean*, *MinMean*, *absMaxMean*, *absMinMean*\n",
" or *function* for a user-defined function."
)
}
if (imethod > 1) {
selectionStatisticFnc <- spaste(".cr.", method)
selStatFnc <- get(selectionStatisticFnc, mode = "function")
} else {
selStatFnc <- match.fun(selectionStatisticFnc)
if ((!is.function(selStatFnc)) & (!is.null(selStatFnc))) {
stop("Error: 'selectionStatisticFnc must be a function... please read the help file.")
}
}
## Format the variables for use by this function
colID[is.na(colID)] <- group[is.na(colID)] # Use group if row is missing
rnDat[is.na(rnDat)] <- group[is.na(rnDat)]
mdx <- mtd.setColnames(mdx, rnDat)
remove <- (is.na(colID)) | (is.na(group)) # Omit if both gene and probe are missing
colID <- colID[!remove]
group <- group[!remove]
names(group) <- colID
colID <- sort(intersect(rnDat, colID))
if (length(colID) <= 1) {
stop("None of the variable names in 'mdx' are in 'colID'.")
}
group <- group[colID]
mdx <- mtd.apply(mdx, as.matrix)
keepVars[keepVars] <- mtd.colnames(mdx) %in% colID
mdx <- mtd.subset(mdx, , colID)
probes <- mtd.colnames(mdx)
genes <- group[probes]
tGenes <- table(genes)
colnames.out <- sort(names(tGenes))
if (getRepresentativeData) {
mdxOut <- mtd.apply(mdx, function(x) {
out <- matrix(0, nrow(x), length(tGenes))
rownames(out) <- rownames(x)
colnames(out) <- colnames.out
out
})
names(mdxOut) <- names(mdx)
}
representatives <- rep("", length(colnames.out))
names(representatives) <- colnames.out
## If !is.null(connectivityPower), default to the connectivity method with power=method
# Collapse genes with multiple probe sets together using the following algorthim:
# 1) If there is one ps/g = keep
# 2) If there are 2 ps/g = (use "method" or "methodFunction")
# 3) If there are 3+ ps/g = take the max connectivity
# Otherwise, use "method" if there are 3+ ps/g as well.
if (!is.null(connectivityPower)) {
if (!is.numeric(connectivityPower)) {
stop("Error: if entered, connectivityPower must be numeric.")
}
if (connectivityPower <= 0) {
stop("Warning: connectivityPower must be >= 0.")
}
if (any(nSamples <= 5)) {
write("Warning: 5 or fewer samples, this method of probe collapse is unreliable...", "")
write("...Running anyway, but we suggest trying another method (for example, *mean*).", "")
}
}
# Run selectionStatisticFnc on all data; if quantile normalization is requested, normalize the selection
# statistics across data sets.
selectionStatistics <- mtd.apply(mdx, function(x) {
do.call(selStatFnc, c(list(x), statisticFncArguments))
},
mdaSimplify = TRUE
)
# if (FALSE) xxx = selectionStatistics;
if (is.null(dim(selectionStatistics))) {
stop("Calculation of selection statistics produced results of zero or unqual lengths.")
}
if (calibration == "full quantile") {
selectionStatistics <- normalize.quantiles(selectionStatistics)
}
# if (FALSE)
# {
# sizeGrWindow(14, 5);
# par(mfrow = c(1,3));
# for (set in 1:nSets)
# hist(xxx[, set], breaks = 200);
#
## for (set in 1:nSets)
# verboseScatterplot(xxx[, set], selectionStatistics[, set], samples = 10000)
# }
consensusSelStat <- .consensusCalculation.base(selectionStatistics,
useMean = FALSE, setWeightMat = NULL,
consensusQuantile = consensusQuantile
)$consensus
# Actually run the summarization.
ones <- sort(names(tGenes)[tGenes == 1])
if (useGroupHubs) {
twos <- sort(names(tGenes)[tGenes == 2]) # use "method" and connectivity
more <- sort(names(tGenes)[tGenes > 2])
} else {
twos <- sort(names(tGenes)[tGenes > 1]) # only use "method"
more <- character(0)
}
ones2genes <- match(ones, genes)
if (getRepresentativeData) {
for (set in 1:nSets) {
mdxOut[[set]]$data[, ones] <- mdx[[set]]$data[, ones2genes]
}
}
representatives[ones] <- probes[ones2genes]
count <- 0
if (length(twos) > 0) {
if (verbose > 0) {
printFlush(spaste(spaces, "..selecting representatives for 2-variable groups.."))
}
if (verbose > 1) pind <- initProgInd(paste(spaces, ".."))
repres <- rep(NA, length(twos))
for (ig in 1:length(twos))
{
g <- twos[ig]
probeIndex <- which(genes == g)
repres[ig] <- probeIndex[which.max(consensusSelStat[probeIndex])]
if (verbose > 1) pind <- updateProgInd(ig / length(twos), pind)
}
if (verbose > 1) printFlush("")
if (getRepresentativeData) {
for (set in 1:nSets) {
mdxOut[[set]]$data[, twos] <- mdx[[set]]$data[, repres]
}
}
representatives[twos] <- probes[repres]
}
if (length(more) > 0) {
if (verbose > 0) {
printFlush(spaste(spaces, "..selecting representatives for 3-variable groups.."))
}
if (verbose > 1) pind <- initProgInd(paste(spaces, ".."))
genes.more <- genes[genes %in% more]
nAll <- length(genes.more)
connectivities <- matrix(NA, nAll, nSets)
for (ig in 1:length(more))
{
g <- more[ig]
keepProbes1 <- which(genes == g)
keep.inMore <- which(genes.more == g)
mdxTmp <- mtd.subset(mdx, , keepProbes1)
adj <- mtd.apply(mdxTmp, function(x) {
do.call(
adjacency,
c(list(x, type = "signed", power = connectivityPower), adjacencyArguments)
)
})
connectivities[keep.inMore, ] <- mtd.apply(adj, colSums, mdaSimplify = TRUE)
count <- count + 1
if (count %% 50000 == 0) collectGarbage()
if (verbose > 1) pind <- updateProgInd(ig / (2 * length(more)), pind)
}
if (calibration == "full quantile") {
connectivities <- normalize.quantiles(connectivities)
}
consConn <- .consensusCalculation.base(connectivities,
useMean = FALSE, setWeightMat = NULL,
consensusQuantile = consensusQuantile
)$consensus
repres.inMore <- rep(0, length(more))
for (ig in 1:length(more))
{
probeIndex <- which(genes.more == more[ig])
repres.inMore[ig] <- probeIndex[which.max(consConn[probeIndex])]
if (verbose > 1) pind <- updateProgInd(ig / (2 * length(more)) + 0.5, pind)
}
repres <- which(genes %in% more)[repres.inMore]
if (verbose > 1) printFlush("")
if (getRepresentativeData) {
for (set in 1:nSets) {
mdxOut[[set]]$data[, more] <- as.numeric(mdx[[set]]$data[, repres])
}
}
representatives[more] <- probes[repres]
}
# Retreive the information about which probes were saved, and include that information
# as part of the output.
out2 <- cbind(colnames.out, representatives)
colnames(out2) <- c("group", "selectedColID")
reprIndicator <- keepVars
reprIndicator[keepVars] [match(representatives, mtd.colnames(mdx))] <- TRUE
reprIndicator <- colnames.in
out <- list(
representatives = out2,
varSelected = reprIndicator,
representativeData = if (getRepresentativeData) mdxOut else NULL
)
return(out)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.