.filterSimilarPS <- function(datOut, rowGroup, rowID, thresh = 0.8) {
## Collapse groups (ie. genes) with multiple ids (ie. probes) together using the following algorthim:
# 1) If there is one id/group = keep
# 2) If there are 2 ids/group = take the maximum mean expression, if their correlation is > thresh
# 3) If there are 3+ ids/group = iteratively repeat (2) for the id with the highest
# correlation until all ids remaining have correlation < thresh for each group
# datOut is an expression matrix with rows=ids (NOT group) and cols=samples
# rowGroup and rowID are vectors of corresponding group and id names for the rows in datOut
# (note: all ids in datOut only need to be a subset of these vectors, not necessarily identical)
# thresh is the Pearson correlation threshold to combine probes of similar expression.
names(rowGroup) <- rowID
ids <- rownames(datOut)
idsIn <- ids # For later
group <- rowGroup[ids]
tGroup <- table(group)
twos <- sort(names(tGroup)[tGroup == 2])
more <- sort(names(tGroup)[tGroup > 2])
len <- dim(datOut)[2]
testTwoAndCombine <- function(datIn, datTmp, thresh) {
# Internal function for removing one of two genes if they have high enough correlation
if (cor(as.numeric(datTmp[1, ]), as.numeric(datTmp[2, ])) > thresh) {
rMean <- rowMeans(datTmp)
omit <- as.numeric(which(rMean == min(rMean)))
datIn <- datIn[rownames(datIn) != rownames(datTmp)[omit], ]
}
return(datIn)
} # End internal function "testTwoAndCombine"
for (g in twos) {
datTmp <- datOut[ids[group == g], ]
datOut <- testTwoAndCombine(datOut, datTmp, thresh)
}
write("Done combining genes with 2 probes!", "")
for (g in more) {
go <- TRUE
while (go) {
ids <- rownames(datOut)
group <- rowGroup[ids]
datTmp <- datOut[ids[group == g], ]
corDat <- cor(t(datTmp))
if (length(datTmp) == len) {
go <- FALSE
} else {
diag(corDat) <- -2
if (max(corDat) < thresh) {
go <- FALSE
} else {
w <- which(corDat == max(corDat))[1]
l <- dim(datTmp)[1]
wI <- c(((w - 1) %% l) + 1, ceiling(w / l))
datTmp <- datTmp[wI, ]
datOut <- testTwoAndCombine(datOut, datTmp, thresh)
}
}
}
}
write("Done combining genes with 3+ probes!", "")
idsNew <- rownames(datOut)
groupNew <- rowGroup[idsNew]
out2 <- cbind(groupNew, idsNew)
selectedRow <- is.element(idsIn, idsNew)
names(selectedRow) <- ids
output <- list(datETcollapsed = datOut, group2row = out2, selectedRow = selectedRow)
return(output)
}
.absMaxMean <- function(datIn) {
datIn <- abs(datIn)
keep <- which.max(rowSums(datIn, na.rm = TRUE))[1]
return(as.numeric(datIn[keep, ]))
}
.absMinMean <- function(datIn) {
datIn <- abs(datIn)
keep <- which.min(rowSums(datIn, na.rm = TRUE))[1]
return(as.numeric(datIn[keep, ]))
}
.MaxMean <- function(datIn) {
# Note that this is a matrix version of the "max" function
keep <- which.max(rowSums(datIn, na.rm = TRUE))[1]
return(as.numeric(datIn[keep, ]))
}
.MinMean <- function(datIn) {
# Note that this is a matrix version of the "min" function
keep <- which.min(rowSums(datIn, na.rm = TRUE))[1]
return(as.numeric(datIn[keep, ]))
}
.maxRowVariance <- function(datIn) {
sds <- apply(datIn, 1, function(x) {
return(var(x, na.rm = TRUE))
})
keep <- which.max(sds)[1]
return(as.numeric(datIn[keep, ]))
}
.Average <- function(datIn) {
return(as.numeric(colMeans(datIn)))
}
.selectFewestMissing <- function(datET, rowID, rowGroup, omitGroups, omitPercent = 90) {
## For each gene, select the gene with the fewest missing probes, and return the results.
# If there is a tie, keep all probes involved in the tie.
# The main part of this function is run only if omitGroups=TRUE
# First, return datET if there is no missing data, otherwise run the function
if (sum(is.na(datET)) == 0) {
return(rep(TRUE, nrow(datET)))
}
# Set up the variables.
names(rowGroup) <- rowID
probes <- dimnames(datET)[[1]]
genes <- rowGroup[probes]
keepGenes <- rep(TRUE, length(probes))
tGenes <- table(genes)
checkGenes <- sort(names(tGenes)[tGenes > 1])
missingData <- rowSums(is.na(datET))
# Omit all probes with at least omitPercent genes missing
keep <- missingData < (omitPercent * dim(datET)[2] / 100)
# Omit relevant genes and return results
if (omitGroups) {
for (g in checkGenes) {
gn <- (genes == g)
keepGenes[gn] <- (missingData[gn] == min(missingData[gn]))
}
}
keep <- keep & keepGenes
return(keep)
}
# ----------------- Main Function ------------------- #
collapseRows <- function(datET, rowGroup, rowID, method = "MaxMean", connectivityBasedCollapsing = FALSE,
methodFunction = NULL, connectivityPower = 1, selectFewestMissing = TRUE, thresholdCombine = NA) {
# datET = as.matrix(as.data.frame(datET));
methodAverage <- FALSE
if (method == "Average") methodAverage <- TRUE # Required for later
if (method != "function") methodFunction <- NULL # Required for later
if (sum(rowGroup == "", na.rm = TRUE) > 0) {
warning(paste(
"rowGroup contains blanks. It is strongly recommended that you remove",
"these rows before calling the function.\n",
" But for your convenience, the collapseRow function will remove these rows"
))
rowGroup[rowGroup == ""] <- NA
}
# datET is a numeric matrix whose rows correspond to variables
# e.g. probes of a microarray and whose columns to observations
# e.g. microarrays
if (sum(is.na(rowGroup)) > 0) {
warning(paste(
"The argument rowGroup contains missing data. It is strongly recommended\n",
" that you remove these rows before calling the function. Or redefine rowGroup\n",
" so that it has no missing data. But for convenience, we remove these data."
))
}
## Test to make sure the variables are the right length.
# if not, fix it if possible, or return 0 if not possible
rowID <- as.character(rowID)
rowGroup <- as.character(rowGroup)
rnDat <- rownames(datET)
if (length(rowID) != length(rowGroup)) {
write("Error: rowGroup and rowID not the same length... exiting.", "")
return(0)
}
if (length(unique(rowID)) != length(rowID)) {
stop("rowID contains duplicate entries. Make sure that the argument rowID contains unique entries")
}
names(rowGroup) <- rowID
if (sum(is.na(rowID)) > 0) {
warning("The argument rowID contains missing data. I recommend you choose non-missing, unique values for rowID, e.g. character strings.")
}
if ((is.null(rnDat)) & (dim(datET)[1] == length(rowID))) {
write("Warning: *datET* does not have row names. Assigning *rowID* as row names.", "")
rnDat <- rownames(datET) <- rowID
}
if (is.null(rnDat)) {
write("Error: *datET* does not have row names and length of *rowID*...", "")
write("... is not the same as # rows in *datET*... exiting.", "")
return(0)
}
if (sum(is.element(rnDat, rowID)) != length(rowID)) {
write("Warning: row names of input data and probes not identical...", "")
write("... Attempting to proceed anyway. Check results carefully.", "")
keepProbes <- is.element(rowID, rownames(datET))
rowID <- rowID[keepProbes]
datET <- datET[rowID, ]
rowGroup <- rowGroup[rowID]
}
restRows <- (rowGroup != "" & !is.na(rowGroup))
datET <- datET[restRows, ]
rowGroup <- rowGroup[restRows]
rowID <- rowID[restRows]
rnDat <- rnDat[restRows]
## For each group, select the row with the fewest missing values (if selectFewestMissing==TRUE)
## Also, remove all rows with more than 90% missing data
datET_in <- datET # This will be used as a reference later
## For each gene, select the gene with the fewest missing probes (if selectFewestMissing==TRUE)
## Also, remove all probes with more than 90% missing data
keep <- .selectFewestMissing(datET, rowID, rowGroup, selectFewestMissing)
datET <- datET[keep, ]
rowGroup <- rowGroup[keep]
rowID <- rowID[keep]
rnDat <- rownames(datET)
## If 0 < thresholdCombine < 1, only combine ids into their corresponding group if their
# correlation is greater than thresholdCombine. This parameter supercedes all remaining
# parameters.
if (!is.na(thresholdCombine)) {
if (!is.numeric(thresholdCombine)) {
write("thresholdCombine is not between -1 and 1 and is therefore being treated as NA", "")
} else if ((thresholdCombine < (-1)) | (thresholdCombine > 1)) {
write("thresholdCombine is not between -1 and 1 and is therefore being treated as NA", "")
} else {
output <- .filterSimilarPS(datET, rowGroup, rowID, thresholdCombine)
return(output)
}
}
## If method="function", use the function "methodFunction" as a way of combining genes
# Alternatively, use one of the built-in functions
# Note: methodFunction must be a function that takes a vector of numbers as input and
# outputs a single number. This function will return(0) or crash otherwise.
recMethods <- c("function", "ME", "MaxMean", "maxRowVariance", "MinMean", "absMinMean", "absMaxMean", "Average")
imethod <- pmatch(method, recMethods)
if (is.na(imethod)) {
printFlush("Error: entered method is not a legal option. Recognized options are *maxRowVariance*,")
printFlush(" *maxRowVariance*, *MaxMean*, *MinMean*, *absMaxMean*, *absMinMean*, *ME*,")
printFlush(" *Average* or *function* for a user-defined function.")
return(0)
}
if (imethod > 2) method <- spaste(".", method)
if (method == "function") {
method <- methodFunction
if ((!is.function(methodFunction)) & (!is.null(methodFunction))) {
write("Error: *methodFunction* must be a function... please read the help file", "")
return(0)
}
}
if (!is.function(method)) if (method != "ME") method <- get(method, mode = "function")
## Format the variables for use by this function
rowID[is.na(rowID)] <- rowGroup[is.na(rowID)] # Use group if row is missing
rownames(datET)[is.na(rnDat)] <- rowGroup[is.na(rnDat)]
remove <- (is.na(rowID)) | (is.na(rowGroup)) # Omit if both gene and probe are missing
rowID <- rowID[!remove]
rowGroup <- rowGroup[!remove]
names(rowGroup) <- rowID
rowID <- sort(intersect(rnDat, rowID))
if (length(rowID) <= 1) {
write("Error: none of the *datET* rownames are in *rowID*...", "")
write("... please add rownames and try again... exiting.", "")
return(0)
}
rowGroup <- rowGroup[rowID]
datET <- as.matrix(datET)
datET <- datET[rowID, ]
probes <- rownames(datET)
genes <- rowGroup[probes]
tGenes <- table(genes)
datETOut <- matrix(0, nrow = length(tGenes), ncol = ncol(datET))
colnames(datETOut) <- colnames(datET)
rownames(datETOut) <- sort(names(tGenes))
rowsOut <- rownames(datETOut)
names(rowsOut) <- rowsOut
## If !is.null(connectivityPower), default to the connectivity method with power=method
# Collapse genes with multiple probe sets together using the following algorthim:
# 1) If there is one ps/g = keep
# 2) If there are 2 ps/g = (use "method" or "methodFunction")
# 3) If there are 3+ ps/g = take the max connectivity
# Otherwise, use "method" if there are 3+ ps/g as well.
if (!is.null(connectivityPower)) {
if (!is.numeric(connectivityPower)) {
write("Error: if entered, connectivityPower must be numeric... exiting.", "")
return(0)
}
if (connectivityPower <= 0) {
write("Warning: connectivityPower must be >= 0. Defaulting to a power of 2.", "")
connectivityPower <- 2
}
if (dim(datET)[2] <= 5) {
write("Warning: 5 or fewer samples, this method of probe collapse is unreliable...", "")
write("...Running anyway, but we suggest trying another method (for example, *mean*).", "")
}
}
whichTestFn <- function(x) {
d <- datETOut[g, ]
test <- (!is.na(x)) & (!is.na(d))
return(sum(x[test] == d[test]))
}
# If method=ME, this function acts as the function moduleEigengene from the WGCNA library
if (!is.function(method)) {
if (method == "ME") {
datETOut <- t(moduleEigengenes(t(datET), genes)$eigengenes)
colnames(datETOut) <- colnames(datET)
rownames(datETOut) <- substr(rownames(datETOut), 3, nchar(rownames(datETOut)))
out2 <- cbind(rownames(datETOut), paste("ME", rownames(datETOut), sep = "."))
colnames(out2) <- c("group", "selectedRowID")
out3 <- is.element(rownames(datET_in), "@#$%^&*")
names(out3) <- rownames(datET_in)
return(list(datETcollapsed = datETOut, group2row = out2, selectedRow = out3))
}
}
# Actually run the collapse now!!!
if (!is.null(methodFunction)) {
write("Comment: make sure methodFunction takes a matrix as input.", "")
}
ones <- sort(names(tGenes)[tGenes == 1])
if (connectivityBasedCollapsing) {
twos <- sort(names(tGenes)[tGenes == 2]) # use "method" and connectivity
more <- sort(names(tGenes)[tGenes > 2])
} else {
twos <- sort(names(tGenes)[tGenes > 1]) # only use "method"
more <- character(0)
}
for (g in ones) {
datETOut[g, ] <- as.numeric(datET[probes[genes == g], ])
rowsOut[g] <- probes[genes == g]
}
count <- 0
for (g in twos) {
datETTmp <- datET[probes[genes == g], ]
datETOut[g, ] <- as.numeric(method(datETTmp))
whichTest <- apply(datETTmp, 1, whichTestFn)
rowsOut[g] <- (names(whichTest)[whichTest == max(whichTest)])[1]
count <- count + 1
if (count %% 1000 == 0) collectGarbage()
}
for (g in more) {
datETTmp <- datET[probes[genes == g], ]
adj <- (0.5 + 0.5 * cor(t(datETTmp), use = "p"))^connectivityPower
datETOut[g, ] <- as.numeric(datETTmp[which.max(rowSums(adj, na.rm = TRUE)), ])
whichTest <- apply(datETTmp, 1, whichTestFn)
rowsOut[g] <- (names(whichTest)[whichTest == max(whichTest)])[1]
count <- count + 1
if (count %% 1000 == 0) collectGarbage()
}
if (!is.null(methodFunction)) {
write("...Ignore previous comment. Function completed properly!", "")
}
# Retreive the information about which probes were saved, and include that information
# as part of the output. If method="function" or "Average" output placeholder values.
if (!is.null(methodFunction)) {
out2 <- cbind(rownames(datETOut), paste("function", rownames(datETOut), sep = "."))
colnames(out2) <- c("group", "selectedRowID")
out3 <- is.element(rownames(datET_in), "@#$%^&*")
names(out3) <- rownames(datET_in)
return(list(datETcollapsed = datETOut, group2row = out2, selectedRow = out3))
}
if (methodAverage) {
out2 <- cbind(rownames(datETOut), paste("Average", rownames(datETOut), sep = "."))
colnames(out2) <- c("group", "selectedRowID")
out3 <- is.element(rownames(datET_in), "@#$%^&*")
names(out3) <- rownames(datET_in)
return(list(datETcollapsed = datETOut, group2row = out2, selectedRow = out3))
}
out2 <- cbind(rownames(datETOut), rowsOut)
colnames(out2) <- c("group", "selectedRowID")
out3 <- is.element(rownames(datET_in), rowsOut)
names(out3) <- rownames(datET_in)
output <- list(datETcollapsed = datETOut, group2row = out2, selectedRow = out3)
return(output)
# End of function
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.