# Deviance residuals of a right-censored survival response from an intercept-only Cox model.
# Requires the 'survival' package (coxph, is.Surv).
.devianceResidual <- function(y) {
event <- y[, ncol(y)]
fit <- coxph(y ~ 1, na.action = na.exclude)
CumHazard <- predict(fit, type = "expected")
martingale1 <- event - CumHazard
deviance0 <- ifelse(event == 0, 2 * CumHazard, -2 * log(CumHazard) +
2 * CumHazard - 2)
sign(martingale1) * sqrt(deviance0)
}
# Collapse the last two dimensions of a 3-dimensional array into a single column dimension.
.dropThirdDim <- function(x) {
d <- dim(x)
dim(x) <- c(d[1], d[2] * d[3])
x
}
#-----------------------------------------------------------------------------------------------------
#
# Voting linear predictor: given a response matrix y and a feature matrix x, the prediction for each
# response is a correlation-weighted "vote" of the standardized features: each feature contributes in
# proportion to (a power of) its association with the response, with the sign of the association
# setting the direction of its contribution.
#
#-----------------------------------------------------------------------------------------------------
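# Example (not run): a minimal usage sketch on simulated data. All object names, dimensions and
# parameter choices below are illustrative assumptions, not part of the function's interface.
if (FALSE) {
  set.seed(1)
  nSamples <- 50
  nFeatures <- 200
  xExample <- matrix(rnorm(nSamples * nFeatures), nSamples, nFeatures)
  # Response driven by the first 10 features plus noise
  yExample <- rowSums(xExample[, 1:10]) + rnorm(nSamples)
  fit <- votingLinearPredictor(xExample, yExample,
    CVfold = 5,
    featureWeightPowers = c(1, 3), nFeatures.hi = 20
  )
  # Correlation of cross-validated predictions with the observed response, one column per weight power
  cor(fit$CVpredicted, yExample)
}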
votingLinearPredictor <- function(x, y, xtest = NULL,
classify = FALSE,
CVfold = 0,
randomSeed = 12345,
assocFnc = "cor", assocOptions = "use = 'p'",
featureWeightPowers = NULL, priorWeights = NULL,
weighByPrediction = 0,
nFeatures.hi = NULL,
nFeatures.lo = NULL,
dropUnusedDimensions = TRUE,
verbose = 2, indent = 0) {
# Special handling of a survival response
if (is.Surv(y)) {
pr <- votingLinearPredictor(x, .devianceResidual(y), xtest,
classify = FALSE,
CVfold = CVfold, randomSeed = randomSeed,
assocFnc = assocFnc, assocOptions = assocOptions,
featureWeightPowers = featureWeightPowers, priorWeights = priorWeights,
nFeatures.hi = nFeatures.hi, nFeatures.lo = nFeatures.lo,
dropUnusedDimensions = dropUnusedDimensions,
verbose = verbose, indent = indent
)
# may possibly need completion here.
return(pr)
}
# Standard code for a numeric response
ySaved <- y
if (classify) {
# Convert factors to numeric variables.
if (is.null(dim(y))) {
ySaved <- list(y)
originalYLevels <- list(sort(unique(y)))
y <- as.factor(y)
y <- as.matrix(as.numeric(y))
} else {
ySaved <- y
y <- as.data.frame(lapply(as.data.frame(y), as.factor))
originalYLevels <- lapply(as.data.frame(apply(ySaved, 2, unique)), sort)
y <- as.matrix(sapply(lapply(y, as.numeric), I))
}
numYLevels <- lapply(as.data.frame(apply(y, 2, unique)), sort)
minY <- apply(y, 2, min)
maxY <- apply(y, 2, max)
} else {
y <- as.matrix(y)
}
doTest <- !is.null(xtest)
x <- as.matrix(x)
nSamples <- nrow(y)
nTraits <- ncol(y)
nVars <- ncol(x)
if (is.null(featureWeightPowers)) featureWeightPowers <- 0
nPredWPowers <- length(featureWeightPowers)
if (is.null(rownames(x))) {
sampleNames <- spaste("Sample.", c(1:nSamples))
} else {
sampleNames <- rownames(x)
}
if (doTest) {
xtest <- as.matrix(xtest)
nTestSamples <- nrow(xtest)
if (is.null(rownames(xtest))) {
testSampleNames <- spaste("testSample.", c(1:nTestSamples))
} else {
testSampleNames <- rownames(xtest)
}
if (ncol(x) != ncol(xtest)) {
stop("Number of learning and testing predictors (columns of x, xtest) must equal.")
}
}
if (is.null(colnames(y))) {
traitNames <- spaste("Response.", c(1:nTraits))
} else {
traitNames <- colnames(y)
}
if (is.null(colnames(x))) {
featureNames <- spaste("Feature.", c(1:nVars))
} else {
featureNames <- colnames(x)
}
powerNames <- spaste("Power.", featureWeightPowers)
spaces <- indentSpaces(indent)
# If cross-validation is requested, change the whole flow and use a recursive call.
if (CVfold > 0) {
if (CVfold > nSamples) {
printFlush("CVfold is larger than nSamples. Will perform leave-one-out cross-validation.")
CVfold <- nSamples
}
ratio <- nSamples / CVfold
if (floor(ratio) != ratio) {
smaller <- floor(ratio)
nLarger <- nSamples - CVfold * smaller
binSizes <- c(rep(smaller, CVfold - nLarger), rep(smaller + 1, nLarger))
} else {
binSizes <- rep(ratio, CVfold)
}
if (!is.null(randomSeed)) {
if (exists(".Random.seed")) {
saved.seed <- .Random.seed
seedSaved <- TRUE
} else {
seedSaved <- FALSE
}
set.seed(randomSeed)
}
sampleOrder <- sample(1:nSamples)
CVpredicted <- array(NA, dim = c(nSamples, nTraits, nPredWPowers))
CVbin <- rep(0, nSamples)
if (verbose > 0) {
cat(paste(spaces, "Running cross-validation: "))
if (verbose == 1) pind <- initProgInd() else printFlush("")
}
ind <- 1
for (cv in 1:CVfold)
{
if (verbose > 1) printFlush(paste("..cross validation bin", cv, "of", CVfold))
end <- ind + binSizes[cv] - 1
samples <- sampleOrder[ind:end]
CVbin[samples] <- cv
xCVtrain <- x[-samples, , drop = FALSE]
xCVtest <- x[samples, , drop = FALSE]
yCVtrain <- y[-samples, , drop = FALSE]
yCVtest <- y[samples, , drop = FALSE]
pr <- votingLinearPredictor(xCVtrain, yCVtrain, xCVtest,
classify = FALSE,
CVfold = 0,
assocFnc = assocFnc, assocOptions = assocOptions,
featureWeightPowers = featureWeightPowers,
priorWeights = priorWeights,
nFeatures.hi = nFeatures.hi, nFeatures.lo = nFeatures.lo,
dropUnusedDimensions = dropUnusedDimensions,
verbose = verbose - 1, indent = indent + 1
)
CVpredicted[samples, , ] <- pr$predictedTest
ind <- end + 1
if (verbose == 1) pind <- updateProgInd(cv / CVfold, pind)
}
if (verbose == 1) printFlush("")
collectGarbage()
}
if (nrow(x) != nrow(y)) {
stop("Number of observations in x and y must equal.")
}
xSD <- apply(x, 2, sd, na.rm = TRUE)
validFeatures <- xSD > 0
xMean <- apply(x, 2, mean, na.rm = TRUE)
x <- scale(x)
if (doTest) {
xtest <- (xtest - matrix(xMean, nTestSamples, nVars, byrow = TRUE)) /
matrix(xSD, nTestSamples, nVars, byrow = TRUE)
xtest[, !validFeatures] <- 0
}
# This prevents NA's generated from zero xSD to contaminate the results
x[, !validFeatures] <- 0
xSD[!validFeatures] <- 1
obsMean <- apply(y, 2, mean, na.rm = TRUE)
obsSD <- apply(y, 2, sd, na.rm = TRUE)
if (sum(!is.finite(obsSD)) > 0) {
stop("Something is wrong with given trait: not all standard deviations are finite.")
}
if (sum(obsSD == 0) > 0) {
stop("Some of given traits have variance zero. Prediction on such traits will not work.")
}
y <- scale(y)
if (is.null(priorWeights)) {
priorWeights <- array(1, dim = c(nTraits, nPredWPowers, nVars))
} else {
dimPW <- dim(priorWeights)
if (length(dimPW) <= 1) {
if (length(priorWeights) != nVars) {
stop("priorWeights are a vector - must have length = number of variables (ncol(x)).")
}
priorWeights <- matrix(priorWeights,
nrow = nTraits * nPredWPowers, ncol = nVars,
byrow = TRUE
)
dim(priorWeights) <- c(nTraits, nPredWPowers, nVars)
} else if (length(dimPW) == 2) {
if ((dimPW[1] != nPredWPowers) | (dimPW[2] != nVars)) {
stop(paste(
"If priorWeights is two-dimensional, 1st dimension must equal",
"number of predictor weight powers, 2nd must equal number of variables"
))
}
# if (verbose>0) printFlush("..converting dimensions of priorWeights..");
newWeights <- array(0, dim = c(nTraits, nPredWPowers, nVars))
for (trait in 1:nTraits) {
newWeights[trait, , ] <- priorWeights[, ]
}
priorWeights <- newWeights
collectGarbage()
} else if ((dimPW[1] != nTraits) | (dimPW[3] != nVars) | (dimPW[2] != nPredWPowers)) {
stop(paste(
"priorWeights have incorrect dimensions. Dimensions must be",
"(ncol(y), length(featureWeightPowers), ncol(x))."
))
}
}
varImportance <- array(0, dim = c(nTraits, nPredWPowers, nVars))
predictMat <- array(0, dim = c(nSamples, nTraits, nPredWPowers))
if (doTest) predictTestMat <- array(0, dim = c(nTestSamples, nTraits, nPredWPowers))
corEval <- parse(text = paste(assocFnc, "(x, y ", prepComma(assocOptions), ")"))
r <- eval(corEval)
if (!is.null(nFeatures.hi)) {
if (is.null(nFeatures.lo)) nFeatures.lo <- nFeatures.hi
# Zero out associations (and therefore weights) for features that do not make the cut
rank <- apply(r, 2, rank, na.last = TRUE)
nFinite <- colSums(!is.na(r))
for (t in 1:nTraits) {
r[rank[, t] > nFeatures.lo & rank[, t] <= nFinite[t] - nFeatures.hi, t] <- 0
}
}
r[is.na(r)] <- 0
validationWeights <- rep(1, nVars)
# Optionally down-weight features whose test-set values are predicted less accurately (from the other
# selected features) than their cross-validated training values; this step uses xtest, so it requires
# test data to be supplied.
if (weighByPrediction > 0) {
select <- c(1:nVars)[rowSums(r != 0) > 0]
nSelect <- length(select)
pr <- .quickGeneVotingPredictor.CV(x[, select], xtest[, select], c(1:nSelect))
dCV <- sqrt(colMeans((pr$CVpredicted - x[, select])^2, na.rm = TRUE))
dTS <- sqrt(colMeans((pr$predictedTest - xtest[, select])^2, na.rm = TRUE))
dTS[dTS == 0] <- min(dTS[dTS > 0])
w <- (dCV / dTS)^weighByPrediction
w[w > 1] <- 1
validationWeights[select] <- w
}
# 0/1 indicator of finite (non-missing) entries, used to normalize the weight sums when x contains NAs
finiteX <- (is.finite(x) + 1) - 1
x.fin <- x
x.fin[!is.finite(x)] <- 0
if (doTest) {
finiteXTest <- (is.finite(xtest) + 1) - 1
xtest.fin <- xtest
xtest.fin[!is.finite(xtest)] <- 0
}
for (power in 1:nPredWPowers)
{
prWM <- priorWeights[, power, ]
dim(prWM) <- c(nTraits, nVars)
weights <- abs(r)^featureWeightPowers[power] * t(prWM) * validationWeights
# weightSum = apply(weights, 2, sum);
weightSum <- finiteX %*% weights
RWeights <- sign(r) * weights
predictMat[, , power] <-
x.fin %*% RWeights / weightSum
predMean <- apply(.dropThirdDim(predictMat[, , power, drop = FALSE]), 2, mean, na.rm = TRUE)
predSD <- apply(.dropThirdDim(predictMat[, , power, drop = FALSE]), 2, sd, na.rm = TRUE)
predictMat[, , power] <- scale(predictMat[, , power]) *
matrix(obsSD, nrow = nSamples, ncol = nTraits, byrow = TRUE) +
matrix(obsMean, nrow = nSamples, ncol = nTraits, byrow = TRUE)
if (doTest) {
weightSum.test <- finiteXTest %*% weights
predictTestMat[, , power] <-
(xtest.fin %*% RWeights / weightSum.test -
matrix(predMean, nrow = nTestSamples, ncol = nTraits, byrow = TRUE)) /
matrix(predSD, nrow = nTestSamples, ncol = nTraits, byrow = TRUE) *
matrix(obsSD, nrow = nTestSamples, ncol = nTraits, byrow = TRUE) +
matrix(obsMean, nrow = nTestSamples, ncol = nTraits, byrow = TRUE)
}
varImportance[, power, ] <- RWeights
}
dimnames(predictMat) <- list(sampleNames, traitNames, powerNames)
if (doTest) dimnames(predictTestMat) <- list(testSampleNames, traitNames, powerNames)
dimnames(r) <- list(featureNames, traitNames)
dimnames(varImportance) <- list(traitNames, powerNames, featureNames)
discretize <- function(x, min, max) {
fac <- round(x)
fac[fac < min] <- min
fac[fac > max] <- max
# dim(fac) = dim(x);
fac
}
trafo <- function(x, drop) {
if (classify) {
for (power in 1:nPredWPowers) {
for (t in 1:nTraits)
{
disc <- discretize(x[, t, power], minY[t], maxY[t])
x[, t, power] <- originalYLevels[[t]][disc]
}
}
}
x <- x[, , , drop = drop]
x
}
out <- list(
predicted = trafo(predictMat, drop = dropUnusedDimensions),
weightBase = abs(r),
variableImportance = varImportance
)
if (doTest) out$predictedTest <- trafo(predictTestMat, drop = dropUnusedDimensions)
if (CVfold > 0) {
dimnames(CVpredicted) <- list(sampleNames, traitNames, powerNames)
CVpredFac <- trafo(CVpredicted, drop = dropUnusedDimensions)
out$CVpredicted <- CVpredFac
}
out
}
# =======================================================================================================
#
# Quick linear predictor for genes.
#
# =======================================================================================================
# Assume we have expression data (training and test). Run the prediction in a CV-like way: keep the
# predictions for the held-out CV samples and for the test samples, and at the end average the test
# predictions across folds to get one final test prediction.
# In each CV run:
#  - scale the training data and keep the scaling parameters
#  - scale the test data using the training scale
#  - get correlations of the predicted vectors with the predictors (the number of predicted vectors
#    should be relatively small, but all genes may serve as predictors, possibly with restrictions)
#  - form the ensemble of predictors for each gene
#  - predict the CV samples and the test samples
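# Example (not run): a minimal sketch of the CV flow described above, using simulated expression
# data. The data dimensions, the genes chosen for prediction, and the parameter settings are
# illustrative assumptions only.
if (FALSE) {
  set.seed(4)
  xTrain <- matrix(rnorm(30 * 100), 30, 100)
  xTest <- matrix(rnorm(10 * 100), 10, 100)
  # Predict the first 5 genes from the remaining genes, with 10-fold cross-validation
  pred <- .quickGeneVotingPredictor.CV(xTrain, xTest,
    predictedIndex = 1:5,
    nPredictorGenes = 20, CVfold = 10
  )
  str(pred$CVpredicted) # cross-validated predictions for the training samples
  str(pred$predictedTest) # fold-averaged predictions for the test samples
}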
.quickGeneVotingPredictor <- function(x, xtest, predictedIndex, nPredictorGenes = 20, power = 3,
corFnc = "bicor", corOptions = "use = 'p'", verbose = 0) {
nSamples <- nrow(x)
nTestSamples <- nrow(xtest)
nGenes <- ncol(x)
nPredicted <- length(predictedIndex)
if (nPredictorGenes >= nGenes) nPredictorGenes <- nGenes - 1
geneMeans <- as.numeric(colMeans(x, na.rm = TRUE))
geneSD <- as.numeric(sqrt(colMeans(x^2, na.rm = TRUE) - geneMeans^2))
xScaled <- (x - matrix(geneMeans, nSamples, nGenes, byrow = TRUE)) /
matrix(geneSD, nSamples, nGenes, byrow = TRUE)
xTestScaled <- (xtest - matrix(geneMeans, nTestSamples, nGenes, byrow = TRUE)) /
matrix(geneSD, nTestSamples, nGenes, byrow = TRUE)
corExpr <- parse(text = paste(corFnc, " (xScaled, xScaled[, predictedIndex] ", prepComma(corOptions), ")"))
significance <- eval(corExpr)
predictedTest <- matrix(NA, nTestSamples, nPredicted)
if (verbose > 0) pind <- initProgInd()
for (g in 1:nPredicted)
{
gg <- predictedIndex[g]
sigOrder <- order(-significance[, g])
useGenes <- sigOrder[2:(nPredictorGenes + 1)]
w.base <- significance[useGenes, g]
w <- w.base * abs(w.base)^(power - 1)
bsub <- rowSums(xScaled[, useGenes, drop = FALSE] * matrix(w, nSamples, nPredictorGenes, byrow = TRUE))
bsub.scale <- sqrt(mean(bsub^2))
predictedTest[, g] <- rowSums(xTestScaled[, useGenes, drop = FALSE] *
matrix(w, nTestSamples, nPredictorGenes, byrow = TRUE)) / bsub.scale *
geneSD[gg] + geneMeans[gg]
if (verbose > 0) pind <- updateProgInd(g / nPredicted, pind)
}
predictedTest
}
.quickGeneVotingPredictor.CV <- function(x, xtest = NULL, predictedIndex, nPredictorGenes = 20,
power = 3, CVfold = 10,
corFnc = "bicor", corOptions = "use = 'p'") {
nSamples <- nrow(x)
nTestSamples <- if (is.null(xtest)) 0 else nrow(xtest)
nGenes <- ncol(x)
nPredicted <- length(predictedIndex)
ratio <- nSamples / CVfold
if (floor(ratio) != ratio) {
smaller <- floor(ratio)
nLarger <- nSamples - CVfold * smaller
binSizes <- c(rep(smaller, CVfold - nLarger), rep(smaller + 1, nLarger))
} else {
binSizes <- rep(ratio, CVfold)
}
sampleOrder <- sample(1:nSamples)
CVpredicted <- matrix(NA, nSamples, nPredicted)
if (!is.null(xtest)) predictedTest <- matrix(0, nTestSamples, nPredicted) else predictedTest <- NULL
cvStart <- 1
for (cv in 1:CVfold)
{
end <- cvStart + binSizes[cv] - 1
oob <- sampleOrder[cvStart:end]
CVx <- x[-oob, , drop = FALSE]
if (is.null(xtest)) CVxTest <- x[oob, , drop = FALSE] else CVxTest <- rbind(x[oob, , drop = FALSE], xtest)
pred <- .quickGeneVotingPredictor(
CVx, CVxTest, predictedIndex, nPredictorGenes, power, corFnc,
corOptions
)
CVpredicted[oob, ] <- pred[c(1:binSizes[cv]), ]
if (!is.null(xtest)) predictedTest <- predictedTest + pred[-c(1:binSizes[cv]), ]
cvStart <- end + 1
}
if (!is.null(xtest)) predictedTest <- predictedTest / CVfold
list(CVpredicted = CVpredicted, predictedTest = predictedTest)
}
# Remove the leading n principal components from the columns of x: missing entries are imputed with
# impute.knn (from the 'impute' package), the first n left singular vectors are computed, and the
# residuals of regressing each column of x on these components are returned.
removePrincipalComponents <- function(x, n) {
if (sum(is.na(x)) > 0) x <- t(impute.knn(t(x))$data)
svd <- svd(x, nu = n, nv = 0)
PCs <- as.data.frame(svd$u)
names(PCs) <- spaste("PC", c(1:n))
fit <- lm(x ~ ., data = PCs)
res <- residuals(fit)
res
}
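# Example (not run): a minimal sketch of removing the two leading principal components from a
# simulated data matrix; the object names and dimensions are illustrative assumptions.
if (FALSE) {
  set.seed(2)
  exprExample <- matrix(rnorm(40 * 100), 40, 100)
  adjusted <- removePrincipalComponents(exprExample, n = 2)
  # The leading components of the adjusted data should now explain much less variance
  prcomp(adjusted)$sdev[1:5]
}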
# ========================================================================================================
#
# Lin's network screening correlation functions
#
# ========================================================================================================
# Weighted gene significance via network screening: build modules with blockwiseModules, drop the
# grey (unassigned) module eigengene, and return the cor.Weighted statistic from networkScreening.
.corWeighted <- function(expr, y, ...) {
modules <- blockwiseModules(expr, ...)
MEs <- modules$MEs
MEs <- MEs[, colnames(MEs) != "MEgrey"]
ns <- networkScreening(y, MEs, expr, getQValues = FALSE)
ns$cor.Weighted
}
# Variant: form a new trait as a weighted combination of the scaled trait and the correlation-weighted
# average module eigengene, then return the correlations of all genes with this new trait.
.corWeighted.new <- function(expr, y, ...) {
modules <- blockwiseModules(expr, ...)
MEs <- modules$MEs
MEs <- MEs[, colnames(MEs) != "MEgrey"]
scaledMEs <- scale(MEs)
ES <- t(as.matrix(cor(y, MEs, use = "p")))
weightedAverageME <- as.numeric(as.matrix(scaledMEs) %*% ES) / ncol(MEs)
w <- 0.25
y.new <- w * scale(y) + (1 - w) * weightedAverageME
GS.new <- as.numeric(cor(y.new, expr, use = "p"))
GS.new
}
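# Example (not run): a minimal sketch of computing the weighted network-screening correlation on
# simulated data. The soft-thresholding power, module size and data dimensions are illustrative
# assumptions, and the sketch presumes that blockwiseModules finds at least one non-grey module.
if (FALSE) {
  set.seed(3)
  exprExample <- matrix(rnorm(60 * 500), 60, 500)
  traitExample <- rnorm(60)
  gsWeighted <- .corWeighted(exprExample, traitExample, power = 6, minModuleSize = 20)
  head(gsWeighted)
}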