.devianceResidual = function(y)
{
event = y[, ncol(y) ];
fit = summary(coxph(y~1, na.action = na.exclude))
CumHazard = predict(fit, type = "expected")
martingale1 = event - CumHazard
deviance0 = ifelse(event == 0, 2 * CumHazard, -2 * log(CumHazard) +
2 * CumHazard - 2)
sign(martingale1) * sqrt(deviance0);
}
.dropThirdDim = function(x)
{
d = dim(x);
dim(x) = c(d[1], d[2]*d[3]);
x;
}
#-----------------------------------------------------------------------------------------------------
#
# Voting linear predictor: given a set of vectors y and a set of vectors x,
# predictor is given by the correlations of y with x[,i]
#
#-----------------------------------------------------------------------------------------------------
votingLinearPredictor = function(x, y, xtest = NULL,
classify = FALSE,
CVfold = 0,
randomSeed = 12345,
assocFnc = "cor", assocOptions = "use = 'p'",
featureWeightPowers = NULL, priorWeights = NULL,
weighByPrediction = 0,
nFeatures.hi = NULL,
nFeatures.lo = NULL,
dropUnusedDimensions = TRUE,
verbose = 2, indent = 0)
{
# Special handling of a survival response
if (is.Surv(y))
{
pr = votingLinearPredictor(x, .devianceResidual(y), xtest, classify = FALSE,
CVfold = CVfold, randomSeed = randomSeed,
assocFnc = assocFnc, assocOptions = assocOptions,
featureWeightPowers = featureWeightPowers, priorWeights = priorWeights,
nFeatures.hi = nFeatures.hi, nFeatures.lo = nFeatures.lo,
dropUnusedDimensions = dropUnusedDimensions,
verbose = verbose, indent = indent);
# may possibly need completion here.
return(pr)
}
# Standard code for a numeric response
ySaved = y;
if (classify)
{
# Convert factors to numeric variables.
if (is.null(dim(y)))
{
ySaved = list(y);
if (classify)
{
originalYLevels = list( sort(unique(y)) );
y = as.factor(y);
}
y = as.matrix(as.numeric(y));
} else {
ySaved = y;
if (classify)
{
y = as.data.frame(lapply(as.data.frame(y), as.factor));
originalYLevels = lapply(as.data.frame(apply(ySaved, 2, unique)), sort);
} else
y = as.data.frame(y);
y = as.matrix(sapply(lapply(y, as.numeric), I));
}
numYLevels = lapply(as.data.frame(apply(y, 2, unique)), sort);
minY = apply(y, 2, min);
maxY = apply(y, 2, max);
} else
y = as.matrix(y);
doTest = !is.null(xtest)
x = as.matrix(x);
nSamples = nrow(y);
nTraits = ncol(y);
nVars = ncol(x);
if (is.null(featureWeightPowers)) featureWeightPowers = 0;
nPredWPowers = length(featureWeightPowers);
if (is.null(rownames(x)))
{
sampleNames = spaste("Sample.", c(1:nSamples));
} else
sampleNames = rownames(x);
if (doTest)
{
xtest = as.matrix(xtest);
nTestSamples = nrow(xtest);
if (is.null(rownames(xtest)))
{
testSampleNames = spaste("testSample.", c(1:nTestSamples));
} else
testSampleNames = rownames(xtest);
if (ncol(x)!=ncol(xtest))
stop("Number of learning and testing predictors (columns of x, xtest) must equal.");
}
if (is.null(colnames(y)))
{
traitNames = spaste("Response.", c(1:nTraits));
} else
traitNames = colnames(y);
if (is.null(colnames(x)))
{
featureNames = spaste("Feature.", c(1:nVars));
} else
featureNames = colnames(x);
powerNames = spaste("Power.", featureWeightPowers);
spaces = indentSpaces(indent);
# If cross-validation is requested, change the whole flow and use a recursive call.
if (CVfold > 0)
{
if (CVfold > nSamples )
{
printFlush("CVfold is larger than nSamples. Will perform leave-one-out cross-validation.");
CVfold = nSamples;
}
ratio = nSamples/CVfold;
if (floor(ratio)!=ratio)
{
smaller =floor(ratio);
nLarger = nSamples - CVfold * smaller
binSizes = c(rep(smaller, CVfold-nLarger), rep(smaller +1, nLarger));
} else
binSizes = rep(ratio, CVfold);
if (!is.null(randomSeed))
{
if (exists(".Random.seed"))
{
saved.seed = .Random.seed;
seedSaved = TRUE;
} else
seedSaved = FALSE;
set.seed(randomSeed);
}
sampleOrder = sample(1:nSamples);
CVpredicted = array(NA, dim = c(nSamples, nTraits, nPredWPowers));
CVbin = rep(0, nSamples);
if (verbose > 0)
{
cat(paste(spaces, "Running cross-validation: "));
if (verbose==1) pind = initProgInd() else printFlush("");
}
ind = 1;
for (cv in 1:CVfold)
{
if (verbose > 1) printFlush(paste("..cross validation bin", cv, "of", CVfold));
end = ind + binSizes[cv] - 1;
samples = sampleOrder[ind:end];
CVbin[samples] = cv;
xCVtrain = x[-samples, , drop = FALSE];
xCVtest = x[samples, , drop = FALSE];
yCVtrain = y[-samples, , drop = FALSE];
yCVtest = y[samples,, drop = FALSE ];
pr = votingLinearPredictor(xCVtrain, yCVtrain, xCVtest,
classify = FALSE,
CVfold = 0,
assocFnc = assocFnc, assocOptions = assocOptions,
featureWeightPowers = featureWeightPowers,
priorWeights = priorWeights,
nFeatures.hi = nFeatures.hi, nFeatures.lo = nFeatures.lo,
dropUnusedDimensions = dropUnusedDimensions,
verbose = verbose - 1, indent = indent + 1);
CVpredicted[samples, , ] = pr$predictedTest;
ind = end + 1;
if (verbose==1) pind = updateProgInd(cv/CVfold, pind);
}
if (verbose==1) printFlush("");
collectGarbage();
}
if (nrow(x)!=nrow(y))
stop("Number of observations in x and y must equal.");
xSD = apply(x, 2, sd, na.rm = TRUE);
validFeatures = xSD > 0;
xMean = apply(x, 2, mean, na.rm = TRUE);
x = scale(x);
if (doTest)
{
xtest = (xtest - matrix(xMean, nTestSamples, nVars, byrow = TRUE) ) /
matrix(xSD, nTestSamples, nVars, byrow = TRUE);
xtest[, !validFeatures] = 0
}
# This prevents NA's generated from zero xSD to contaminate the results
x[, !validFeatures] = 0
xSD[!validFeatures] = 1;
obsMean = apply(y, 2, mean, na.rm = TRUE);
obsSD = apply(y, 2, sd, na.rm = TRUE);
if (sum(!is.finite(obsSD)) > 0)
stop("Something is wrong with given trait: not all standard deviations are finite.");
if (sum(obsSD==0) > 0)
stop("Some of given traits have variance zero. Prediction on such traits will not work.");
y = scale(y);
if (is.null(priorWeights))
{
priorWeights = array(1, dim = c(nTraits, nPredWPowers, nVars));
} else {
dimPW = dim(priorWeights);
if (length(dimPW)<=1)
{
if (length(priorWeights!=nVars))
stop ("priorWeights are a vector - must have length = number of variables (ncol(x)).");
priorWeights = matrix(priorWeights, nrow = nSamples * nPredWPowers, ncol = nVars,
byrow = TRUE);
dim(priorWeights) = c(nTraits, nPredWPowers, nVars);
} else if (length(dimPW)==2)
{
if ((dimPW[1]!=nPredWPowers) | (dimPW[2]!=nVars) )
stop(paste("If priorWeights is two-dimensional, 1st dimension must equal",
"number of predictor weight powers, 2nd must equal number of variables"));
# if (verbose>0) printFlush("..converting dimensions of priorWeights..");
newWeights = array(0, dim = c(nTraits, nPredWPowers, nVars));
for (trait in 1:nTraits)
newWeights[trait, , ] = priorWeights[,];
priorWeights = newWeights;
collectGarbage();
} else if ( (dimPW[1] != nTraits) | (dimPW[3] != nVars) | (dimPW[2] != nPredWPowers) )
stop(paste("priorWeights have incorrect dimensions. Dimensions must be",
"(ncol(y), length(featureWeightPowers), ncol(x))."));
}
varImportance = array(0, dim = c(nTraits, nPredWPowers, nVars));
predictMat = array(0, dim = c(nSamples, nTraits, nPredWPowers));
if (doTest) predictTestMat = array(0, dim = c(nTestSamples, nTraits, nPredWPowers));
corEval = parse(text = paste(assocFnc, "(x, y ", prepComma(assocOptions), ")"));
r = eval(corEval);
if (!is.null(nFeatures.hi))
{
if (is.null(nFeatures.lo)) nFeatures.lo = nFeatures.hi;
# Zero out associations (and therefore weights) for features that do not make the cut
rank = apply(r, 2, rank, na.last = TRUE);
nFinite = colSums(!is.na(r));
for (t in 1:nTraits)
r[ rank[, t]>nFeatures.lo & rank[, t] <= nFinite[t]-nFeatures.hi, t] = 0;
}
r[is.na(r)] = 0;
validationWeights = rep(1, nVars);
if (weighByPrediction > 0)
{
select = c(1:nVars)[rowSums(r!=0) > 0];
nSelect = length(select);
pr = .quickGeneVotingPredictor.CV(x[, select], xtest[, select], c(1:nSelect))
dCV = sqrt(colMeans( (pr$CVpredicted - x[, select])^2, na.rm = TRUE));
dTS = sqrt(colMeans( (pr$predictedTest - xtest[, select])^2, na.rm = TRUE));
dTS[dTS==0] = min(dTS[dTS>0]);
w = (dCV/dTS)^weighByPrediction;
w[w>1] = 1;
validationWeights[select] = w;
}
finiteX = (is.finite(x) + 1)-1
x.fin = x;
x.fin[!is.finite(x)] = 0;
if (doTest)
{
finiteXTest = (is.finite(xtest) + 1)-1;
xtest.fin = xtest;
xtest.fin[!is.finite(xtest)] = 0;
}
for (power in 1:nPredWPowers)
{
prWM = priorWeights[, power, ];
dim(prWM) = c(nTraits, nVars);
weights = abs(r)^featureWeightPowers[power] * t(prWM) * validationWeights;
#weightSum = apply(weights, 2, sum);
weightSum = finiteX %*% weights;
RWeights = sign(r) * weights;
predictMat[ , , power] =
x.fin %*% RWeights / weightSum;
predMean = apply(.dropThirdDim(predictMat[ , , power, drop = FALSE]), 2, mean, na.rm = TRUE);
predSD = apply(.dropThirdDim(predictMat[, , power, drop = FALSE]), 2, sd, na.rm = TRUE);
predictMat[, , power] = scale(predictMat[, , power]) *
matrix(obsSD, nrow = nSamples, ncol = nTraits, byrow = TRUE) +
matrix(obsMean, nrow = nSamples, ncol = nTraits, byrow = TRUE);
if (doTest)
{
weightSum.test = finiteXTest %*% weights;
predictTestMat[, , power] =
(xtest.fin %*% RWeights / weightSum.test -
matrix(predMean, nrow = nTestSamples, ncol = nTraits, byrow = TRUE) ) /
matrix(predSD, nrow = nTestSamples, ncol = nTraits, byrow = TRUE) *
matrix(obsSD, nrow = nTestSamples, ncol = nTraits, byrow = TRUE) +
matrix(obsMean, nrow = nTestSamples, ncol = nTraits, byrow = TRUE);
}
varImportance[ , power, ] = RWeights;
}
dimnames(predictMat) = list(sampleNames, traitNames, powerNames);
if (doTest) dimnames(predictTestMat) = list(testSampleNames, traitNames, powerNames);
dimnames(r) = list(featureNames, traitNames);
dimnames(varImportance) = list(traitNames, powerNames, featureNames);
discretize = function(x, min, max)
{
fac = round(x);
fac[fac < min] = min;
fac[fac > max] = max;
# dim(fac) = dim(x);
fac;
}
trafo = function(x, drop)
{
if (classify)
{
for (power in 1:nPredWPowers) for (t in 1:nTraits)
{
disc = discretize(x[, t, power], minY[t], maxY[t])
x[, t, power] = originalYLevels[[t]] [disc];
}
}
x = x[ , , , drop = drop];
x;
}
out = list(predicted = trafo(predictMat, drop = dropUnusedDimensions),
weightBase = abs(r),
variableImportance = varImportance)
if (doTest) out$predictedTest = trafo(predictTestMat, drop = dropUnusedDimensions)
if (CVfold > 0)
{
dimnames(CVpredicted) = list(sampleNames, traitNames, powerNames);
CVpredFac = trafo(CVpredicted, drop = dropUnusedDimensions)
out$CVpredicted = CVpredFac;
}
out;
}
#=======================================================================================================
#
# Quick linear predictor for genes.
#
#=======================================================================================================
# Assume we have expression data (training and test). Run the prediction
# in a CV-like way. Keep the predictions for CV samples and for the test samples; at the end average the
# test predictions to get one final test prediction.
# In each CV run:
# scale the training data and keep scale
# scale the test data using the training scale
# get correlations of predicted vectors and predictors (note: number of predicted vectors should be
# relatively small, but all genes may be predictors - however, here we can also make some restrictions)
# Form the ensemble of predictors for each gene
# Predict the CV samples and test samples
.quickGeneVotingPredictor = function(x, xtest, predictedIndex, nPredictorGenes = 20, power = 3,
corFnc = "bicor", corOptions = "use = 'p'", verbose = 0)
{
nSamples = nrow(x)
nTestSamples = nrow(xtest);
nGenes = ncol(x)
nPredicted = length(predictedIndex);
if (nPredictorGenes >=nGenes) nPredictorGenes = nGenes -1;
geneMeans = as.numeric(colMeans(x, na.rm = TRUE));
geneSD = as.numeric(sqrt(colMeans(x^2, na.rm = TRUE) - geneMeans^2));
xScaled = (x-matrix(geneMeans, nSamples, nGenes, byrow = TRUE))/
matrix(geneSD, nSamples, nGenes, byrow = TRUE);
xTestScaled = (xtest-matrix(geneMeans, nTestSamples, nGenes, byrow = TRUE))/
matrix(geneSD, nTestSamples, nGenes, byrow = TRUE);
corExpr = parse(text = paste(corFnc, " (xScaled, xScaled[, predictedIndex] ", prepComma(corOptions), ")"));
significance = eval(corExpr);
predictedTest = matrix(NA, nTestSamples, nPredicted)
if (verbose > 0) pind = initProgInd();
for (g in 1:nPredicted)
{
gg = predictedIndex[g];
sigOrder = order(-significance[, g]);
useGenes = sigOrder[2:(nPredictorGenes+1)];
w.base = significance[useGenes, g]
w = w.base * abs(w.base)^(power -1);
bsub = rowSums(xScaled[, useGenes, drop = FALSE] * matrix(w, nSamples, nPredictorGenes, byrow = TRUE));
bsub.scale = sqrt(mean(bsub^2))
predictedTest[, g] = rowSums(xTestScaled[, useGenes, drop = FALSE] *
matrix(w, nTestSamples, nPredictorGenes, byrow = TRUE)) / bsub.scale *
geneSD[gg] + geneMeans[gg]
if (verbose > 0) pind = updateProgInd(g/nPredicted, pind)
}
predictedTest;
}
.quickGeneVotingPredictor.CV = function(x, xtest = NULL, predictedIndex, nPredictorGenes = 20,
power = 3, CVfold = 10,
corFnc = "bicor", corOptions = "use = 'p'")
{
nSamples = nrow(x)
nTestSamples = if (is.null(xtest)) 0 else nrow(xtest);
nGenes = ncol(x)
nPredicted = length(predictedIndex);
ratio = nSamples/CVfold;
if (floor(ratio)!=ratio)
{
smaller =floor(ratio);
nLarger = nSamples - CVfold * smaller
binSizes = c(rep(smaller, CVfold-nLarger), rep(smaller +1, nLarger));
} else
binSizes = rep(ratio, CVfold);
sampleOrder = sample(1:nSamples);
CVpredicted = matrix(NA, nSamples, nPredicted);
if (!is.null(xtest)) predictedTest = matrix(0, nTestSamples, nPredicted) else predictedTest = NULL;
cvStart = 1;
for (cv in 1:CVfold)
{
end = cvStart + binSizes[cv] - 1;
oob = sampleOrder[cvStart:end];
CVx = x[-oob, , drop = FALSE];
if (is.null(xtest)) CVxTest = x[oob, , drop = FALSE] else CVxTest = rbind( x[oob, , drop = FALSE], xtest);
pred = .quickGeneVotingPredictor(CVx, CVxTest, predictedIndex, nPredictorGenes, power, corFnc,
corOptions);
CVpredicted[oob, ] = pred[c(1:binSizes[cv]), ];
if (!is.null(xtest)) predictedTest = predictedTest + pred[-c(1:binSizes[cv]), ];
cvStart = end + 1;
}
if (!is.null(xtest)) predictedTest = predictedTest/CVfold;
list(CVpredicted = CVpredicted, predictedTest= predictedTest);
}
removePrincipalComponents = function(x, n)
{
if (sum(is.na(x)) > 0) x = t(impute.knn(t(x))$data);
svd = svd(x, nu = n, nv = 0);
PCs = as.data.frame(svd$u);
names(PCs) = spaste("PC", c(1:n));
fit = lm(x~., data = PCs);
res = residuals(fit)
res;
}
#========================================================================================================
#
# Lin's network screening correlation functions
#
#========================================================================================================
.corWeighted = function(expr, y, ...) {
modules = blockwiseModules(expr, ...)
MEs = modules$MEs
MEs = MEs[, colnames(MEs)!="MEgrey"]
ns = networkScreening(y, MEs, expr, getQValues=F)
ns$cor.Weighted
}
.corWeighted.new = function(expr, y, ...) {
modules = blockwiseModules(expr, ...)
MEs = modules$MEs
MEs = MEs[, colnames(MEs)!="MEgrey"]
scaledMEs = scale(MEs)
ES = t(as.matrix(cor( y, MEs, use="p")))
weightedAverageME = as.numeric(as.matrix(scaledMEs)%*%ES)/ncol(MEs)
w=0.25
y.new = w*scale(y) + (1-w)*weightedAverageME
GS.new = as.numeric(cor(y.new, expr, use="p"))
GS.new
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.