# nearest centroid predictor
# 09: Add weighted measures of sample-centroid similarity.
#     Fix CVfold such that leave-one-out cross-validation works.
# 08: Change the way sample clustering is called: use a do.call.
# 07: Modify bagging and add boosting.
#     - Revert the predictor to a state where it can only take a single trait and a single number
#       of features.
#     - Make sure the predictor respects nFeatures=0.
# 06: - Add new arguments assocCut.hi and assocCut.lo.
#     - Make nNetworkFeatures equal nFeatures.
#     - Bagging and boosting are broken.
# 05: Add bagging.
# 04: Add a self-tuning version.
# 03: Try to build a sample network predictor.
#     Cluster the samples in each class separately and identify clusters. It is an open question
#     whether the cluster identification can be automated completely. In any case, use the clusters
#     as additional centroids and, for prediction, assign each sample the class of its nearest
#     centroid.
# 02: Add the option to use a quantile of class distances instead of the distance from the centroid.
# Negative squared Euclidean distance between columns of x and y, used as a similarity
# (larger = more similar). Assumes y has few columns; the distance matrix between columns of x and
# columns of y is computed in a loop over the columns of y.
.euclideanDist.forNCP = function(x, y, use = 'p')
{
# 'use' is accepted for call compatibility with cor-like similarity functions but is not used.
x = as.matrix(x);
y = as.matrix(y);
ny = ncol(y);
diff = matrix(NA, ncol(x), ny);
for (cy in 1:ny)
diff[, cy] = apply( (x - y[, cy])^2, 2, sum, na.rm = TRUE);
# Return the negative so that larger values mean greater similarity.
-diff;
}
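# A minimal illustration (toy matrices, shown as a comment so it is not executed when this file is
# sourced): columns of y that exactly match columns of x get distance 0, i.e. maximal similarity:
#   x = matrix(rnorm(40), 8, 5); y = x[, c(2, 4)];
#   sim = .euclideanDist.forNCP(x, y);   # 5 x 2 similarity matrix
#   apply(sim, 2, which.max)             # c(2, 4)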
#=====================================================================================================
#
# Main predictor function.
#
#=====================================================================================================
# Best suited to prediction of factors.
# Classification: for each level, find the features that best distinguish the level from all other
# levels. Note that with more than two levels this is imperfect: the distinguishing sets for all
# levels must be pooled to form the profiles, so features with no relationship to a given level may
# be included. This could be fixed with a more roundabout analysis, but is left as-is for now.
# To do: check that the CVfold validation split makes sense, i.e., that no bin contains all
# observations of any class.
# Could also add robust standardization.
# Output a measure of similarity to class centroids; same for the gene voting predictor.
# Prediction for heterogeneous cases: build sample networks in the training cases, get modules and
# eigensamples, do the same in the controls, then use all centroids for classification, somewhere
# between k nearest neighbors and the nearest centroid.
# CAUTION: the function standardizes each gene (unless standardization is turned off), so the
# sample networks may differ from what would be expected from the supplied data.
# Internally the function works with numeric entries only. The corresponding levels of the response
# are saved and restored at the very end.
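# A minimal standalone sketch of the feature selection rule applied below when assocCut.hi and
# assocCut.lo are NULL: rank features by signed association with y and keep the nFeatures.lo most
# negatively and nFeatures.hi most positively associated ones. The helper name is hypothetical and
# the function is not used below; the real code additionally drops features with undefined
# association or zero variance.
.selectTwoTailFeatures.sketch = function(x, y, nFeatures.hi = 10, nFeatures.lo = 10)
{
signif = as.vector(cor(x, y, use = 'p'));
rank = order(signif); # ascending: most negative association first
lo = if (nFeatures.lo > 0) rank[1:nFeatures.lo] else c();
hi = if (nFeatures.hi > 0) rank[(length(rank) - nFeatures.hi + 1):length(rank)] else c();
unique(c(lo, hi)); # column indices of the selected features
}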
nearestCentroidPredictor = function(
# Input training and test data
x, y,
xtest = NULL,
# Feature weights and selection criteria
featureSignificance = NULL,
assocFnc = "cor", assocOptions = "use = 'p'",
assocCut.hi = NULL, assocCut.lo = NULL,
nFeatures.hi = 10, nFeatures.lo = 10,
weighFeaturesByAssociation = 0,
scaleFeatureMean = TRUE, scaleFeatureVar = TRUE,
# Predictor options
centroidMethod = c("mean", "eigensample"),
simFnc = "cor", simOptions = "use = 'p'",
useQuantile = NULL,
sampleWeights = NULL,
weighSimByPrediction = 0,
# What should be returned
CVfold = 0, returnFactor = FALSE,
# General options
randomSeed = 12345,
verbose = 2, indent = 0)
{
# For now we do not support regression
centroidMethod = match.arg(centroidMethod);
if (simFnc=="dist")
{
if (verbose > 0)
printFlush(paste("NearestCentroidPredictor: 'dist' is changed to a suitable",
"Euclidean distance function.\n",
" Note: simOptions will be disregarded."));
simFnc = ".euclideanDist.forNCP";
simOptions = "use = 'p'"
}
# Convert factors to numeric variables.
ySaved = y;
#if (classify)
#{
originalYLevels = sort(unique(y));
y = as.numeric(as.factor(y));
#}
x = as.matrix(x);
doTest = !is.null(xtest)
if (doTest)
{
xtest = as.matrix(xtest);
nTestSamples = nrow(xtest);
if (ncol(x)!=ncol(xtest))
stop("Number of learning and testing predictors (columns of x, xtest) must equal.");
} else {
if (weighSimByPrediction > 0)
stop("weighting similarity by prediction is not possible when xtest = NULL.");
nTestSamples = 0;
}
numYLevels = sort(unique(y));
minY = min(y);
maxY = max(y);
nSamples = length(y);
nVars = ncol(x);
if (!is.null(assocCut.hi))
{
if (is.null(assocCut.lo)) assocCut.lo = -assocCut.hi;
}
spaces = indentSpaces(indent);
if (!is.null(useQuantile))
{
if ( (useQuantile < 0) | (useQuantile > 1) )
stop("If 'useQuantile' is given, it must be between 0 and 1.");
}
if (is.null(sampleWeights)) sampleWeights = rep(1, nSamples);
# If cross-validation is requested, change the whole flow and use a recursive call.
if (CVfold > 0)
{
if (CVfold > nSamples )
{
printFlush("'CVfold' is larger than number of samples. Will perform leave-one-out cross-validation.");
CVfold = nSamples;
}
ratio = nSamples/CVfold;
if (floor(ratio)!=ratio)
{
smaller = floor(ratio);
nLarger = nSamples - CVfold * smaller
binSizes = c(rep(smaller, CVfold-nLarger), rep(smaller +1, nLarger));
} else
binSizes = rep(ratio, CVfold);
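# Example: nSamples = 10 and CVfold = 3 gives binSizes = c(3, 3, 4); the first CVfold - nLarger
# bins get floor(nSamples/CVfold) samples and the remaining bins one more.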
if (!is.null(randomSeed))
{
if (exists(".Random.seed"))
{
saved.seed = .Random.seed;
seedSaved = TRUE;
} else
seedSaved = FALSE;
set.seed(randomSeed);
}
sampleOrder = sample(1:nSamples);
CVpredicted = rep(NA, nSamples);
CVbin = rep(0, nSamples);
if (verbose > 0)
{
cat(paste(spaces, "Running cross-validation: "));
if (verbose==1) pind = initProgInd() else printFlush("");
}
if (!is.null(featureSignificance))
printFlush(paste("Warning in nearestCentroidPredictor: \n",
" cross-validation will be biased if featureSignificance was derived",
"from training data."));
ind = 1;
for (cv in 1:CVfold)
{
if (verbose > 1) printFlush(paste("..cross validation bin", cv, "of", CVfold));
end = ind + binSizes[cv] - 1;
samples = sampleOrder[ind:end];
CVbin[samples] = cv;
xCVtrain = x[-samples, , drop = FALSE];
xCVtest = x[samples, , drop = FALSE];
yCVtrain = y[-samples];
yCVtest = y[samples];
CVsampleWeights = sampleWeights[-samples];
pr = nearestCentroidPredictor(xCVtrain, yCVtrain, xCVtest,
#classify = classify,
featureSignificance = featureSignificance,
assocCut.hi = assocCut.hi,
assocCut.lo = assocCut.lo,
nFeatures.hi = nFeatures.hi,
nFeatures.lo = nFeatures.lo,
useQuantile = useQuantile,
sampleWeights = CVsampleWeights,
CVfold = 0, returnFactor = FALSE,
randomSeed = randomSeed,
centroidMethod = centroidMethod,
assocFnc = assocFnc, assocOptions = assocOptions,
scaleFeatureMean = scaleFeatureMean,
scaleFeatureVar = scaleFeatureVar,
simFnc = simFnc, simOptions = simOptions,
weighFeaturesByAssociation = weighFeaturesByAssociation,
weighSimByPrediction = weighSimByPrediction,
verbose = verbose - 2, indent = indent + 1)
CVpredicted[samples] = pr$predictedTest;
ind = end + 1;
if (verbose==1) pind = updateProgInd(cv/CVfold, pind);
}
if (verbose==1) printFlush("");
}
if (nrow(x)!=length(y))
stop("Number of observations in x and y must equal.");
# Feature selection:
xWeighted = x * sampleWeights;
yWeighted = y * sampleWeights;
if (is.null(featureSignificance))
{
corEval = parse(text = paste(assocFnc, "(xWeighted, yWeighted, ", assocOptions, ")"));
featureSignificance = as.vector(eval(corEval));
} else {
if (length(featureSignificance)!=nVars)
stop("Given 'featureSignificance' has incorrect length (must be nFeatures).");
}
nGood = nVars;
nNA = sum(is.na(featureSignificance));
testCentroidSimilarities = list();
xSD = apply(x, 2, sd, na.rm = TRUE);
keep = is.finite(featureSignificance) & (xSD>0);
nKeep = sum(keep);
keepInd = c(1:nVars)[keep];
order = order(featureSignificance[keep]);
levels = sort(unique(y));
nLevels = length(levels);
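# Features are ranked by signed association in ascending order, so the bottom of the ranking holds
# the most negatively associated features and the top the most positively associated ones;
# nFeatures.lo and nFeatures.hi (or assocCut.lo and assocCut.hi) select from the two ends.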
if (is.null(assocCut.hi))
{
nf = c(nFeatures.hi, nFeatures.lo);
if (nf[2] > 0) ind1 = c(1:nf[2]) else ind1 = c();
if (nf[1] > 0) ind2 = c((nKeep-nf[1] + 1):nKeep) else ind2 = c();
indexSelect = unique(c(ind1, ind2));
if (length(indexSelect) < 1)
stop("No features were selected. At least one of 'nFeatures.hi', 'nFeatures.lo' must be nonzero.");
indexSelect = indexSelect[indexSelect > 0];
select = keepInd[order[indexSelect]];
} else {
indexSelect = (1:nKeep)[ featureSignificance[keep] >= assocCut.hi |
featureSignificance[keep] <= assocCut.lo ]
if (length(indexSelect)<2)
stop(paste("'assocCut.hi'", assocCut.hi, "and 'assocCut.lo'", assocCut.lo,
"are too stringent: fewer than 2 features were selected.\n",
"Please relax the cutoffs."));
select = keepInd[indexSelect];
}
if ((length(select) < 3) && (simFnc!='dist'))
{
stop(paste("Fewer than 3 features were selected. Please either relax",
"the selection criteria or use simFnc = 'dist'."));
}
selectedFeatures = select;
nSelect = length(select);
xSel = x[, select];
selectSignif = featureSignificance[select];
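# Standardize the selected features on the training data, recording the transformation (xMean,
# xSD) so that exactly the same shift and scale can be applied to the test data below.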
if (scaleFeatureMean)
{
if (scaleFeatureVar)
{
xSD = apply(xSel, 2, sd, na.rm = TRUE);
} else
xSD = rep(1, nSelect);
xMean = apply(xSel, 2, mean, na.rm = TRUE);
} else {
if (scaleFeatureVar)
{
# Root-mean-square about zero; this matches the scaling that scale() applies when
# center = FALSE and scale = TRUE.
xSD = sqrt( apply(xSel^2, 2, sum, na.rm = TRUE) /
pmax(apply(!is.na(xSel), 2, sum) - 1, rep(1, nSelect)) );
} else
xSD = rep(1, nSelect);
xMean = rep(0, nSelect);
}
xSel = scale(xSel, center = scaleFeatureMean, scale = scaleFeatureVar);
if (doTest)
{
xtestSel = xtest[, select];
xtestSel = (xtestSel - matrix(xMean, nTestSamples, nSelect, byrow = TRUE) ) /
matrix(xSD, nTestSamples, nSelect, byrow = TRUE);
} else
xtestSel = NULL;
xWeighted = xSel * sampleWeights;
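# Optionally weigh features by how well they are predicted from the other features: dCV is the
# cross-validated per-feature prediction error on the training data, dTS the error on the test
# data. Features that predict substantially worse on the test data (dTS > dCV) receive weights
# below 1.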
if (weighSimByPrediction > 0)
{
pr = .quickGeneVotingPredictor.CV(xSel, xtestSel, c(1:nSelect))
dCV = sqrt(colMeans( (pr$CVpredicted - xSel)^2, na.rm = TRUE));
dTS = sqrt(colMeans( (pr$predictedTest - xtestSel)^2, na.rm = TRUE));
dTS[dTS==0] = min(dTS[dTS>0]);
validationWeight = (dCV/dTS)^weighSimByPrediction;
validationWeight[validationWeight > 1] = 1;
} else
validationWeight = rep(1, nSelect);
nTestSamples = if (doTest) nrow(xtest) else 0;
predicted = rep(0, nSamples);
predictedTest = rep(0, nTestSamples);
clusterLabels = list();
clusterNumbers = list();
if (centroidMethod=="eigensample")
{
# Eigensample centroids require complete data; impute missing entries using impute.knn
# (package impute).
if (sum(is.na(xSel)) > 0)
{
xImp = t(impute.knn(t(xSel), k = min(10, nSelect - 1))$data);
} else
xImp = xSel;
if (doTest && sum(is.na(xtestSel))>0)
{
xtestImp = t(impute.knn(t(xtestSel), k = min(10, nSelect - 1))$data);
} else
xtestImp = xtestSel;
}
clusterNumbers = rep(1, nLevels);
sampleModules = list();
# Trivial cluster labels: clusters equal case classes
for (l in 1:nLevels)
clusterLabels[[l]] = rep(l, sum(y==levels[l]))
nClusters = sum(clusterNumbers);
centroidSimilarities = array(NA, dim = c(nSamples, nClusters));
testCentroidSimilarities = array(NA, dim = c(nTestSamples, nClusters));
#if (classify)
#{
cluster2level = rep(c(1:nLevels), clusterNumbers);
featureWeight = validationWeight;
if (is.null(useQuantile))
{
# Form centroid profiles for each cluster and class
centroidProfiles = array(0, dim = c(nSelect, nClusters));
for (cl in 1:nClusters)
{
l = cluster2level[cl];
clusterSamples = c(1:nSamples)[ y==l ] [ clusterLabels[[l]]==cl ];
if (centroidMethod=="mean")
{
centroidProfiles[, cl] = apply(xSel[clusterSamples, , drop = FALSE],
2, mean, na.rm = TRUE);
} else if (centroidMethod=="eigensample")
{
# Use the imputed data here: svd does not tolerate missing values.
cp = svd(xImp[clusterSamples, ], nu = 0, nv = 1)$v[, 1];
cor = cor(t(xImp[clusterSamples, ]), cp);
# Orient the eigensample so that it correlates positively with most cluster samples.
if (sum(cor, na.rm = TRUE) < 0) cp = -cp;
centroidProfiles[, cl] = cp;
}
}
if (weighFeaturesByAssociation > 0)
featureWeight = featureWeight * sqrt(abs(selectSignif)^weighFeaturesByAssociation);
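# wcps is nSelect x nClusters and wxSel is nSelect x nSamples, so simFnc (e.g. cor) applied to the
# two yields an nClusters x nSamples matrix of sample-centroid similarities.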
# Back-substitution prediction:
wcps = centroidProfiles * featureWeight;
wxSel = t(xSel) * featureWeight;
distExpr = spaste( simFnc, "( wcps, wxSel, ", simOptions, ")");
sample.centroidSim = eval(parse(text = distExpr));
# Actual prediction: for each sample, calculate distances to centroid profiles
if (doTest)
{
wxtestSel = t(xtestSel) * featureWeight
distExpr = spaste( simFnc, "( wcps, wxtestSel, ", simOptions, ")");
testSample.centroidSim = eval(parse(text = distExpr));
}
} else {
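# With useQuantile set there is no single centroid: the similarity of a sample to a class is taken
# as a quantile of its similarities to the individual training samples of that class (colQuantileC
# below), which makes the prediction less sensitive to within-class heterogeneity.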
labelVector = y;
for (l in 1:nLevels)
labelVector[y==l] = clusterLabels[[l]];
keepSamples = labelVector!=0;
nKeepSamples = sum(keepSamples);
keepLabels = labelVector[keepSamples];
if (weighFeaturesByAssociation > 0)
featureWeight = featureWeight * sqrt(abs(selectSignif)^weighFeaturesByAssociation);
wxSel = t(xSel) * featureWeight;
wxSel.keepSamples = t(xSel[keepSamples, ]) * featureWeight;
# Back-substitution prediction:
distExpr = spaste( simFnc, "( wxSel.keepSamples, wxSel, ", simOptions, ")");
dst = eval(parse(text = distExpr));
sample.centroidSim = matrix(0, nClusters, nSamples);
# Test prediction:
if (doTest)
{
wxtestSel = t(xtestSel) * featureWeight;
distExpr = spaste( simFnc, "( wxSel.keepSamples, wxtestSel, ", simOptions, ")");
dst.test = eval(parse(text = distExpr));
testSample.centroidSim = matrix(0, nClusters, nTestSamples);
}
for (l in 1:nClusters)
{
lSamples = c(1:nKeepSamples)[keepLabels==l];
sample.centroidSim[l, ] = colQuantileC(dst[lSamples, ], 1-useQuantile);
if (doTest)
testSample.centroidSim[l, ] = colQuantileC(dst.test[lSamples, ], 1-useQuantile);
}
}
centroidSimilarities = t(sample.centroidSim);
prediction = cluster2level[apply(sample.centroidSim, 2, which.max)];
# Save predictions
predicted = prediction;
if (doTest)
{
testCentroidSimilarities = t(testSample.centroidSim);
testprediction = cluster2level[apply(testSample.centroidSim, 2, which.max)];
predictedTest = testprediction;
}
#} else
# stop("Prediction for continuous variables is not implemented yet. Sorry!");
# Reformat output if factors are to be returned
if (returnFactor)
{
predicted.out = factor(originalYLevels[predicted]);
if (doTest) predictedTest.out = factor(originalYLevels[predictedTest]);
if (CVfold > 0)
CVpredicted.out = factor(originalYLevels[CVpredicted]);
} else {
# Turn ordinal predictions into levels of input traits
predicted.out = originalYLevels[predicted];
if (doTest) predictedTest.out = originalYLevels[predictedTest];
if (CVfold > 0)
CVpredicted.out = originalYLevels[CVpredicted];
}
out = list(predicted = predicted.out,
predictedTest = if (doTest) predictedTest.out else NULL,
featureSignificance = featureSignificance,
selectedFeatures = selectedFeatures,
centroidProfiles = if (is.null(useQuantile)) centroidProfiles else NULL,
testSample2centroidSimilarities = if (doTest) testCentroidSimilarities else NULL,
featureValidationWeights = validationWeight
)
if (CVfold > 0)
out$CVpredicted = CVpredicted.out;
out;
}
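#=====================================================================================================
#
# Usage sketch
#
#=====================================================================================================
# A minimal, illustrative example, not part of the predictor; it is wrapped in if (FALSE) so that
# sourcing this file does not execute it. It assumes the WGCNA package is loaded (which provides
# helpers used above such as printFlush, spaste, indentSpaces, initProgInd and colQuantileC) and
# uses made-up toy data with a signal in the first 10 features.
if (FALSE)
{
set.seed(1);
nSamples = 60; nGenes = 200;
y = rep(c("control", "case"), each = nSamples/2);
x = matrix(rnorm(nSamples * nGenes), nSamples, nGenes);
x[y=="case", 1:10] = x[y=="case", 1:10] + 1;
xtest = matrix(rnorm(20 * nGenes), 20, nGenes);
xtest[1:10, 1:10] = xtest[1:10, 1:10] + 1;
pr = nearestCentroidPredictor(x, y, xtest, CVfold = 3);
table(pr$predicted, y); # back-substitution (training) accuracy
table(pr$CVpredicted, y); # cross-validated accuracy
}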