nlcv <- function(eset,
                 classVar = "type",
                 nRuns = 2,          # total number of runs, i.e. number of splits into training and test set
                 propTraining = 2/3, # proportion of the data used as training set
                 classdist = c("balanced", "unbalanced"),
                 nFeatures = c(2, 3, 5, 7, 10, 15, 20, 25, 30, 35), #, 40),
                 fsMethod = c("randomForest", "t.test", "limma", "none"),
                 classifMethods = c("dlda", "randomForest", "bagg", "pam", "svm"),
                 fsPar = NULL,
                 initialGenes = seq(length.out = nrow(eset)),
                 geneID = "ID",
                 storeTestScores = FALSE,
                 verbose = FALSE){
  fsMethod <- match.arg(fsMethod)
  if (!(fsMethod %in% c("randomForest", "t.test", "limma", "none")))
    stop("argument 'fsMethod' should be one of 'randomForest', 't.test', 'limma' or 'none'")

  classifMethodList <- list(dlda = "dlda" %in% classifMethods,
      lda = "lda" %in% classifMethods,
      nlda = "nlda" %in% classifMethods,
      qda = "qda" %in% classifMethods,
      glm = "glm" %in% classifMethods,
      randomForest = "randomForest" %in% classifMethods,
      bagg = "bagg" %in% classifMethods,
      pam = "pam" %in% classifMethods,
      svm = "svm" %in% classifMethods,
      ksvm = "ksvm" %in% classifMethods)

  # check on classVar
  if (!is.factor(pData(eset)[, classVar]))
    stop("'classVar' should be a factor variable")

  classdist <- match.arg(classdist)

  # sort nFeatures (assumption of mcrPlot)
  nFeatures <- sort(nFeatures)

  # sample sizes (total, training, test)
  ## initialize matrices (total, training, test);
  ## the split depends on the 'classdist' argument
  respVar <- pData(eset)[, classVar]
  nTotalSample <- length(respVar)   # total sample size

  if (classdist == "balanced"){
    nTrainingSample <- round(propTraining * nTotalSample) # sample size of training (= learning) set
    nTestSample <- nTotalSample - nTrainingSample         # sample size of validation (= test) set
  } else {
    # for unbalanced data, take the same number of training samples from each
    # class, determined by the size of the smallest class
    smallestClass <- names(sort(table(respVar)))[1]
    nSmallest <- sum(respVar == smallestClass)
    nSmallestTrain <- round(propTraining * nSmallest)
    nBiggestTrain <- nSmallestTrain
    nSmallestTest <- nSmallest - nSmallestTrain
    nBiggestTest <- nTotalSample - (nSmallestTest + nSmallestTrain + nBiggestTrain)
    nTrainingSample <- nSmallestTrain + nBiggestTrain
    nTestSample <- nSmallestTest + nBiggestTest
  }
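  ## Illustration of the unbalanced branch (hypothetical numbers): with 40 samples,
  ## 30 in the larger class and 10 in the smaller class, and propTraining = 2/3,
  ## nSmallestTrain = nBiggestTrain = round(2/3 * 10) = 7, so the training set
  ## contains 14 samples (7 of each class) and the remaining 26 samples form the
  ## test set.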
  totalSampleAllRuns <- matrix(0, nrow = nRuns, ncol = nTotalSample)
  trainingSampleAllRuns <- matrix(0, nrow = nRuns, ncol = nTrainingSample)
  testSampleAllRuns <- matrix(0, nrow = nRuns, ncol = nTestSample)
  indicesTrainingSample <- matrix(NA, nrow = nRuns, ncol = nTotalSample)

  for (irun in 1:nRuns) {
    indicesTrainingSample[irun, ] <- inTrainingSample(pData(eset)[, classVar],
        propTraining = propTraining, classdist = classdist)
    trainingSampleAllRuns[irun, ] <- which(indicesTrainingSample[irun, ])
    testSampleAllRuns[irun, ] <- which(!indicesTrainingSample[irun, ])
  }
  ### output data structure for features used by the classifiers
  feat.outp <- vector(length = nRuns, mode = "list")

  ### output data structure for errors and predicted classes
  ## list nested at the first level: one element per number of features
  featuresList <- vector(length = length(nFeatures), mode = "list")
  names(featuresList) <- paste("nfeat", nFeatures, sep = "")

  ## list nested at the second level: per-run results
  resultList <- if (storeTestScores){
    list(labelsMat = matrix(NA, nrow = nRuns, ncol = nTotalSample),
        testScoresMat = matrix(NA, nrow = nRuns, ncol = nTotalSample),
        errorRate = rep(NA, nRuns),
        AUC = rep(NA, nRuns),
        ROC = vector(mode = "list", length = nRuns)) # for performance objects
  } else {
    list(labelsMat = matrix(NA, nrow = nRuns, ncol = nTotalSample),
        errorRate = rep(NA, nRuns),
        AUC = rep(NA, nRuns),
        ROC = vector(mode = "list", length = nRuns)) # for performance objects
  }

  ## nest the second level in the first level
  featuresList <- lapply(featuresList, function(x) resultList)

  ## create the output data structure: one element per classification method
  output <- vector(length = length(classifMethods), mode = "list")
  names(output) <- classifMethods
  output <- lapply(output, function(x) featuresList)
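  ## The resulting nested structure is indexed as
  ## output[[classifMethod]][[paste0("nfeat", k)]][[component]]; for example,
  ## output[["dlda"]][["nfeat5"]][["errorRate"]] holds one misclassification
  ## rate per run.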
  for (irun in 1:nRuns){

    trainingSampleRun <- trainingSampleAllRuns[irun, ]
    testSampleRun <- testSampleAllRuns[irun, ]

    ### Feature Selection
    switch(fsMethod,
      none = {
        # no feature selection is performed; keep the original feature order
        # and assign every feature the same dummy importance of 1
        orderedEset <- eset
        fNames <- featureNames(eset)
        featRF <- rep(1, length(fNames))
        names(featRF) <- fNames
        feat.outp[[irun]] <- featRF
      },
      randomForest = {
        # allow for customized use via the 'fsPar' list
        mtry <- if (is.null(fsPar$mtry)) as.integer(sqrt(length(initialGenes))) else fsPar$mtry
        # if (mtry > max(nFeatures))
        #   stop("'mtry' component of 'fsPar' cannot exceed max(nFeatures)")
        ntree <- if (is.null(fsPar$ntree)) 500 else fsPar$ntree
        rf <- randomForest(x = t(exprs(eset[initialGenes, ])),
            y = pData(eset)[, classVar],
            mtry = mtry,
            ntree = ntree,
            importance = TRUE)
        importanceRf <- importance(rf)
        # TV: MLearn method blows up memory for some reason :-(
        # rf <- MLearn(formula = as.formula(paste(classVar, "~ .", sep = " ")),
        #     data = eset[initialGenes, ],
        #     trainInd = as.integer(trainingSampleRun),
        #     .method = randomForestI,
        #     importance = TRUE,
        #     ntree = ntree,
        #     mtry = mtry)
        # order features from highest to lowest variable importance
        orderedGenesRF <- order(importanceRf[, 3], decreasing = TRUE)
        orderedEset <- eset[orderedGenesRF, ]
        featRF <- importanceRf[orderedGenesRF, 3]
        feat.outp[[irun]] <- featRF
      },
      t.test = {
        fsPar$test <- if (is.null(fsPar$test)) "f" else fsPar$test
        yl.num <- as.numeric(factor(pData(eset)[trainingSampleRun, classVar])) - 1
        xl.mtt <- exprs(eset[, trainingSampleRun])
        # mt.teststat() from the multtest package; "f" statistic by default
        f.stat <- mt.teststat(xl.mtt, yl.num, fsPar$test)
        orderedGenesTT <- order(f.stat, decreasing = TRUE)
        orderedEset <- eset[orderedGenesTT, ]
        featTT <- f.stat[orderedGenesTT]
        names(featTT) <- rownames(exprs(eset))[orderedGenesTT]
        feat.outp[[irun]] <- featTT
      },
      limma = {
        limmaTopTable <- limmaTwoGroups(eset[, trainingSampleRun],
            group = classVar)
        limmaTopTableGeneIDs <- as.character(limmaTopTable[, "ROWNAMES"])
        # if (geneID == "ENTREZID"){
        #   # limmaTopTableGeneIDs <- limmaTopTableGeneIDs[!is.na(limmaTopTableGeneIDs)]
        #   limmaTopTableGeneIDs <- paste(limmaTopTableGeneIDs, "_at", sep = "")
        # }
        orderedEset <- eset[limmaTopTableGeneIDs, ] # features sorted by increasing P-values
        featLimma <- limmaTopTable$P.Value
        names(featLimma) <- limmaTopTableGeneIDs
        feat.outp[[irun]] <- featLimma
      }) # end of switch
    ### Classification
    for (iNumber in seq_along(nFeatures)){
      ## use the first nFeatures[iNumber] rows (i.e. features)
      nFeaturesEset <- orderedEset[1:nFeatures[iNumber], ]

      ## wrapper for all classification algorithms
      ## (make conditional on BioC release)
      results <- classify_bioc_2.3(nFeaturesEset,
          trainingSample = trainingSampleRun,
          testSample = testSampleRun,
          classVar = classVar,
          classifMethodList = classifMethodList)

      ## extract errors, predicted classes and (for continuous outputs) predicted values
      classVarLevels <- levels(pData(eset)[[classVar]])

      for (iMethod in classifMethods){
        # MCR
        output[[iMethod]][[iNumber]][["errorRate"]][irun] <- results[[iMethod]]

        # predicted values
        iPredic <- paste(iMethod, "predic", sep = ".")
        if (!(iPredic %in% c("glm.predic", "nlda.predic"))){
          output[[iMethod]][[iNumber]][["labelsMat"]][irun, !indicesTrainingSample[irun, ]] <-
              as.character(testPredictions(results[[iPredic]]))
        } else {
          output[[iMethod]][[iNumber]][["labelsMat"]][irun, !indicesTrainingSample[irun, ]] <-
              classVarLevels[as.numeric(testPredictions(results[[iPredic]]))]
        }

        # test scores
        if (storeTestScores){
          output[[iMethod]][[iNumber]][["testScoresMat"]][irun, !indicesTrainingSample[irun, ]] <-
              testScores(results[[iPredic]])
        }

        # AUC
        iAUC <- paste(iMethod, "AUC", sep = ".")
        output[[iMethod]][[iNumber]][["AUC"]][irun] <- results[[iAUC]]

        # ROC curve (performance object)
        iROC <- paste(iMethod, "ROC", sep = ".")
        output[[iMethod]][[iNumber]][["ROC"]][[irun]] <- results[[iROC]]
      }
    }
  }
  res <- list(output = output, features = feat.outp) #, trueClasses = pData(eset)[[classVar]])
  attr(res, "nFeatures") <- nFeatures

  tmpClassVar <- pData(eset)[, classVar]
  names(tmpClassVar) <- sampleNames(eset)
  attr(res, "classVar") <- tmpClassVar

  class(res) <- "nlcv"
  return(res)
}
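
## A minimal usage sketch (not part of the package source): it assumes the nlcv
## and Biobase packages (and their dependencies) are installed, and builds a small
## simulated ExpressionSet; the object names, dimensions and parameter values
## below are illustrative only.
library(Biobase)
library(nlcv)

set.seed(123)
exprMat <- matrix(rnorm(1000 * 40), nrow = 1000, ncol = 40,
    dimnames = list(paste0("gene", 1:1000), paste0("sample", 1:40)))
phenoDF <- data.frame(type = factor(rep(c("A", "B"), each = 20)),
    row.names = colnames(exprMat))
esetSim <- ExpressionSet(assayData = exprMat,
    phenoData = AnnotatedDataFrame(phenoDF))

nlcvRes <- nlcv(esetSim, classVar = "type", nRuns = 5,
    fsMethod = "t.test", classifMethods = c("dlda", "randomForest"),
    nFeatures = c(2, 5, 10))

## misclassification rates of the dlda models built on the top 5 features, one per run
nlcvRes$output$dlda$nfeat5$errorRate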