#' @import varSelRF
# Two-stage backward feature elimination with random forests (varSelRF).
# Stage 1 drops 20% of variables per step over the whole range; the variable
# set 5 elimination steps past the OOB-error minimum (i.e. a slightly larger
# set) seeds stage 2, which refines one variable at a time.
# Returns a character vector of selected feature names.
select_features_varSelRF <- function(x, y, ...) {
  # Shared varSelRF invocation; only the dropping schedule differs between
  # the coarse and fine passes. Extra args in `...` are forwarded as-is.
  run_elimination <- function(xdata, drop_num, drop_frac) {
    varSelRF(xdata = xdata, Class = y,
             verbose = FALSE,
             recompute.var.imp = FALSE, # tweak
             whole.range = TRUE,
             ntree = 10000,             # tweak
             ntreeIterat = 10000,       # tweak
             vars.drop.num = drop_num,
             vars.drop.frac = drop_frac,
             ...)
  }
  # coarse pass: drop 20% of variables at each step
  coarse <- run_elimination(x, drop_num = NULL, drop_frac = 0.2)
  history <- coarse$selec.history
  # step with minimal OOB error, backed off by 5 steps (capped at the last row)
  step <- min(which.min(history$OOB) + 5, nrow(history))
  # "Vars.in.Forest" is a single " + "-separated string of variable names
  seed_vars <- strsplit(as.character(history[step, "Vars.in.Forest"]), " \\+ ")[[1]]
  # fine pass: restart from the seed set, dropping one variable per step
  fine <- run_elimination(x[, seed_vars], drop_num = 1, drop_frac = NULL)
  return(fine$selected.vars)
}
#' @import Boruta
# Feature selection via the Boruta all-relevant algorithm.
# Returns confirmed attributes; if none are confirmed, falls back to
# confirmed + tentative; if still empty, returns the single attribute with
# the highest median importance so the caller always gets >= 1 feature.
# `...` is forwarded to Boruta() (e.g. maxRuns, ntree).
# Historical timings: ~5 h with ntree=10000, ~10 min with 1000, ~5 min with 500.
# NOTE(review): a randomForest-based pre-selection of the top-20 features was
# once considered here to cut Boruta's runtime; not implemented.
select_features_Boruta <- function(x, y, ...){
  rfbo <- Boruta(x, y,
                 doTrace = 0,
                 ...)
  # confirmed attributes first
  features_i <- getSelectedAttributes(rfbo)
  # fall back to confirmed + tentative if nothing was confirmed
  if (length(features_i) == 0)
    features_i <- getSelectedAttributes(rfbo, withTentative = TRUE)
  if (length(features_i) == 0) {
    # last resort: return just the best one by median importance
    meds <- apply(rfbo$ImpHistory, 2, median)
    # restrict to real attributes (drops Boruta's shadow-importance columns)
    meds <- meds[names(rfbo$finalDecision)]
    features_i <- names(rev(sort(meds)))[1]
  }
  return(features_i)
}
# Select the top-n features ranked by random forest importance.
# x: predictor data frame/matrix; y: response factor; n: number of features
# to keep (default 5, matching the original hard-coded behavior);
# ...: forwarded to randomForest().
# Returns a character vector of the n most important feature names.
select_features_top <- function(x, y, n = 5, ...){
  r <- randomForest(x, y, ntree = 1000, ...)
  imp <- importance(r)
  # order features by decreasing importance; keep matrix shape for rownames
  ordr <- order(imp, decreasing = TRUE)
  imp <- imp[ordr, , drop = FALSE]
  return(rownames(head(imp, n)))
}
# Fit a random forest classifier on predictors x and response y.
# Forest size is fixed at 1000 trees; any extra arguments in `...` are
# forwarded to randomForest() unchanged. Returns the fitted model object.
train_model_rf <- function(x, y, ...){
  model <- randomForest(x, y, ntree = 1000, ...)
  model
}
#' Random Forest Predictive Models
#'
#' The objective is - given the number of features,
#' select the most informative ones and evaluate
#' the predictive random forest model.
#' The feature and model selection performed independently
#' for each round of LOOCV.
#'
# ' @param response character name of the predicted variable
# ' it has to have two values and one of them should be "case"
#' @param msnset MSnSet object
#' @param features character vector features to select from for
#' building prediction model. The features can be either in
#' featureNames(msnset) or in pData(msnset).
#' @param response factor to classify along. Must be only 2 levels.
#' @param pred.cls class to predict
#' @param K specifies the cross-validation type. Default NULL means LOOCV.
#' Another typical value is 10.
#' @param sel.feat logical defining if to select features or use the entire set?
#' @param sel.alg character.
#' \itemize{
#' \item varSelRF
#' \item Boruta
#' \item top - just selecting top 5 features
#' }
#' @param verbose 0 - silent, 1 - count round,
#' 2 - print selected features at each round
#' @param ... Extra arguments. Currently passed only to Boruta algorithm.
#'
#' @return list
#' \describe{
#' \item{\code{prob}}{is the probabilities (response) from LOOCV
#' that the sample is "case"}
#' \item{\code{features}}{list of selected features
#' for each iteration of LOOCV}
#' \item{\code{top}}{top features over all iterations}
#' \item{\code{auc}}{AUC}
#' \item{\code{pred}}{prediction performance obtained by
#' \code{ROCR::prediction}}
#' }
#' @import randomForest
#' @importFrom ROCR prediction performance
#' @importFrom parallel mclapply detectCores
#' @export rf_modeling
#'
#' @examples
#' data(srm_msnset)
#' head(varLabels(msnset))
#' head(msnset$subject.type)
#' # reduce to two classes
#' msnset <- msnset[,msnset$subject.type != "control.1"]
#' msnset$subject.type <- as.factor(msnset$subject.type)
#' out <- rf_modeling(msnset,
#'                    features=featureNames(msnset)[1:5],
#'                    response="subject.type",
#'                    pred.cls="case")
#' plotAUC(out)
#' # top features consistently recurring in the models during LOOCV
#' print(out$top)
#' # the AUC
#' print(out$auc)
#' # probabilities of classifying the sample right, if the feature selection
#' # and model training was performed on other samples
#' plot(sort(out$prob))
#' abline(h=0.5, col='red')
#'
# See the roxygen block above this function for the exported documentation.
# Cross-validated (K-fold; K=NULL means leave-one-out) random forest modeling:
# for each fold, features are (optionally) selected and a model is trained on
# the in-fold samples only, then class probabilities are predicted for the
# held-out samples, so performance estimates are not contaminated by the
# feature-selection step.
rf_modeling <- function( msnset, features, response, pred.cls, K=NULL,
                         sel.feat=TRUE,
                         sel.alg=c("varSelRF","Boruta","top"), ...){
    # forked parallelism (mclapply) only works on unix; serial elsewhere
    if(.Platform$OS.type == "unix")
        mc.cores <- detectCores()
    else
        mc.cores <- 1
    # prepare data: samples in rows, phenotype columns + feature columns
    dSet <- cbind(pData(msnset), t(exprs(msnset)))
    # response must be binary and contain the class to predict
    stopifnot(length(unique(dSet[,response])) == 2)
    stopifnot(pred.cls %in% dSet[,response])
    # reorder factor levels so pred.cls ends up as the SECOND level;
    # predict(type='prob') column order follows the factor level order
    if(unique(dSet[,response])[1] == pred.cls){
        lvlz <- rev(unique(dSet[,response]))
    }else{
        lvlz <- unique(dSet[,response])
    }
    dSet[,response] <- factor(dSet[,response], levels=lvlz)
    #---
    sel.alg <- match.arg(sel.alg)
    # extra args (...) are currently forwarded to the Boruta selector only
    FUN <- switch(sel.alg,
                  varSelRF = select_features_varSelRF,
                  Boruta = function(x,y) select_features_Boruta(x,y,...),
                  top = select_features_top)
    # K-fold split: NULL means LOOCV (one fold per sample)
    if(is.null(K))
        K <- nrow(dSet)
    num_rep <- ceiling(nrow(dSet)/K)
    # random, approximately balanced assignment of samples to folds
    cv_idx <- sample(rep(seq_len(K), num_rep)[seq_len(nrow(dSet))])
    res <- mclapply(seq_len(K), function(i){
        i <- cv_idx == i  # logical mask of the held-out fold
        if(sel.feat){
            # select features using the training portion only
            features.sel <- FUN(x=dSet[!i,features],
                                y=dSet[!i,response])
        }else{
            features.sel <- features
        }
        # train model on the in-fold samples
        x <- dSet[!i,features.sel,drop=FALSE]
        colnames(x) <- make.names(colnames(x))
        mdl <- train_model_rf(x=x, y=dSet[!i,response])
        # predict P(pred.cls) for the held-out samples
        newdata <- dSet[i,features.sel,drop=FALSE]
        colnames(newdata) <- make.names(colnames(newdata))
        predProb <- predict(mdl, newdata=newdata, type='prob')[,pred.cls]
        names(predProb) <- rownames(newdata)
        list(predProb, features.sel)
    }, mc.cores=mc.cores, mc.set.seed = FALSE)
    #
    # reassemble out-of-fold probabilities in the original sample order
    predProb <- unlist(lapply(res, '[[', 1))
    predProb <- predProb[rownames(dSet)]
    names(predProb) <- NULL # for compatibility
    # lapply (not sapply): keeps `features` a list even when every fold
    # selects the same number of features, matching the documented return
    selected <- lapply(res, '[[', 2)
    #---
    # ROC/AUC computed from the out-of-fold probabilities
    pred <- prediction(predProb, dSet[,response] == pred.cls)
    return(list(prob = predProb,
                features = selected,
                top = rev(sort(table(unlist(selected)))),
                auc = performance(pred,"auc")@y.values[[1]],
                pred = pred))
}
# DEPRECATED: legacy leave-one-out-only predecessor of rf_modeling().
# Kept for reference; rf_modeling() generalizes this to K-fold CV and drops
# the per-iteration print(). Not exported. Arguments mirror rf_modeling():
# msnset (MSnSet), features (candidate feature names), response (name of a
# 2-level pData column), pred.cls (the level to predict), sel.feat (whether
# to select features per CV round), sel.alg (selection algorithm),
# ... (forwarded to the Boruta selector only). Returns the same list
# structure as rf_modeling() (prob, features, top, auc, pred).
rf_modeling_old <- function( msnset, features, response, pred.cls, sel.feat=T,
                             sel.alg=c("varSelRF","Boruta","top"), ...){
    #
    # forked parallelism (mclapply) only works on unix; serial elsewhere
    if(.Platform$OS.type == "unix")
        mc.cores <- detectCores()
    else
        mc.cores <- 1
    # prepare data: samples in rows, phenotype columns + feature columns
    dSet <- cbind(pData(msnset), t(exprs(msnset)))
    #
    # response must be binary and contain the class to predict
    stopifnot(length(unique(dSet[,response])) == 2)
    stopifnot(pred.cls %in% dSet[,response] )
    # reorder factor levels so pred.cls ends up as the SECOND level;
    # predict(type='prob') column order follows the factor level order
    if(unique(dSet[,response])[1] == pred.cls){
        lvlz <- rev(unique(dSet[,response]))
    }else{
        lvlz <- unique(dSet[,response])
    }
    dSet[,response] <- factor(dSet[,response], levels=lvlz)
    #---
    sel.alg <- match.arg(sel.alg)
    # extra args (...) are currently forwarded to the Boruta selector only
    FUN <- switch(sel.alg,
                  varSelRF = select_features_varSelRF,
                  Boruta = function(x,y) select_features_Boruta(x,y,...),
                  top = select_features_top)
    # LOOCV: hold out sample i, select features and train on the rest
    res <- mclapply(1:nrow(dSet), function(i){
        if(sel.feat){
            features.sel <- FUN(x=dSet[-i,features],
                                y=dSet[-i,response])
        }else{
            features.sel <- features
        }
        # train model
        x=dSet[-i,features.sel,drop=F]
        colnames(x) <- make.names(colnames(x))
        mdl <- train_model_rf(x=x, y=dSet[-i,response])
        # predict P(pred.cls) for the single held-out sample
        newdata <- dSet[i,features.sel,drop=F]
        colnames(newdata) <- make.names(colnames(newdata))
        predProb <- predict(mdl, newdata=newdata, type='prob')[1,pred.cls]
        print(i)  # progress indicator (side effect; removed in rf_modeling)
        list(predProb, features.sel)
    }, mc.cores=mc.cores, mc.set.seed = FALSE)
    #
    predProb <- sapply(res, '[[', 1)
    names(predProb) <- NULL # for compatibility
    selected <- sapply(res, '[[', 2)
    #---
    # ROC/AUC computed from the out-of-fold probabilities
    pred <- prediction(predProb, dSet[,response] == pred.cls)
    return(list(prob = predProb,
                features = selected,
                top = rev(sort(table(unlist(selected)))),
                auc = performance(pred,"auc")@y.values[[1]],
                pred = pred))
}