#' Importance-based feature selection.
#'
#' \code{Features_Importance} estimates feature importance values using the Random Forest algorithm.\cr
#' \code{Features_Importance_Reduce} reduces the number of the retained features.\cr
#'
#' @param preprocessedDFList An output generated by \code{Features_Preprocess} or \code{Features_CorFilter}.
#' @param featureImportanceDFList An output generated by \code{Features_Importance}.
#' @param featureN The number of the features that should be retained. Can be \code{NULL} to disable feature selection.
#' @export
#' @rdname Features_Importance
#' @name Features_Importance
Features_Importance <- function(preprocessedDFList, featureN=100){
time.start <- proc.time()
# Input check
if(any(class(preprocessedDFList[[1]])=="data.frame")) preprocessedDFList <- lapply(preprocessedDFList, function(d){list("dt"=d)})
# Tasks
preprocessedDFList_to_taskList <- function(preprocessedDFList, maxRowN=Inf){
pbapply::pblapply(1:length(preprocessedDFList), function(i){
param <- names(preprocessedDFList)[i]
s <- try(as.integer(rev(unlist(stringr::str_split(param, stringr::fixed("."))))[1]), silent=T)
if(any(class(s)=="try-error")) s <- 123456789 ## ad hoc seed
set.seed(s)
dt <- data.table::as.data.table(preprocessedDFList[[i]][["dt"]])
dt <- dt[DataType=="Train",]
dt[,DataType:=NULL][,Peptide:=NULL][,Cluster:=NULL]
if(nrow(dt)>maxRowN) dt <- dplyr::slice(dt, sample(1:nrow(dt), maxRowN))
dt <- as.data.frame(dt)
target <- dt$"Immunogenicity"
tab <- as.numeric(table(target))
w <- 1/tab[target]
tsk <- mlr::makeClassifTask(data=dt, target="Immunogenicity", weights=w)
return(tsk)
})
}
message("Converting preprocessed dataframes into a list of tasks...")
taskSet <- preprocessedDFList_to_taskList(preprocessedDFList, maxRowN=Inf)
gc();gc()
# Importances
message("Calculating feature importances...")
featureImportances <- pbapply::pblapply(taskSet, mlr::generateFilterValuesData, method="randomForest_importance")
gc();gc()
featureImportanceDFList <- lapply(1:length(preprocessedDFList), function(i){
param <- names(preprocessedDFList)[i]
imp <- featureImportances[[i]][["data"]] %>%
dplyr::transmute(FeatureID=name, Importance=scales::rescale(value, to=c(0,1)), Parameter=param) %>%
DescTools::Sort(ord="Importance", decreasing=T)
if(!is.null(featureN)){
feat <- as.character(dplyr::slice(imp, seq(1, min(nrow(imp), featureN)))[["FeatureID"]])
}else{
feat <- as.character(imp[["FeatureID"]])
}
dt <- data.table::as.data.table(preprocessedDFList[[i]][["dt"]])
dt <- dt[, c("DataType", "Peptide", "Immunogenicity", "Cluster", feat), with=F]
gc();gc()
list("dt"=dt, "feat"=feat, "imp"=imp)
})
names(featureImportanceDFList) <- names(preprocessedDFList)
time.end <- proc.time()
message("Overall time required = ", format((time.end-time.start)[3], nsmall=2), "[sec]")
return(featureImportanceDFList)
}
#' @export
#' @rdname Features_Importance
#' @name Features_Importance
Features_Importance_Reduce <- function(featureImportanceDFList, featureN=30){
lapply(featureImportanceDFList, function(l){
feat <- l$feat[1:featureN]
dt <- l$dt[, c("DataType", "Peptide", "Immunogenicity", "Cluster", feat), with=F]
gc();gc()
list("dt"=dt, "feat"=feat, "imp"=l$imp)
})
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.