R/Features_Importance.R

Defines functions Features_Importance_Reduce Features_Importance

Documented in Features_Importance Features_Importance_Reduce

#' Importance-based feature selection.
#'
#' \code{Features_Importance} estimates feature importance values using the Random Forest algorithm.\cr
#' \code{Features_Importance_Reduce} reduces the number of the retained features.\cr
#'
#' @param preprocessedDFList An output generated by \code{Features_Preprocess} or \code{Features_CorFilter}.
#' @param featureImportanceDFList An output generated by \code{Features_Importance}.
#' @param featureN The number of the features that should be retained. Can be \code{NULL} to disable feature selection.
#' @export
#' @rdname Features_Importance
#' @name Features_Importance
Features_Importance <- function(preprocessedDFList, featureN=100){
  time.start <- proc.time()

  # Input check
  if(any(class(preprocessedDFList[[1]])=="data.frame")) preprocessedDFList <- lapply(preprocessedDFList, function(d){list("dt"=d)})

  # Tasks
  preprocessedDFList_to_taskList <- function(preprocessedDFList, maxRowN=Inf){
    pbapply::pblapply(1:length(preprocessedDFList), function(i){
      param <- names(preprocessedDFList)[i]
      s <- try(as.integer(rev(unlist(stringr::str_split(param, stringr::fixed("."))))[1]), silent=T)
      if(any(class(s)=="try-error")) s <- 123456789  ## ad hoc seed
      set.seed(s)
      dt <- data.table::as.data.table(preprocessedDFList[[i]][["dt"]])
      dt <- dt[DataType=="Train",]
      dt[,DataType:=NULL][,Peptide:=NULL][,Cluster:=NULL]
      if(nrow(dt)>maxRowN) dt <- dplyr::slice(dt, sample(1:nrow(dt), maxRowN))
      dt <- as.data.frame(dt)
      target <- dt$"Immunogenicity"
      tab <- as.numeric(table(target))
      w <- 1/tab[target]
      tsk <- mlr::makeClassifTask(data=dt, target="Immunogenicity", weights=w)
      return(tsk)
    })
  }
  message("Converting preprocessed dataframes into a list of tasks...")
  taskSet <- preprocessedDFList_to_taskList(preprocessedDFList, maxRowN=Inf)
  gc();gc()

  # Importances
  message("Calculating feature importances...")
  featureImportances <- pbapply::pblapply(taskSet, mlr::generateFilterValuesData, method="randomForest_importance")
  gc();gc()
  featureImportanceDFList <- lapply(1:length(preprocessedDFList), function(i){
    param <- names(preprocessedDFList)[i]
    imp <- featureImportances[[i]][["data"]] %>%
      dplyr::transmute(FeatureID=name, Importance=scales::rescale(value, to=c(0,1)), Parameter=param) %>%
      DescTools::Sort(ord="Importance", decreasing=T)
    if(!is.null(featureN)){
      feat <- as.character(dplyr::slice(imp, seq(1, min(nrow(imp), featureN)))[["FeatureID"]])
    }else{
      feat <- as.character(imp[["FeatureID"]])
    }
    dt <- data.table::as.data.table(preprocessedDFList[[i]][["dt"]])
    dt <- dt[, c("DataType", "Peptide", "Immunogenicity", "Cluster", feat), with=F]
    gc();gc()
    list("dt"=dt, "feat"=feat, "imp"=imp)
  })
  names(featureImportanceDFList) <- names(preprocessedDFList)
  time.end <- proc.time()
  message("Overall time required = ", format((time.end-time.start)[3], nsmall=2), "[sec]")
  return(featureImportanceDFList)
}

#' @export
#' @rdname Features_Importance
#' @name Features_Importance
Features_Importance_Reduce <- function(featureImportanceDFList, featureN=30){
  lapply(featureImportanceDFList, function(l){
    feat <- l$feat[1:featureN]
    dt <- l$dt[, c("DataType", "Peptide", "Immunogenicity", "Cluster", feat), with=F]
    gc();gc()
    list("dt"=dt, "feat"=feat, "imp"=l$imp)
  })
}
masato-ogishi/Repitope documentation built on Feb. 14, 2023, 5:47 a.m.