View source: R/feature_search.R
| feature_search | R Documentation | 
A convenience wrapper for greedy and exhaustive feature selection algorithms that
extract valuable attributes depending on the evaluation method (called evaluator). This function
is a reimplementation of FSelector's exhaustive.search and greedy.search.
feature_search(
  attributes,
  fun,
  data,
  mode = c("greedy", "exhaustive"),
  type = c("forward", "backward"),
  sizes = 1:length(attributes),
  parallel = TRUE,
  ...
)
| attributes | A character vector with attributes' names to be used to extract the most valuable features. | 
| fun | A function (evaluator) used to score features' sets at each iteration of the algorithm selected via mode. See Examples. | 
| data | A data set passed to the fun function. | 
| mode | A character that determines which search algorithm to perform. Default is "greedy". | 
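| type | Used when mode = "greedy" - whether to use the backward or the forward multiple-way search. Default is "forward". | 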
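| sizes | Used when mode = "exhaustive" - a vector of sizes of attributes' subsets to check. Default is 1:length(attributes). | 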
| parallel | Allow parallelization. | 
| ... | Other arguments passed to foreach function. | 
The evaluator function passed with fun is used to determine
the importance score of the current features' subset.
The score is used either as a stopping criterion in the multiple-way
(backward or forward) greedy algorithm, or as a selection criterion
in the exhaustive search, which checks all possible
attributes' subset combinations (of the sizes passed in sizes).
A list with the following components:
best - a data.frame with the best subset and its score (1 - feature used, 0 - feature not used),
all - a data.frame with all checked features' subsets and their scores (1 - feature used, 0 - feature not used),
data - the data used in the feature selection,
fun - the evaluator used to compute the importance score for features' subsets,
call - the original call to feature_search,
mode - the mode used in the call.
Note that score depends on the evaluator you provide in the fun parameter.
Zygmunt Zawadzki zygmunt@zstat.pl
Krzysztof Slomczynski krzysztofslomczynski@gmail.com
# Enable parallelization in examples.
# The block below is marked "Not run": it starts a cluster via makeCluster(2)
# and registers it with registerDoParallel() so that the feature_search()
# calls using parallel = TRUE have a parallel backend available.
## Not run: 
 library(doParallel)
 cl <- makeCluster(2)
 registerDoParallel(cl)
## End(Not run)
# Close at the end (the same calls appear, uncommented, in the final
# "Not run" block of these examples).
# stopCluster(cl) #nolint
# registerDoSEQ() #nolint
# 1) Evaluator from FSelector package.
# Score a feature subset by the cross-validated accuracy of an rpart tree.
#
# Arguments:
#   subset    - character vector of attribute names used as predictors.
#   data      - data.frame holding the predictors and the `dependent` column.
#   dependent - name of the response column (defaults to names(iris)[5]).
#
# Returns the mean, over k = 5 folds, of (1 - misclassification rate).
evaluator <- function(subset, data, dependent = names(iris)[5]) {
  k <- 5
  # Every row gets a uniform draw; fold i holds the rows whose draw falls in
  # [(i - 1) / k, i / k), so fold sizes are random rather than equal.
  splits <- runif(nrow(data))
  # vapply() pins the result to one double per fold, unlike sapply().
  results <- vapply(seq_len(k), function(i) {
    test_idx <- (splits >= (i - 1) / k) & (splits < i / k)
    test <- data[test_idx, , drop = FALSE]
    train <- data[!test_idx, , drop = FALSE]
    # Namespace-qualified call: no library() side effect on the search path.
    tree <- rpart::rpart(to_formula(subset, dependent), train)
    predicted <- predict(tree, test, type = "class")
    # Accuracy on the held-out fold.
    1 - mean(test[[dependent]] != predicted)
  }, numeric(1))
  mean(results)
}
# Fix the RNG seed: the evaluator above draws random folds with runif().
set.seed(123)
# Default greedy search (mode = "greedy", parallel = TRUE by default).
system.time(
  feature_search(attributes = names(iris)[-5],
                 fun = evaluator,
                 data = iris)
)
# Same greedy search with parallelization switched off, for timing comparison.
system.time(
  feature_search(attributes = names(iris)[-5],
                 fun = evaluator,
                 data = iris,
                 parallel = FALSE)
)
# Optional exhaustive search.
system.time(
  feature_search(attributes = names(iris)[-5],
                 fun = evaluator,
                 data = iris,
                 mode = "exhaustive")
)
# Same exhaustive search with parallelization switched off.
system.time(
  feature_search(attributes = names(iris)[-5],
                 fun = evaluator,
                 data = iris,
                 mode = "exhaustive",
                 parallel = FALSE)
)
# 2) Maximize the R^2 statistic in the linear regression model/problem.
# Score an attribute subset by the R^2 of a linear model.
#
# Arguments:
#   attributes - character vector of predictor names.
#   data       - data.frame containing the predictors and `dependent`.
#   dependent  - name of the response column (defaults to names(iris)[1]).
#
# Returns the `r.squared` element of the fitted model's summary.
evaluator_R2_lm <- function(attributes, data, dependent = names(iris)[1]) {
  model_formula <- to_formula(attributes, dependent)
  fit <- lm(model_formula, data = data)
  fit_summary <- summary(fit)
  fit_summary$r.squared
}
# Exhaustive search over every iris column except the first, which
# evaluator_R2_lm uses as the response by default.
feature_search(attributes = names(iris)[-1],
               fun = evaluator_R2_lm, data = iris,
               mode = "exhaustive")
# 3) Optimize the BIC criterion in a generalized linear model.
# The aim of the Bayesian approach is to identify the model with the highest
# probability of being the true model. - Kuha 2004
# Load the anorexia data set shipped with the MASS package.
utils::data(anorexia, package = "MASS")
# Score an attribute subset by the BIC of a gaussian GLM.
#
# Arguments:
#   attributes - character vector of model terms used as predictors.
#   data       - data.frame containing the terms and `dependent`.
#   dependent  - name of the response column (defaults to "Postwt").
#
# Returns the second element of extractAIC()'s result, computed with the
# penalty k = log(nrow(data)).
# NOTE(review): a lower BIC indicates a better model - confirm that
# feature_search treats this score in the intended direction.
evaluator_BIC_glm <- function(attributes, data, dependent = "Postwt") {
  model <- glm(to_formula(attributes, dependent), family = gaussian,
               data = data)
  penalty <- log(nrow(data))
  criterion <- extractAIC(fit = model, k = penalty)
  criterion[2]
}
# Exhaustive search over the candidate model terms (including an offset term)
# on the anorexia data, scored with evaluator_BIC_glm.
feature_search(attributes = c("Prewt", "Treat", "offset(Prewt)"),
               fun = evaluator_BIC_glm,
               data = anorexia,
               mode = "exhaustive")
# Close parallelization: stop the cluster stored in `cl` and call
# registerDoSEQ() to switch foreach back to the sequential backend.
## Not run: 
stopCluster(cl)
registerDoSEQ()
## End(Not run)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.