# R/rfEAT.R
#
# Defines functions: print.RFEAT, RFEAT, RandomEAT
#
# Documented in: RandomEAT, RFEAT

#' @title Individual EAT for Random Forest
#'
#' @description This function builds an individual Efficiency Analysis Tree
#' for use inside a Random Forest ensemble.
#'
#' @param data \code{data.frame} containing the training set (must include an \code{id} column).
#' @param x Vector. Column input indexes in data.
#' @param y Vector. Column output indexes in data.
#' @param numStop Minimum number of observations in a node for a split to be attempted.
#' @param s_mtry Number of variables randomly sampled as candidates at each split. The available options are: \code{"BRM"}, \code{"DEA1"}, \code{"DEA2"}, \code{"DEA3"}, \code{"DEA4"} or any integer.
#' 
#' @return A \code{list} of nodes representing the fitted tree.
RandomEAT <- function(data, x, y, numStop, s_mtry){
  
  # Number of inputs and outputs
  nX <- length(x)
  nY <- length(y)
  
  # Root node: covers every observation. Its prediction ("y") is the
  # column-wise maximum of the outputs, and its input support runs from the
  # column-wise minima ("a") to infinity ("b").
  root <- list("id" = 1,
               "F" = -1,
               "SL" = -1,
               "SR" = -1,
               "index" = data[["id"]],
               "R" = -1,
               "xi" = -1,
               "s" = -1,
               "y" = as.list(unname(apply(data[, y, drop = FALSE], 2, max))),
               "a" = unname(apply(data[, x, drop = FALSE], 2, min)),
               "b" = rep(Inf, nX))
  
  root[["R"]] <- mse(data, root, y)
  
  # Full tree (all nodes) and the leaves still pending a split attempt
  tree <- list(root)
  pending <- list(root)
  
  # Grow the tree by repeatedly splitting the most recent pending leaf
  while (length(pending) > 0) {
    last <- length(pending)
    node <- pending[[last]]
    pending[[last]] <- NULL # drop the selected leaf
    
    # Stop growing once the current leaf cannot be split any further
    if (isFinalNode(node[["index"]], data[, x], numStop)) break
    
    # Number of candidate predictors for this split, then randomly select
    # that many of the original input columns
    mtry <- select_mtry(s_mtry, node, nX, nY)
    arrayK <- mtry_inputSelection(data, x, node, mtry)
    
    updated <- split_forest(data, tree, pending, node, x, y, numStop, arrayK)
    tree <- updated[[1]]
    pending <- updated[[2]]
  }
  
  return(tree)
}

#' @title Random Forest + Efficiency Analysis Trees
#'
#' @description This function builds \code{m} individual Efficiency Analysis Trees in a forest structure.
#' 
#' @param data \code{data.frame} or \code{matrix} containing the variables in the model.
#' @param x Column input indexes in data.
#' @param y Column output indexes in data.
#' @param numStop Minimum number of observations in a node for a split to be attempted.
#' @param m Number of trees to be built. Must be at least 1.
#' @param s_mtry Number of variables randomly sampled as candidates at each split. The available options are:
#' \itemize{
#' \item{\code{"BRM"}}: \code{in / 3}
#' \item{\code{"DEA1"}}: \code{(t.obs / 2) - out}  
#' \item{\code{"DEA2"}}: \code{(t.obs / 3) - out}
#' \item{\code{"DEA3"}}: \code{t.obs - 2 * out}
#' \item{\code{"DEA4"}}: \code{min(t.obs / out, (t.obs / 3) - out)}
#' \item{Any integer}
#' }
#' @param na.rm \code{logical}. If \code{TRUE}, \code{NA} rows are omitted.
#'
#' @importFrom dplyr %>% row_number
#' 
#' @examples 
#' \donttest{
#' simulated <- X2Y2.sim(N = 50, border = 0.1)
#'
#' RFmodel <- RFEAT(data = simulated, x = c(1,2), y = c(3, 4), numStop = 5,
#'                   m = 50, s_mtry = "BRM", na.rm = TRUE)
#' }
#' 
#' @return A \code{RFEAT} object containing:
#' \itemize{
#'   \item{\code{data} \itemize{
#'                       \item{\code{df}}: data frame containing the variables in the model.
#'                       \item{\code{x}}: input indexes in data.
#'                       \item{\code{y}}: output indexes in data.
#'                       \item{\code{input_names}}: input variable names.
#'                       \item{\code{output_names}}: output variable names.
#'                       \item{\code{row_names}}: rownames in data.}
#'        }
#'   \item{\code{control} \itemize{
#'                         \item{\code{numStop}}: numStop hyperparameter value.
#'                         \item{\code{m}}: m hyperparameter value.
#'                         \item{\code{s_mtry}}: s_mtry hyperparameter value.
#'                         \item{\code{na.rm}}: na.rm hyperparameter value.}
#'        }
#'   \item{\code{forest}: \code{list} containing the individual EAT models.}
#'   \item{\code{error}: Out-of-Bag error at the forest.}   
#'   \item{\code{OOB}: \code{list} containing Out-of-Bag set for each tree.}
#' }
#' 
#' @export
RFEAT <- function(data, x, y, numStop = 5, m = 50, 
                  s_mtry = "BRM", na.rm = TRUE){
  # dplyr::filter must win over stats::filter in the pipelines used downstream
  conflict_prefer("filter", "dplyr")
  
  # 's_mtry' may be numeric (possibly supplied as a character); otherwise it
  # must be one of the predefined selection rules.
  if (!is.na(suppressWarnings(as.numeric(s_mtry)))){
    s_mtry <- as.numeric(s_mtry)
    
  } else if (!s_mtry %in% c("BRM", "DEA1", "DEA2", "DEA3", "DEA4")) {
    stop(paste(s_mtry, "is not available. Please, check help(\"RFEAT\")"))
  }
  
  # At least one tree is required. NOTE: the previous check was 'm < 0',
  # which let m = 0 through and made the training loop '1:m' wrongly
  # iterate over c(1, 0).
  if (m < 1) {
    stop(paste('m =', m, 'must be greater than or equal 1.'))
  }
  
  # Preserve original rownames before pre-processing
  rwn <- row.names(data)
  
  data <- preProcess(data = data, x = x, y = y, numStop = numStop, na.rm = na.rm)

  # Add an 'id' column used to track observations through bagging
  data <- data %>%
    mutate(id = row_number())
  
  # After preProcess, input columns come first and output columns next
  # ('id' is the last column); re-derive the indexes accordingly.
  x <- 1:((ncol(data) - 1) - length(y))
  y <- (length(x) + 1):(ncol(data) - 1)
  
  # Number of DMUs (observations) and of outputs
  N <- nrow(data)
  nY <- length(y)
  
  # Out-of-Bag error accumulator
  err <- 0
  
  # forest[[i]]: i-th fitted tree.
  # forestArray[[i]]: per-DMU indicator of being out-of-bag for tree i.
  forest <- rep(list(list()), m)
  forestArray <- rep(list(list()), m)
  
  for(i in seq_len(m)){
    # Bootstrap sample: training rows plus out-of-bag indicators
    df_train_arr_test <- bagging(data, x, y)
    df_train <- df_train_arr_test[[1]]
    arr_test <- df_train_arr_test[[2]]
    
    forestArray[[i]] <- arr_test
    
    # Train a tree model on this sample -> EAT
    tree <- RandomEAT(df_train, x, y, numStop, s_mtry)
    forest[[i]] <- tree
  }
  
  # forestArray is a list with m elements; each element is a list with N
  # binary values indicating whether the DMU was left out of the
  # corresponding training sample (i.e. is out-of-bag for that tree).

  # Out-of-Bag error: each DMU is predicted only by trees that did not see it
  for(i in seq_len(N)){
    reg_i <- data[i, ]
    
    # Accumulated prediction for each output
    y_EstimArr <- rep(list(0), nY)
    
    # Ki: number of trees for which DMU i is out-of-bag
    Ki <- 0

    for(k in seq_len(m)){
      if(forestArray[[k]][[i]]){
        Ki <- Ki + 1
        y_EstimArr <- mapply("+", y_EstimArr,  predictor(forest[[k]], reg_i[x]))
      }
    }
    
    # If the DMU appeared in every training sample there is no OOB
    # prediction for it; skip it so we never divide by Ki = 0.
    if(all(sapply(y_EstimArr, identical, 0)))
      next
    err <- err + sum((reg_i[y] - (y_EstimArr / Ki)) ^ 2)
  }
  err <- err/N
  
  RFEAT <- RFEAT_object(data, x, y, rwn, numStop, m, s_mtry, na.rm, forest, err,
                        forestArray)
  
  return(RFEAT)
}

#' @export
print.RFEAT <- function(x, ...) {
  
  # Build the "outputs ~ inputs" formula text from the stored variable names
  inputs <- paste(x[["data"]][["input_names"]], collapse = " + ")
  outputs <- paste(x[["data"]][["output_names"]], collapse = " + ")
  
  cat("\n", " Formula: ", outputs, "~", inputs, "\n")
  
  # Section banner
  cat("\n",
      "# ========================== #", "\n",
      "#           Forest           #", "\n",
      "# ========================== #", "\n", "\n")
  
  # Forest summary: OOB error and hyperparameters
  summary_text <- paste0(
    " Error: ", round(x[["error"]], 2), "\n",
    " numStop: ", x[["control"]][["numStop"]], "\n",
    " No. of trees (m): ", x[["control"]][["m"]], "\n",
    " No. of inputs tried (s_mtry): ", x[["control"]][["s_mtry"]]
  )
  cat(summary_text)
  
}

# Try the eat package in your browser.
#
# Any scripts or data that you put into this service are public.
#
# eat documentation built on Jan. 10, 2023, 5:13 p.m.