#' Modified Ordered Random Forest
#' 
#' Nonparametric estimator of the ordered choice model using random forests. The estimator modifies a standard random forest
#' splitting criterion to build a collection of forests, each estimating the conditional probability of a single class.
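#'
#' Each class probability is recovered as the difference of two conditional expectations of binary indicators, \eqn{P(Y = m | x) = E[1(Y \le m) | x] - E[1(Y \le m - 1) | x]}, and the estimated probabilities are then normalized to sum to one within each observation.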
#'
#' @param y Outcome vector.
#' @param X Covariate matrix (no intercept).
#' @param n.trees Number of trees.
#' @param alpha Controls the balance of each split. Each split leaves at least a fraction \code{alpha} of observations in the parent node on each side of the split.
#' @param honesty Whether to grow honest forests.
#' @param honesty.fraction Fraction of honest sample. Ignored if \code{honesty = FALSE}.
#' @param inference Whether to extract weights and compute standard errors. The weights extraction considerably slows down the routine. \code{honesty = TRUE} is required for valid inference.
#' @param mtry Number of covariates to possibly split at in each node. Default is the square root of the number of covariates.
#' @param min.node.size Minimal node size.
#' @param max.depth Maximal tree depth. A value of 0 corresponds to unlimited depth, 1 to "stumps" (one split per tree).
#' @param replace If \code{TRUE}, grow trees on bootstrap subsamples. Otherwise, trees are grown on random subsamples drawn without replacement. 
#' @param sample.fraction Fraction of observations to sample. 
#' @param n.threads Number of threads. Zero corresponds to the number of CPUs available.
#' 
#' @return 
#' Object of class \code{morf}, storing the fitted forests (\code{forests.info}), the in-sample predictions (class probabilities, standard errors if \code{inference = TRUE}, and the implied classification), the relative variable importance, and tuning information.
#' 
#' @examples 
#' ## Load data from the orf package.
#' set.seed(1986)
#' 
#' library(orf)
#' data(odata)
#' odata <- odata[1:200, ] # Subset to reduce elapsed time.
#' 
#' y <- as.numeric(odata[, 1])
#' X <- as.matrix(odata[, -1])
#' 
#' ## Training-test split.
#' train_idx <- sample(seq_along(y), floor(length(y) * 0.5))
#' 
#' y_tr <- y[train_idx]
#' X_tr <- X[train_idx, ]
#' 
#' y_test <- y[-train_idx]
#' X_test <- X[-train_idx, ]
#' 
#' ## Fit morf on training sample.
#' forests <- morf(y_tr, X_tr)
#' 
#' ## morf is compatible with the generic S3 methods.
#' print(forests)
#' summary(forests)
#' predictions <- predict(forests, X_test)
#' head(predictions$probabilities)
#' table(y_test, predictions$classification)
#' 
#' \donttest{
#' ## Compute standard errors. This requires honest forests.
#' honest_forests <- morf(y_tr, X_tr, honesty = TRUE, inference = TRUE)
#' head(honest_forests$predictions$standard.errors)}
#' 
#' @import utils stats orf
#' @importFrom Rcpp evalCpp
#' @useDynLib morf
#' 
#' @seealso \code{\link{marginal_effects}}
#' 
#' @author Riccardo Di Francesco
#' 
#' @export
morf <- function(y = NULL, X = NULL,
                 honesty = FALSE, honesty.fraction = 0.5, inference = FALSE, alpha = 0,
                 n.trees = 2000, mtry = ceiling(sqrt(ncol(X))), min.node.size = 5, max.depth = 0, 
                 replace = FALSE, sample.fraction = ifelse(replace, 1, 0.5), n.threads = 1) {
  ## 0.) Defaults for variables not needed.
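  # These placeholders complete the argument list expected by the C++ core (morfCpp), whose
  # interface appears modeled on ranger's; with the regression tree type used here
  # (treetype = 3), most of them are inert.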
  splitrule.num <- 1; treetype <- 3; probability <- FALSE
  importance.mode <- 1; scale.permutation.importance <- FALSE; local.importance <- FALSE
  respect.unordered.factors <- "ignore"; unordered.factor.variables <- c("0", "0"); use.unordered.factor.variables <- FALSE
  order.snps <- FALSE; prediction.mode <- FALSE; predict.all <- FALSE; prediction.type <- 1; oob.error <- FALSE
  loaded.forest <- list(); write.forest <- TRUE; snp.data <- as.matrix(0); class.weights <- rep(1, nlevels(y))
  alpha_balance <- alpha; alpha <- 0.5; minprop <- 0.1; num.random.splits <- 1; use.regularization.factor <- FALSE; 
  regularization.factor <- c(0, 0); regularization.usedepth <- FALSE; case.weights <- c(0, 0); use.case.weights <- FALSE; 
  split.select.weights <- list(c(0, 0)); use.split.select.weights <- FALSE; always.split.variables <- c("0", "0"); 
  use.always.split.variables <- FALSE; inbag <- list(c(0, 0)); use.inbag <- FALSE; keep.inbag <- FALSE; holdout <- FALSE; 
  save.memory <- FALSE; verbose <- TRUE; seed <- stats::runif(1, 0, .Machine$integer.max)
  
  ## 1.) Handling inputs and checks.
  # 1a.) Generic checks.
  check_x_y(X, y)
  check_honesty_inference(honesty, honesty.fraction, inference)
  check_ntrees(n.trees)
  check_alpha(alpha)
  mtry <- check_mtry(mtry, colnames(X))
  check_minnodesize(min.node.size)
  check_maxdepth(max.depth)
  check_samplefraction(sample.fraction)
  
  # 1b.) Handle factor outcomes.
  if (is.factor(y)) y <- as.numeric(y)
  
  # 1c.) Store useful variables.
  y.classes <- sort(unique(y))
  n.classes <- length(y.classes)
  
  # 1d.) Recode characters as factors.
  covariate.levels <- NULL
  if (!is.matrix(X) && !inherits(X, "Matrix") && ncol(X) > 0) {
    character.idx <- sapply(X, is.character)
    X[character.idx] <- lapply(X[character.idx], factor)
    
    if (any(sapply(X, is.factor))) {
      covariate.levels <- lapply(X, levels)
    }     
  }
  
  # 1e.) Error if no covariates (X must have colnames).
  independent.variable.names <- colnames(X)
  all.independent.variable.names <- independent.variable.names
  if (length(all.independent.variable.names) < 1) stop("No covariates found. Check that 'X' is provided and has column names.", call. = FALSE)
  
  ## 2.) Handling training data.
  # 2a.) Honest split if necessary. 
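  # Under honesty, the data are split into two disjoint samples: trees are grown on 'train_sample',
  # while leaf predictions (and, if requested, inference) are computed on 'honest_sample'.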
  if (honesty) {
    honest_split <- class_honest_split(data.frame(y, X), honesty.fraction)
    
    train_sample <- honest_split$train_sample
    honest_sample <- honest_split$honest_sample
    colnames(train_sample) <- c("y", independent.variable.names)
    colnames(honest_sample) <- c("y", independent.variable.names)
    
    y_train <- train_sample[, 1]
    x_train <- as.data.frame(train_sample[, -1])
    colnames(x_train) <- independent.variable.names
    
    y_honest <- honest_sample[, 1]
    x_honest <- as.data.frame(honest_sample[, -1])
    colnames(x_honest) <- independent.variable.names
  } else { 
    train_sample <- data.frame(y, X)
    honest_sample <- list()
    colnames(train_sample) <- c("y", independent.variable.names)
    
    y_train <- train_sample[, 1]
    x_train <- as.data.frame(train_sample[, -1])
    colnames(x_train) <- independent.variable.names
    
    y_honest <- NULL
    x_honest <- NULL
  }
  
  # 2b.) Transform training data into sparse matrix.
  if (inherits(x_train, "dgCMatrix")) {
    sparse.x <- x_train
    x_train <- matrix(c(0, 0))
    use.sparse.data <- TRUE
  } else {
    sparse.x <- Matrix::Matrix(matrix(c(0, 0)))
    use.sparse.data <- FALSE
    
    if (is.data.frame(x_train)) {
      x_train <- data.matrix(x_train)
    }
  }
  
  ## 3.) Generating binary outcomes for each class. The m-th element stores the indicator variables relative to the m-th class.
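  # For each class m, the pair of indicators 1(y <= m) and 1(y <= m - 1) is stored: the difference
  # of their conditional means identifies P(Y = m | x), which is the target of the m-th forest.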
  train_outcomes <- list()
  honest_outcomes <- list()
  counter <- 1
  for (m in y.classes) {
    train_outcomes[[counter]] <- data.frame("y_m_train" = ifelse(y_train <= m, 1, 0), "y_m_1_train" = ifelse(y_train <= m - 1, 1, 0))
    honest_outcomes[[counter]] <- data.frame("y_m_honest" = ifelse(y_honest <= m, 1, 0), "y_m_1_honest" = ifelse(y_honest <= m - 1, 1, 0))
    counter <- counter + 1
  }
  
  ## 4.) Fitting a modified ordered forest to each pair of binary outcomes.
  forest_output <- lapply(train_outcomes, function(x) {
    morfCpp(treetype, x_train, matrix(c(as.numeric(x$y_m_1_train), as.numeric(x$y_m_train)), ncol = 2), 
            independent.variable.names, mtry, n.trees, verbose, seed, n.threads, write.forest, importance.mode, min.node.size, 
            split.select.weights, use.split.select.weights, always.split.variables, use.always.split.variables, 
            prediction.mode, loaded.forest, snp.data, replace, probability, unordered.factor.variables, 
            use.unordered.factor.variables, save.memory, splitrule.num, case.weights, use.case.weights, class.weights, 
            predict.all, keep.inbag, sample.fraction, alpha, minprop, holdout, prediction.type, num.random.splits, sparse.x, 
            use.sparse.data, order.snps, oob.error, max.depth, inbag, use.inbag, regularization.factor,
            use.regularization.factor, regularization.usedepth, alpha_balance)})
  if (any(sapply(forest_output, function(x) {length(x) == 0}))) stop("User interrupt or internal error.", call. = FALSE)
  
  ## 5.) Handling forests' output.
  # 5a.) Names for variable importance.
  forest_output <- lapply(forest_output, function(x) {
    names(x$variable.importance) <- all.independent.variable.names
    x})
  
  # 5b.) Write forest objects.
  if (is.factor(y_train)) {
    forest_output <- lapply(forest_output, function(x) {
      x$forest$levels <- levels(y_train)
      x})
  }
  forest_output <- lapply(forest_output, function(x) {
    x$forest$covariate.names <- independent.variable.names
    x$forest$treetype <- "modified.ordered"
    x})
  if (!is.null(covariate.levels)) {
    forest_output <- lapply(forest_output, function(x) {
      x$forest$covariate.levels <- covariate.levels
      x})
  }
  forests <- lapply(forest_output, function(x) {x$forest})
  forests <- lapply(forests, function(x) {
    class(x) <- "morf.forest"
    x})
  names(forests) <- paste("forest", y.classes, sep = ".")
  
  # 5c.) Save and remove unnecessary element.
  n.trees <- forest_output[[1]]$num.trees
  n.covariates <- forest_output[[1]]$num.covariates
  mtry <- forest_output[[1]]$mtry
  min.node.size <- forest_output[[1]]$min.node.size
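  # Variable importance is averaged across the class-specific forests and rescaled to sum to one.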
  overall.importance <- rowMeans(as.matrix(sapply(forest_output, function(x) {x$variable.importance})))
  relative.importance <- round(overall.importance / sum(overall.importance), 3)
  
  ## 6) Predictions. 
  # 6a.) If inference, extract weights and use these to predict. Otherwise, use standard, faster prediction method.
  if (inference) { 
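    # With honest weights w_i(x), each fitted probability is P(Y = m | x) = sum_i w_i(x) * (y_m_i - y_{m-1,i})
    # over the honest sample; its variance is estimated by the sample-corrected sum of squared
    # deviations of the individual terms w_i(x) * (y_m_i - y_{m-1,i}) around their mean.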
    weights <- lapply(forests, function(x) {forest_weights_fitted(x, honest_sample, train_sample)})
    class.probabilities <- mapply(function(x, y) {x %*% (y$y_m_honest - y$y_m_1_honest)}, weights, honest_outcomes)
    
    n_honest <- dim(honest_sample)[1]
    sample_correction <- n_honest / (n_honest - 1)
    products <- mapply(function(x, y) {t(apply(x, 1, function(z) {z * (y$y_m_honest - y$y_m_1_honest)}))}, weights, honest_outcomes, SIMPLIFY = FALSE)
    sums_squares <- lapply(products, function(x) {rowSums((x - rowMeans(x))^2)})
    variances <- matrix(unlist(lapply(sums_squares, function(x) {sample_correction * x}), use.names = FALSE), ncol = n.classes)
    colnames(variances) <- paste("P(Y=", y.classes, ")", sep = "")
  } else if (honesty) {
    rownames(x_train) <- rownames(honest_split$train_sample)
    rownames(x_honest) <- rownames(honest_split$honest_sample)
    class.probabilities <- mapply(function(x, y) {honest_fitted(x, x_train, x_honest, y$y_m_honest, y$y_m_1_honest)}, forests, honest_outcomes)
    variances <- list()
  } else {
    class.probabilities <- matrix(unlist(lapply(forests, function(x) {predict(x, data = x_train)$predictions}), use.names = FALSE), ncol = n.classes)
    variances <- list()
  } 
  
  # 6b.) Normalization step.
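  # Since each class probability is estimated by a separate forest, the raw estimates need not sum
  # to one across classes; each row is therefore divided by its sum.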
  class.probabilities <- matrix(apply(class.probabilities, 1, function(x) x / sum(x)), ncol = n.classes, byrow = TRUE)
  colnames(class.probabilities) <- paste("P(Y=", y.classes, ")", sep = "")
  
  # 6c.) Classification.
  classification <- apply(class.probabilities, 1, which.max)
  
  ## 7.) Construct morf object.
  output <- list()
  output$forests.info <- forests
  output$predictions <- list("probabilities" = class.probabilities, "standard.errors" = if (inference) sqrt(variances) else list(),
                             "classification" = classification)
  output$importance <- relative.importance
  output$tuning.info <- list("n.trees" = n.trees, "mtry" = mtry, "min.node.size" = min.node.size, "replace" = replace,
                             "honesty" = honesty, "honesty.fraction" = if (honesty) honesty.fraction else 0, "call" = sys.call())
  output$full_data <- data.frame(y, X)
  colnames(output$full_data) <- colnames(train_sample)
  output$honest_data <- if (honesty) data.frame(y_honest, x_honest) else list()
  if (honesty) colnames(output$honest_data) <- colnames(train_sample)

  class(output) <- "morf"
  
  ## 8.) Output.
  return(output)
}
