# R/RandomForestWrapper.R

#' @title Wrapper for RF Runs
#' @description
#'
#' Convenience wrapper for batch RF runs: prepares the input data, optionally
#' subsets it by Cleland province, fits a random forest and reports the
#' r-squared of predicted vs. observed values.
#'
#' @param df Input dataframe
#' @param y Name of response variable (a single column name of `df`).
#'   Defaults to the name of the first column of `df`.
#' @param incl Variables to include
#' @param prov Cleland province to include
#' @param ntree Number of trees, passed to `randomForest::randomForest()`
#' @param mtry Number of predictors tried at each split, passed to
#'   `randomForest::randomForest()`. If `NULL`, `floor(sqrt(p))` is used for
#'   a factor response and `max(floor(p / 3), 1)` otherwise, where `p` is the
#'   number of predictor columns. Values larger than `p` are reduced to `p`
#'   with a warning.
#' @param ret What to return: `'rf'` for the fitted forest (returned
#'   invisibly) or `'r2'` for the rounded r-squared only.
#' @param plot If `TRUE`, draw the variable importance plot and a plot of
#'   predicted against observed response.
#'
#' @details
#'
#' If `prov` is not NULL, then `Cleland_province` must be present as a column
#' in the input dataframe.
#'
#' The r-squared is taken from a linear model of the forest's predictions on
#' the observed response and is rounded to two decimals. It is only computed
#' for non-factor responses; for a factor response it is `NA`.
#'
#' @return With `ret = 'rf'`, the fitted `randomForest` object (invisibly);
#'   with `ret = 'r2'`, a single numeric value.
#'
#' @examples
#' \dontrun{
#' RandomForestWrapper(df = mydf)
#' }
#' @export
RandomForestWrapper <- function(
  df, y = colnames(df)[1], incl = colnames(df), prov = NULL,
  ntree = 500, mtry = NULL, ret = 'rf', plot = FALSE
  ) {

  # Validate the cheap arguments up front so that a bad call fails before
  # any data preparation or model fitting is attempted:
  if (!is.character(y) || length(y) != 1 || is.na(y)) {
    stop('`y` must be a single column name', call. = FALSE)
  }
  if (!y %in% colnames(df)) {
    stop("Response variable '", y, "' not found in `df`", call. = FALSE)
  }
  if (!is.character(ret) || length(ret) != 1 || !ret %in% c('rf', 'r2')) {
    stop("`ret` must be either 'rf' or 'r2'", call. = FALSE)
  }

  # Prepare input df. `unique()` keeps the response from being requested
  # twice when it is already part of `incl` (as it is by default).
  # NOTE(review): max_lvl = 53 presumably mirrors randomForest's limit on
  # the number of levels of a categorical predictor - confirm against
  # RSFIA::PrepDataForModels.
  in_df <- RSFIA::PrepDataForModels(df, cc = TRUE, char_to_fac = TRUE,
                                    max_lvl = 53, incl = unique(c(incl, y)))
  if (length(prov) > 0) {
    if (!'Cleland_province' %in% colnames(in_df)) {
      stop('`prov` was supplied but `Cleland_province` is not a column of ',
           'the prepared data', call. = FALSE)
    }
    in_df <- in_df[in_df$Cleland_province %in% prov, , drop = FALSE]
  }
  if (!y %in% colnames(in_df)) {
    stop("Response variable '", y, "' was dropped during data preparation",
         call. = FALSE)
  }
  if (nrow(in_df) < 1) {
    stop('No rows left after data preparation / province filtering',
         call. = FALSE)
  }

  # Split response from predictors. A logical mask with drop = FALSE keeps
  # a data frame even when a single predictor column remains.
  y_in <- in_df[[y]]
  x_in <- in_df[, colnames(in_df) != y, drop = FALSE]
  n_pred <- ncol(x_in)
  if (n_pred < 1) {
    stop('No predictor columns left after removing the response',
         call. = FALSE)
  }
  is_regression <- !is.factor(y_in)

  # Set RF parameters:
  if (length(mtry) < 1) {
    if (is_regression) {
      mtry <- max(floor(n_pred / 3), 1)
    } else {
      mtry <- floor(sqrt(n_pred))
    }
  } else if (mtry > n_pred) {
    mtry <- n_pred
    warning('mtry input too large, set to bagging')
  }

  # Run RF:
  out_rf <- randomForest::randomForest(
    x = x_in, y = y_in, do.trace = TRUE, mtry = mtry, ntree = ntree
    )

  # Predictions for the training rows; called without new data, so these
  # are the predictions stored with the fitted forest.
  out_pred <- stats::predict(out_rf)
  if (isTRUE(plot)) {
    randomForest::varImpPlot(out_rf)
    graphics::plot(out_rf$y, out_pred,
                   xlab = 'Observed Y', ylab = 'Predicted Y')
  }

  # A linear-model r-squared is only meaningful for a numeric response:
  if (is_regression) {
    r2 <- round(summary(stats::lm(out_pred ~ y_in))$r.squared, 2)
  } else {
    r2 <- NA_real_
  }

  # Return r2 or rf:
  if (ret == 'r2') {
    if (!is_regression) {
      warning('r-squared is not defined for a factor response, returning NA')
    }
    return(r2)
  }
  if (is_regression) {
    message('Predicted vs observed r-squared:')
    print(r2)
  }
  invisible(out_rf)
}
# bmcnellis/RSFIA documentation built on June 1, 2019, 7:40 a.m.