# R/deprecated_PrepDataForModels.R

#' @title Cleans up/preps dataframes for use in statistical model function
#' @description Subsets a data frame to the requested columns and then,
#' depending on the options, drops non-numeric columns, sparsely populated
#' columns, rows containing `NA`, and factors with too many levels. Character
#' columns can be converted to factors along the way.
#'
#' @param df Input dataframe
#' @param incl Which variables to include? Character vector of column names;
#' at least two must match columns of `df`.
#' @param cc Only include complete cases?
#' @param char_to_fac Convert all characters to factors?
#' @param max_lvl Max # of factor levels? Useful for RF analysis
#' @param num_only Exclude all non-numeric variables?
#' @param vct Fractional threshold of non-NA values required
#' to include a variable. A column is kept when its fraction of non-NA
#' values is greater than or equal to `vct`.
#'
#' @details
#'
#' `vct` executes before `cc`, allowing you to drop mostly empty columns before
#' dropping rows with incomplete data.
#'
#' The `vct` filter is skipped when the data frame has zero rows, because the
#' fraction of non-NA values is undefined in that case.
#'
#' A summary of the number of dropped rows and columns is printed with `cat()`.
#'
#' @return A data frame containing the retained rows and columns. Original
#' column names and row names are preserved.
#'
#' @export
#' @examples
#' d <- data.frame(a = c(1, 2, NA), b = c('x', 'y', 'z'), c = 4:6,
#'                 stringsAsFactors = FALSE)
#' deprecated_PrepDataForModels(d, cc = TRUE, vct = 0.5)
deprecated_PrepDataForModels <- function(df, incl = colnames(df), cc = FALSE,
                                         char_to_fac = TRUE, max_lvl = 53,
                                         num_only = FALSE, vct = 0.9) {
  # Setup:
  if (!is.data.frame(df)) stop('Input must be a data frame')

  # Every column subset below uses `drop = FALSE`; without it a single
  # surviving column collapses to a bare vector and later steps fail.
  out_df <- df[, colnames(df) %in% incl, drop = FALSE]
  if (ncol(out_df) < 2) stop('Error subsetting columns - named correctly?')

  if (num_only) {
    # One logical per column. `unlist(lapply(., class))` would misalign for
    # multi-class columns such as POSIXct or ordered factors.
    is_num <- vapply(out_df, is.numeric, logical(1))
    out_df <- out_df[, is_num, drop = FALSE]
  }

  # Drop sparsely populated columns (skipped for zero rows, where 0/0 = NaN).
  if (nrow(out_df) > 0) {
    frac_present <- vapply(out_df, function(x) {
      sum(!is.na(x)) / length(x)
    }, numeric(1))
    # `>=` so that vct = 1 keeps fully populated columns.
    out_df <- out_df[, frac_present >= vct, drop = FALSE]
  }

  if (cc) {
    has_na <- rowSums(is.na(out_df)) > 0
    out_df <- out_df[!has_na, , drop = FALSE]
  }

  if (char_to_fac) {
    # Convert in place so column names and row names are left untouched.
    is_chr <- vapply(out_df, is.character, logical(1))
    if (any(is_chr)) {
      out_df[is_chr] <- lapply(out_df[is_chr], as.factor)
    }
  }

  # Drop factors with more levels than `max_lvl`.
  too_many_lvl <- vapply(out_df, function(x) {
    is.factor(x) && nlevels(x) > max_lvl
  }, logical(1))
  out_df <- out_df[, !too_many_lvl, drop = FALSE]

  # Return:
  if (ncol(out_df) < 1) stop('Dropped all columns')
  cat('Rows dropped:', nrow(df) - nrow(out_df),
      '\nCols dropped:', ncol(df) - ncol(out_df), '\n')
  out_df
}
# bmcnellis/RSFIA documentation built on June 1, 2019, 7:40 a.m.