R/AutoBin_Binary.R
In CASMI: 'CASMI'-Based Functions

Documented in autoBin.binary

# return suggested threshold for a quantitative variable
autoBinThresh <- function(data, colIndex){ # the column index of a single x that needs auto binning

  finalCut <- NA

  if (colIndex >= ncol(data) || colIndex <= 0) {
    return(finalCut)
  }

  # deal with NAs, equal values, and other problems
  if ( (sum(!is.na(data[,colIndex])) <= 1) || (all(na.omit(data[,colIndex]) == na.omit(data[,colIndex])[1])) || !is.numeric(data[,colIndex]) ) {
    return(finalCut)
  }


  XY <- na.omit( data[ , c(colIndex, ncol(data))] ) # subset for na.omit

  # get all possible cutting points
  cuts <- sort(unique(XY[,1]))
  cuts <- cuts[-length(cuts)] # excluding the last value

  # get the first optimal cutting point based on MI
  maxMI <- 0
  for(value in cuts){
    tmpMI <- MI.z(table(cut(XY[,1], breaks = c(-Inf, value, Inf), labels = c("L", "H"), right = TRUE), XY[,2]))
    if(tmpMI > maxMI){
      finalCut <- value
      maxMI <- tmpMI
    }
  }
  return(finalCut)

}

#' Automatically Dichotomize Quantitative Variables
#'
#' Automatically compute optimal cutting points (based on mutual information) to dichotomize quantitative variables. This function can be used as a pre-processing step before using the \pkg{CASMI}-based functions.
#' @param data data frame with variables as columns and observations as rows. The outcome variable (Y) MUST be categorical or discrete. The outcome variable (Y) MUST be the last column.
#' @param index index or a vector of indices of the quantitative features (a.k.a., predictors, factors, independent variables) that need to be automatically categorized.
#' @return `autoBin.binary()` returns the entire data frame after automatically dichotomizing the selected quantitative variable(s).
#' @examples
#' ## Using the "iris" dataset embedded in R
#' data("iris")
#' head(iris) # The original data
#'
#' # ---- Dichotomize One Single Feature ----
#' # Dichotomize the column with index 1.
#' newData1 <- autoBin.binary(iris, 1)
#' head(newData1)
#'
#' # ---- Dichotomize Multiple Features at a Time ----
#' # Dichotomize the columns with indices 1, 2, 3, and 4.
#' newData2 <- autoBin.binary(iris, c(1,2,3,4))
#' head(newData2)
#'
#' # ---- Dichotomize Features Using Column Names ----
#' # Dichotomize the columns with the names "Sepal.Length" and "Sepal.Width".
#' cols_of_interest <- c("Sepal.Length", "Sepal.Width")
#' col_indices <- which(names(iris) %in% cols_of_interest)
#' newData3 <- autoBin.binary(iris, col_indices)
#' head(newData3)
#'
#' @importFrom EntropyEstimation MI.z
#' @importFrom stats na.omit
#'
#' @export


# return the finalized data frame after auto binning
autoBin.binary <- function(data, index){

  if (!is.data.frame(data)) {
    stop("Error: The input is not of data frame type.")
  }

  data <- as.data.frame(data)

  # Check if the inputs are of correct type
  if (!is.numeric(index)) {
    stop("Error: The 'index' input must be an index or a vector of indices.")
  }

  for(i in 1:length(index)){
    colIndex <- index[i]
    finalCut <- autoBinThresh(data, colIndex)
    if(!is.na(finalCut)){
      data[,colIndex] <- cut(data[,colIndex], breaks = c(-Inf, finalCut, Inf), right = TRUE) # cut the original variable by the threshold

      # Rename the column to "previous_column_name_Binned"
      original_name <- names(data)[colIndex]
      new_name <- paste0(original_name, "_Binned")
      names(data)[colIndex] <- new_name
    } else {
      sentence1 <- "The automatic categorization is not performed for this variable -- Column Index: "
      sentence1 <- paste0(sentence1, colIndex)
      sentence2 <- " -- due to the following possible issues:
      (1) The column is not of a numeric type;
      (2) The column index is not valid;
      (3) There are no values or only one distinct value in this variable;
      or (4) This variable is independent of the outcome based on Mutual Information, so no optimal cut can be found."
      sentence1 <- paste0(sentence1, sentence2)
      warning(sentence1)
    }
  }

  return(as.data.frame(data))
}