R/CombinationMining.R
In CASMI: 'CASMI'-Based Functions

Documented in CASMI.mineCombination

source("./R/CommonFunctions.R")

# New Helper Functions for CASMI.mineCombination

## validateDF
# Data frame check and conversion if necessary
# Used to make sure the input data is a data frame and has more than one column
validateDF <- function(data) {
  if (!is.data.frame(data)) {
    stop("Error: The input is not of data frame type.")
  }
  data <- as.data.frame(data)
  if (ncol(data) <= 1) {
    stop("Error: Not enough features are available to perform a selection.")
  }
  return(data)
}

## checkOutcomeNA
# Check outcome column for NAs
# Used to make sure the last column (outcome) has no NA values.
checkOutcomeNA <- function(data) {
  if (any(is.na(data[, ncol(data)])))
    stop(
      "Error: The last column contains NA values. Please ensure that the outcome is located in the last column without any NA values present."
    )
}


## processNA
# Handle NA values
# Used to handle NA values in data based on the specified method
processNA <- function(data, NA_handle) {
  if (NA_handle == "stepwise") {
    # Do nothing (stepwise handling)
  } else if (NA_handle == "na.omit") {
    data <- na.omit(data)
  } else if (NA_handle == "NA as a category") {
    data[] <-
      apply(data, 2, function(column)
        ifelse(is.na(column), "=na=", column))
    data <- as.data.frame(data)
  } else {
    stop("Error: Invalid 'NA.handle' value. Please check the document.")
  }
  return(data)
}

## validateParams
# Validate input parameters
# Used to make sure all input parameters are valid
validateParams <- function(intermediate_steps,
                           kappa_star_cap,
                           num_of_comb) {
  if (!is.logical(intermediate_steps) ||
      length(intermediate_steps) != 1)
    stop("Error: 'intermediate.steps' must be either TRUE or FALSE.")
  if (!is.numeric(kappa_star_cap) ||
      kappa_star_cap < 0 || kappa_star_cap > 1)
    stop("Error: 'kappa.star.cap' must be between 0 and 1.")
  if (!is.numeric(num_of_comb) || num_of_comb < 1)
    stop("Error: 'NumOfComb' must be at least 1.")
}

# Statistical metrics calculation function
# Used to handle the repetitive calculations for statistical metrics

#' @importFrom stats ftable
calcStats <- function(data, indexCASMI, alpha) {
  index <- c(indexCASMI, ncol(data))
  data1 <- data[, index, drop = FALSE]
  tb1 <- ftable(table(data1))
  XYtable <- ftable2xytable(tb1)
  Ymargin <- colSums(XYtable)

  mi_z <- MI.z(XYtable)
  smi_z <- mi_z / Entropy.z(Ymargin)

  low <- smi_z - E.CI.smi(XYtable, alpha)
  uppr <- smi_z + E.CI.smi(XYtable, alpha)

  n <- sum(XYtable)
  K1 <- nrow(XYtable)
  K2 <- ncol(XYtable)

  p_mi_z <- pchisq(2 * n * mi_z + (K1 - 1) * (K2 - 1),
                   df = (K1 - 1) * (K2 - 1),
                   lower.tail = FALSE)

  return(list(
    SMIz = smi_z,
    SMIz.Low = low,
    SMIz.Upr = uppr,
    p.MIz = p_mi_z
  ))
}


## resultsSelect
# Function for CASMI.selectFeatures
# Used to handle the scenario where NumOfVar is not provided
resultsSelect <- function(data,
                          NA_handle,
                          alpha_ind,
                          alpha,
                          intermediate_steps,
                          kappa_star_cap) {
  result <- CASMI.selectFeatures(
    data = data,
    NA.handle = NA_handle,
    alpha.ind = alpha_ind,
    alpha = alpha,
    intermediate.steps = intermediate_steps,
    kappa.star.cap = kappa_star_cap
  )

  indexCASMI <- result$Results$Var.Idx
  namesCASMI <- result$Results$Var.Name

  if (length(indexCASMI) == 0) {
    stop(
      "No features were identified as being associated with the outcome. Please ensure that the outcome variable is located in the last column and that the data has been appropriately preprocessed."
    )
  }

  kappa_star <- result$KappaStar
  kappa_star_ci <- result$KappaStarCI

  metrics <- calcStats(data, indexCASMI, alpha)

  results_df <- data.frame(
    Comb.Idx = paste0("(", paste(indexCASMI, collapse = ", "), ")"),
    n = sum(table(data[, indexCASMI[1]], data[, ncol(data)])),
    KappaStar = sprintf("%.4f", kappa_star),
    KappaStarLow = sprintf("%.4f", kappa_star_ci[1]),
    KappaStarUpr = sprintf("%.4f", kappa_star_ci[2]),
    SMIz = sprintf("%.4f", metrics$SMIz),
    SMIzLow = sprintf("%.4f", metrics$SMIz.Low),
    SMIzUpr = sprintf("%.4f", metrics$SMIz.Upr),
    p.MIz = sprintf("%.4f", metrics$p.MIz),
    Var.Name = paste0("(", paste(namesCASMI, collapse = ", "), ")")
  )

  names(results_df) <- c(
    "Comb.Idx",
    "n",
    "kappa*",
    "kappa*.low",
    "kappa*.upr",
    "SMIz",
    "SMIz.low",
    "SMIz.upr",
    "p.MIz",
    "Var.Name"
  )

  row.names(results_df) <- seq_len(nrow(results_df))

  return(
    list(
      Outcome = names(data)[ncol(data)],
      Conf.Level = 1 - alpha,
      NumOfVar = length(indexCASMI),
      TopResults = results_df
    )
  )
}


## resultsComb
# Function for provided NumOfVar
# Used to handle the scenario where NumOfVar is provided
resultsComb <- function(data,
                        NumOfVar,
                        alpha,
                        alpha.ind,
                        intermediate_steps,
                        NumOfComb) {
  features <- names(data)[1:(ncol(data) - 1)]


  #------ To exclude completely independent features based on alpha.ind
  selected <- sapply(features, function(feature) {
    p_val <- MIz.test(data[[feature]], data[[ncol(data)]])
    p_val <= alpha.ind
  })
  features <- features[selected]

  if (length(NumOfVar) != 1 || NumOfVar < 1 || NumOfVar > length(features)) {
    stop(
      sprintf(
        "Error: 'NumOfVar' must be a positive integer not exceeding %d (which represents the maximum number of potential features in the input dataset, given the specified 'alpha.ind' value and considering the last column as the outcome).",
        length(features)
      )
    )
  }
  #------


  combinations <- combn(features, NumOfVar, simplify = FALSE)

  results_list <- list()
  start_time <- proc.time()

  for (i in seq_len(length(combinations))) {
    comb <- combinations[[i]]
    idxFeatures <- match(comb, names(data))
    kappa_res <- Kappas(data, idxFeatures)

    if (is.null(kappa_res) || is.na(kappa_res$kappa.star))
      next

    tmpIndexFnO <- c(idxFeatures, ncol(data))
    ftable <- ftable(table(data[tmpIndexFnO]))
    tmpXYtable <- ftable2xytable(ftable)

    E <- E.CI.smi(tmpXYtable, alpha)
    kappa_star <- round(kappa_res$kappa.star, 4)

    metrics <- calcStats(data, idxFeatures, alpha)

    elapsed_time <- round((proc.time() - start_time)[3], 2)

    if (intermediate_steps)
      cat(
        sprintf(
          "Processed Combination %d - selected columns (idx.): %s; kappa*: %.4f; elapsed (sec.): %.2f\n",
          i,
          paste(idxFeatures, collapse = ", "),
          kappa_star,
          elapsed_time
        )
      )

    results_list[[i]] <- list(
      Comb.Idx = paste0("(", paste(idxFeatures, collapse = ", "), ")"),
      n = sum(tmpXYtable),
      KappaStar = sprintf("%.4f", kappa_star),
      KappaStarLow = sprintf("%.4f", round(kappa_star - E, 4)),
      KappaStarUpr = sprintf("%.4f", round(kappa_star + E, 4)),
      SMIz = sprintf("%.4f", round(metrics$SMIz, 4)),
      SMIzLow = sprintf("%.4f", round(metrics$SMIz.Low, 4)),
      SMIzUpr = sprintf("%.4f", round(metrics$SMIz.Upr, 4)),
      p.MIz = sprintf("%.4f", metrics$p.MIz),
      Var.Name = paste0("(", paste(names(data)[idxFeatures], collapse = ", "), ")")
    )
  }

  results_df <-
    do.call(rbind, lapply(results_list, function(x)
      unlist(x)))
  results_df <- as.data.frame(results_df, stringsAsFactors = FALSE)
  results_df$KappaStar <- as.numeric(results_df$KappaStar)

  numeric_cols <- sapply(results_df, is.numeric)
  results_df[numeric_cols] <- round(results_df[numeric_cols], 4)

  results_df <- results_df[order(-results_df$KappaStar),]
  results_df <- head(results_df, NumOfComb)

  results_df <- results_df[, c(
    "Comb.Idx",
    "n",
    "KappaStar",
    "KappaStarLow",
    "KappaStarUpr",
    "SMIz",
    "SMIzLow",
    "SMIzUpr",
    "p.MIz",
    "Var.Name"
  )]
  names(results_df) <- c(
    "Comb.Idx",
    "n",
    "kappa*",
    "kappa*.low",
    "kappa*.upr",
    "SMIz",
    "SMIz.low",
    "SMIz.upr",
    "p.MIz",
    "Var.Name"
  )
  results_df <-
    results_df[order(-as.numeric(results_df$`kappa*`)),]
  row.names(results_df) <- seq_len(nrow(results_df))

  return(
    list(
      Outcome = names(data)[ncol(data)],
      Conf.Level = 1 - alpha,
      NumOfVar = NumOfVar,
      TopResults = results_df
    )
  )
}

# CASMI.mineCombination

#' Discover Factor Combinations based on \pkg{CASMI}
#'
#' The `CASMI.mineCombination()` function is designed to suggest combinations of factors that are most strongly associated with the outcome in a dataset. This function is partially developed based on the `CASMI.selectFeatures()` function. (Synonyms for "factor" in this document: "independent variable," "feature," and "predictor.")
#'
#' @param data data frame with variables as columns and observations as rows. The data MUST include at least one feature (a.k.a., independent variable, predictor, factor) and only one outcome variable (Y). The outcome variable MUST BE THE LAST COLUMN. Both the features and the outcome MUST be categorical or discrete. If variables are not naturally discrete, you may preprocess them using the `autoBin.binary()` function in the same package.
#' @param NumOfVar the number of variables in a combination (integer). This setting is optional. If NULL, an automatically suggested number of variables will be returned.
#' @param NA.handle method for handling missing values. This parameter is inherited from the `CASMI.selectFeature()` function. There are three possible options: `NA.handle = "stepwise"` (default), `NA.handle = "na.omit"`, or `NA.handle = "NA as a category"`. Check the `CASMI.selectFeature()` documentation for more details.
#' @param alpha level of significance used for the confidence intervals in the results; the default is 0.05.
#' @param alpha.ind level of significance used for the initial screening of features based on a test of independence; the default is 0.1. This parameter is also used in the `CASMI.selectFeature()` function; check the `CASMI.selectFeature()` documentation for more details.
#' @param intermediate.steps setting for outputting intermediate steps while awaiting the final results. There are two possible settings: `intermediate.steps = TRUE` or `intermediate.steps = FALSE`.
#' @param kappa.star.cap threshold of `kappa*` for halting the feature selection process. This parameter is inherited from the `CASMI.selectFeature()` function; check the `CASMI.selectFeature()` documentation for more details. This setting is applicable only when `NumOfVar` is set to NULL (default).
#' @param NumOfComb the number of top combinations to be returned; the default is 3. This setting is used only when a `NumOfVar` value is defined (not NULL); if `NumOfVar == NULL`, only the automatically suggested combination will be returned.
#' @return `CASMI.mineCombination()` returns the following components:
#' \itemize{
  #'   \item \code{`Outcome`}: Name of the outcome variable (last column) in the input dataset.
  #'   \item \code{`Conf.Level`}: Confidence level used for the results.
  #'   \item \code{`NumOfVar`}: The number of variables in each combination.
  #'   \item \code{`TopResults`}: A results data frame. The number of combinations (rows) returned depends on the `NumOfComb` setting.
  #'   \item \code{`Comb.Idx`}: Indices of the variables in the combination.
  #'   \item \code{`n`}: Number of observations used in the analysis.
  #'   \item \code{`kappa*`}: A comprehensive score reflecting the association between the factor combination and the outcome. A larger `kappa*` indicates that the factor combination has a stronger association with the outcome. For more information about `kappa*`, please refer to the paper: Shi, J., Zhang, J. and Ge, Y. (2019), "\pkg{CASMI}—An Entropic Feature Selection Method in Turing’s Perspective" <doi:10.3390/e21121179>
  #'   \item \code{`kappa*.low`}: Lower bound of the confidence interval for `kappa*`.
  #'   \item \code{`kappa*.upr`}: Upper bound of the confidence interval for `kappa*`.
  #'   \item \code{`SMIz`}: Standardized Mutual Information (SMI) (using the z-estimator) between the factor combination and the outcomes.
  #'   \item \code{`SMIz.low`}: Lower bound of the confidence interval for `SMIz`.
  #'   \item \code{`SMIz.upr`}: Upper bound of the confidence interval for `SMIz`.
  #'   \item \code{`p.MIz`}: P-value between the factor combination and the outcome using the mutual information test of independence based on the z-estimator.
  #'   \item \code{`Var.Name`}: Names of the variables in the combination.
  #' }
#'
#' @examples
#' # ---- Generate a toy dataset for usage examples: "data" ----
#' set.seed(123)
#' n <- 200
#' x1 <- sample(c("A", "B", "C", "D"), size = n, replace = TRUE, prob = c(0.1, 0.2, 0.3, 0.4))
#' x2 <- sample(c("W", "X", "Y", "Z"), size = n, replace = TRUE, prob = c(0.4, 0.3, 0.2, 0.1))
#' x3 <- sample(c("E", "F", "G", "H", "I"), size = n,
#'              replace = TRUE, prob = c(0.2, 0.3, 0.2, 0.2, 0.1))
#' x4 <- sample(c("A", "B", "C", "D"), size = n, replace = TRUE)
#' x5 <- sample(c("L", "M", "N"), size = n, replace = TRUE)
#' x6 <- sample(c("E", "F", "G", "H", "I"), size = n, replace = TRUE)
#'
#' # Generate y variable dependent on x1 to x3
#' x1_num <- as.numeric(factor(x1, levels = c("A", "B", "C", "D")))
#' x2_num <- as.numeric(factor(x2, levels = c("W", "X", "Y", "Z")))
#' x3_num <- as.numeric(factor(x3, levels = c("E", "F", "G", "H", "I")))
#' # Calculate y with added noise
#' y_numeric <- 3*x1_num + 2*x2_num - 2*x3_num + rnorm(n,mean=0,sd=2)
#' # Discretize y into categories
#' y <- cut(y_numeric, breaks = 10, labels = paste0("Category", 1:10))
#'
#' # Combine into a dataframe
#' data <- data.frame(x1, x2, x3, x4, x5, x6, y)
#'
#' # The outcome of the toy dataset is dependent on x1, x2, and x3
#' # but is independent of x4, x5, and x6.
#' head(data)
#'
#'
#' # ---- Usage Examples ----
#'
#' ## Return the suggested combination with the default settings:
#' CASMI.mineCombination(data)
#'
#' ## Return combinations when the number of variables to be included
#' ## in each combination is specified (e.g., NumOfVar = 2):
#' CASMI.mineCombination(data, NumOfVar = 2)
#'
#' ## Return combinations when the number of variables to be included
#' ## in each combination is specified (e.g., NumOfVar = 2),
#' ## while the number of top combinations to return is specified
#' ## (e.g., NumOfComb = 2):
#' CASMI.mineCombination(data,
#'                      NumOfVar = 2,
#'                      NumOfComb = 2)
#'
#' @importFrom EntropyEstimation Entropy.z MI.z
#' @importFrom entropy entropy.plugin mi.plugin
#' @importFrom stats pchisq qnorm
#' @importFrom utils combn head
#'
#' @export


## Main function: CASMI.mineCombination
CASMI.mineCombination <- function(data,
                                  NumOfVar = NULL,
                                  NA.handle = "stepwise",
                                  alpha = 0.05,
                                  alpha.ind = 0.1,
                                  intermediate.steps = FALSE,
                                  kappa.star.cap = 1.0,
                                  NumOfComb = 3) {
  # Step 0: Checks and Setup
  if (!is.data.frame(data)) {
    stop("Error: The input is not of data frame type.")
  }

  outcome.has.na <- any(is.na(data[,ncol(data)]))
  if (outcome.has.na) {
    stop("Error: The last column contains NA values. Please ensure that the outcome is positioned in the last column; NA is not permitted in the outcome.")
  }

  # Parameter validation
  if (!is.numeric(alpha.ind) || alpha.ind < 0 || alpha.ind > 1) {
    stop("Error: 'alpha.ind' must be numeric and between 0 and 1.")
  }

  if (!is.numeric(alpha) || alpha < 0 || alpha > 1) {
    stop("Error: 'alpha' must be numeric and between 0 and 1.")
  }

  if (!is.logical(intermediate.steps) || length(intermediate.steps) != 1) {
    stop("Error: 'intermediate.steps' must be either TRUE or FALSE.")
  }

  if (!is.numeric(kappa.star.cap) || kappa.star.cap < 0 || kappa.star.cap > 1) {
    stop("Error: 'kappa.star.cap' must be numeric and between 0 and 1.")
  }

  if (!is.numeric(NumOfComb) || NumOfComb < 1) {
    stop("Error: 'NumOfComb' must be a positive integer.")
  }

  data <- processNA(data, NA.handle)

  # If NumOfVar is not provided, use CASMI.selectFeatures
  if (is.null(NumOfVar)) {
    return(
      resultsSelect(
        data = data,
        NA_handle = NA.handle,
        alpha_ind = alpha.ind,
        alpha = alpha,
        intermediate_steps = intermediate.steps,
        kappa_star_cap = kappa.star.cap
      )
    )
  } else {
    if (!is.numeric(NumOfVar)) {
      stop("Error: 'NumOfVar' must be a number.")
    }

    NumOfVar <- as.integer(NumOfVar)


    return(
      resultsComb(data, NumOfVar, alpha, alpha.ind, intermediate.steps, NumOfComb)
    )
  }
}