R/FeatureSelection.R
In CASMI: 'CASMI'-Based Functions

Documented in CASMI.selectFeatures

generateResults <-
  function(data, indexCASMI, kappaStarCASMI, alpha) {
    # return a set of results based on indexCASMI

    nameCASMI = names(data[indexCASMI])

    # 1. SMIziCASMI: standardized mutual information (with z estimator, based on H(Y)) of each selected feature. Not cumulative.
    # 2. PziCASMI: p-value of mutual information test (with z estimator) for each selected feature. (to test independence, prefer large n)
    SMIziCASMI = vector()
    SMIziCASMI.CI = vector()
    PziCASMI = vector()
    for (i in 1:length(indexCASMI)) {
      XYtable = table(data[,indexCASMI[i]], data[,ncol(data)]) # XYtable handled NA automatically
      Ymargin = colSums(XYtable)

      mi_z = MI.z(XYtable) # EntropyEstimation package
      smi_z = mi_z / Entropy.z(Ymargin)
      SMIziCASMI = c(SMIziCASMI, smi_z)

      low = smi_z - E.CI.smi(XYtable, alpha)
      uppr = smi_z + E.CI.smi(XYtable, alpha)
      SMIziCASMI.CI <- rbind(SMIziCASMI.CI, c(low, uppr))

      n = sum(XYtable)
      K1 = nrow(XYtable)
      K2 = ncol(XYtable)

      PziCASMI = c(PziCASMI, pchisq(
        2 * n * mi_z + (K1 - 1) * (K2 - 1),
        df = (K1 - 1) * (K2 - 1),
        lower.tail = FALSE
      )) # same as MIz.test()
    }



    #######

    sample_size <- apply(data[indexCASMI], 2, function(col) sum(!is.na(col)))


    df <-
      data.frame(
        indexCASMI,
        sample_size,
        round(kappaStarCASMI, 4),
        round(SMIziCASMI, 4),
        round(SMIziCASMI.CI, 4),
        sprintf("%.4f", round(PziCASMI, 4)),
        nameCASMI,
        row.names = NULL
      )

    colnames(df) <-
      c(
        "Var.Idx",
        "n",
        "cml.kappa*",
        "SMIz",
        "SMIz.Low",
        "SMIz.Upr",
        "p.MIz",
        "Var.Name"
      )



    ### overall CASMI score for all selected features
    kappa.star.hat <- Kappas(data, indexCASMI)$kappa.star
    tmpIndexFnO = c(indexCASMI, ncol(data)) # Index of features and outcome
    ftable = ftable(table(data[tmpIndexFnO]))
    tmpXYtable <- ftable2xytable(ftable)
    E <- E.CI.smi(tmpXYtable, alpha)
    low = kappa.star.hat - E
    low = round(low, 6)
    uppr = kappa.star.hat + E
    uppr = round(uppr, 6)
    ci.kappa.star.hat = c(low, uppr)

    kappa.star.hat = round(kappa.star.hat, 6)
    names(kappa.star.hat) = c("kappa* for all selected features")


    names(ci.kappa.star.hat) = c("kappa* CI lower", "upper")


    if (kappa.star.hat != round(kappaStarCASMI[length(kappaStarCASMI)],6)) {
      warning(
        "Mismatched kappa* values detected. There may be an issue with the data or the function. Please report the issue to the package developer after ensuring that the data has been properly preprocessed."
      )

    }


    ##### return

    return(
      list(
        Outcome = names(data[ncol(data)]),
        Conf.Level = 1 - alpha,
        KappaStar = kappa.star.hat,
        KappaStarCI = ci.kappa.star.hat,
        Results = df
      )
    )

  }

# # Function to check for long values
# check_length <- function(column) {
#   column = na.omit(column)
#   return(any(nchar(as.character(column)) > 100))
# }

#' \pkg{CASMI} Feature Selection
#'
#' Selects features that are associated with an outcome while taking into account a sample coverage penalty and feature redundancy. It automatically determines the number of features to be selected, and the chosen features are ranked. (Synonyms for "feature" in this document: "independent variable," "factor," and "predictor.") \cr
#' For additional information, please refer to the publication: Shi, J., Zhang, J. and Ge, Y. (2019), "\pkg{CASMI}—An Entropic Feature Selection Method in Turing’s Perspective" <doi:10.3390/e21121179>
#' @param data data frame with variables as columns and observations as rows. The data MUST include at least one feature (a.k.a., independent variable, predictor, factor) and only one outcome variable (Y). The outcome variable MUST BE THE LAST COLUMN. Both the features and the outcome MUST be categorical or discrete. If variables are not naturally discrete, you may preprocess them using the `autoBin.binary()` function in the same package.
#' @param NA.handle options for handling NA values in the data. There are three options: `NA.handle = "stepwise"` (default), `NA.handle = "na.omit"`, and `NA.handle = "NA as a category"`. (1) `NA.handle = "stepwise"` excludes NA rows only when a particular variable is being used in a sub-step. For example, suppose we have data (Feature1: A, NA, B; Feature2: C, D, E; Feature3: F, G, H; Outcome: O, P, Q); the second observation will be excluded only when a particular step includes Feature1, but will not be excluded when a step is analyzing only Feature2, Feature3, and the Outcome. This option is designed to take advantage of the maximum possible number of observations. (2) `NA.handle = "na.omit"` excludes observations with any NA values at the beginning of the analysis. (3) `NA.handle = "NA as a category"` treats the NA value as a new category. This is designed to be used when NA values in the data have a consistent meaning instead of being missing values. For example, in survey data asking for comments, each NA value might consistently mean "no opinion."
#' @param alpha level of significance for the confidence intervals in final results. By default, `alpha = 0.05`.
#' @param alpha.ind level of significance for the mutual information test of independence in step 1 of the features selection (for an initial screening). The smaller the `alpha.ind`, the fewer features are sent to step 2 (<doi:10.3390/e21121179>). By default, `alpha.ind = 0.1`.
#' @param intermediate.steps setting for outputting intermediate steps while awaiting the final results. There are two possible settings: `intermediate.steps = TRUE` or `intermediate.steps = FALSE`.
#' @param kappa.star.cap a threshold of `kappa*` for halting the feature selection process. The program will automatically terminate at the first feature whose cumulative `kappa*` value exceeds the `kappa.star.cap` threshold. By default, `kappa.star.cap = 1.0`, which is the maximum possible value. A lower value may result in fewer final features but reduced computing time.
#' @param feature.num.cap the maximum number of features to be selected. A lower value may result in fewer final features but less computing time.
#' @return `CASMI.selectFeatures()` returns the following components:
#' \itemize{
#'    \item \code{`Outcome`}: Name of the outcome variable (last column) in the input dataset.
#'    \item \code{`Conf.Level`}: Confidence level used for the results.
#'    \item \code{`KappaStar`}: The estimated `kappa*` of all selected features. A larger `kappa*` indicates that the selected features have a stronger association with the outcome.
#'    \item \code{`KappaStarCI`}: The confidence interval of `kappa*` for all selected features.
#'    \item \code{`Results`}: A results data frame. The selected features are ranked.
#'    \item \code{`Var.Idx`}: Column index of the selected feature.
#'    \item \code{`n`}: Number of observations used in the analysis.
#'    \item \code{`cml.kappa*`}: The estimated cumulative `kappa*` score when this particular feature was added to the list. That is, the `kappa*` score of all currently selected features.
#'    \item \code{`SMIz`}: The Standardized Mutual Information (SMI) (using the z-estimator) between this particular feature and the outcome.
#'    \item \code{`SMIz.Low`}: Lower bound of the confidence interval for `SMIz`.
#'    \item \code{`SMIz.Upr`}: Upper bound of the confidence interval for `SMIz`.
#'    \item \code{`p.MIz`}: P-value between this particular feature and the outcome using the mutual information test of independence based on the z-estimator.
#'    \item \code{`Var.Name`}: Column name of the selected feature.
#' }
#'
#' @examples
#' # ---- Generate a toy dataset for usage examples: "data" ----
#' set.seed(123)
#' n <- 200
#' x1 <- sample(c("A", "B", "C", "D"), size = n, replace = TRUE, prob = c(0.1, 0.2, 0.3, 0.4))
#' x2 <- sample(c("W", "X", "Y", "Z"), size = n, replace = TRUE, prob = c(0.4, 0.3, 0.2, 0.1))
#' x3 <- sample(c("E", "F", "G", "H", "I"), size = n,
#'              replace = TRUE, prob = c(0.2, 0.3, 0.2, 0.2, 0.1))
#' x4 <- sample(c("A", "B", "C", "D"), size = n, replace = TRUE)
#' x5 <- sample(c("L", "M", "N"), size = n, replace = TRUE)
#' x6 <- sample(c("E", "F", "G", "H", "I"), size = n, replace = TRUE)
#'
#' # Generate y variable dependent on x1 to x3
#' x1_num <- as.numeric(factor(x1, levels = c("A", "B", "C", "D")))
#' x2_num <- as.numeric(factor(x2, levels = c("W", "X", "Y", "Z")))
#' x3_num <- as.numeric(factor(x3, levels = c("E", "F", "G", "H", "I")))
#' # Calculate y with added noise
#' y_numeric <- 3*x1_num + 2*x2_num - 2*x3_num + rnorm(n,mean=0,sd=2)
#' # Discretize y into categories
#' y <- cut(y_numeric, breaks = 10, labels = paste0("Category", 1:10))
#'
#' # Combine into a dataframe
#' data <- data.frame(x1, x2, x3, x4, x5, x6, y)
#'
#' # The outcome of the toy dataset is dependent on x1, x2, and x3
#' # but is independent of x4, x5, and x6.
#' head(data)
#'
#'
#' # ---- Usage Examples ----
#'
#' ## Select features and provide relevant results:
#' CASMI.selectFeatures(data)
#'
#' ## Adjust 'feature.num.cap' for including fewer features:
#' ## (Note: A lower 'feature.num.cap' value may result in fewer
#' ## final features but less computing time.)
#' CASMI.selectFeatures(data, feature.num.cap = 2)
#'
#'
#' @importFrom EntropyEstimation Entropy.z MI.z
#' @importFrom entropy entropy.plugin mi.plugin
#' @importFrom stats pchisq qnorm
#'
#' @export



CASMI.selectFeatures <- function(data,  # outcome must be in the last column
                                 NA.handle = "stepwise",
                                 alpha = 0.05,
                                 alpha.ind = 0.1,
                                 intermediate.steps = FALSE,
                                 kappa.star.cap = 1.0,
                                 feature.num.cap = ncol(data)) {

  # check data type
  if (!is.data.frame(data)) {
    stop("Error: The input is not of the data frame type.")
  }


  data <- as.data.frame(data)



  outcome.has.na <- any(is.na(data[,ncol(data)]))
  if (outcome.has.na) {
    stop("Error: The last column contains NA values. Please ensure that the outcome is placed in the last column. NA values are not permitted in the outcome.")
  }


  # NA handling
  if(NA.handle == "stepwise") {
    # do nothing
  }else if(NA.handle == "na.omit") {
    data <- na.omit(data)
  }else if(NA.handle == "NA as a category") {
    data[] <- apply(data, 2, function(column) {
      ifelse(is.na(column), "=na=", column)
    })
    data <- as.data.frame(data)
  }else {
    stop("Error: Invalid 'NA.handle' value. Please check the document.")
  }

  if (!is.numeric(alpha.ind)) {
    stop("Error: 'alpha.ind' must be numeric.")
  } else if (alpha.ind < 0 || alpha.ind > 1) {
    stop("Error: 'alpha.ind' must be between 0 and 1.")
  }

  if (!is.numeric(alpha)) {
    stop("Error: 'alpha' must be numeric.")
  } else if (alpha < 0 || alpha > 1) {
    stop("Error: 'alpha' must be between 0 and 1.")
  }

  if (!is.logical(intermediate.steps) || length(intermediate.steps) != 1) {
    stop("Error: 'intermediate.steps' must be either TRUE or FALSE.")
  }

  if (!is.numeric(kappa.star.cap)) {
    stop("Error: 'kappa.star.cap' must be numeric.")
  } else if (kappa.star.cap < 0 || kappa.star.cap > 1) {
    stop("Error: 'kappa.star.cap' must be between 0 and 1.")
  }

  if (!is.numeric(feature.num.cap)) {
    stop("Error: 'feature.num.cap' must be numeric.")
  } else if (feature.num.cap < 1) {
    stop("Error: 'feature.num.cap' must be at least 1.")
  }


  start_time <- proc.time()


  # Step 1: return dependent features, step1Index[]

  step1Index = vector()
  count = 0
  for (fi in 1:(ncol(data) - 1)) {
    p_value = MIz.test(data[,fi], data[,ncol(data)])
    if (p_value <= alpha.ind) {
      count = count + 1
      step1Index[count] = fi
    }
  }

  indexCASMI = vector()
  kappaStarCASMI = vector()

  if (count == 0) {
    warning(
      "No features were identified as being associated with the outcome. Please ensure that the outcome variable is located in the last column and that the data has been appropriately preprocessed."
    )

    indexCASMI = NULL

  } else{

    if(intermediate.steps) {
      cat(count, "feature(s) passed Step 1 (a test of independence).\n")
    }


    # Step 2: select features by joint SMI with Hz estimator

    # select the best feature (the first feature)
    maxKappaStar = 0
    maxIndex = 0
    for (index in step1Index) {
      valueKappaStar = Kappas(data, index)$kappa.star

      if (valueKappaStar > maxKappaStar) {
        maxKappaStar = valueKappaStar
        maxIndex = index
      }
    }
    indexCASMI = c(indexCASMI, maxIndex)
    kappaStarCASMI = c(kappaStarCASMI, maxKappaStar)

    if(intermediate.steps) {
      intermediateCount = 1
      cat("Selected Feature ", intermediateCount, " - selected column (idx.):", maxIndex, ", cml.kappa*:", maxKappaStar, ", elapsed (sec.):", (proc.time() - start_time)["elapsed"], "\n")
    }

    if (maxKappaStar >= kappa.star.cap) {
      warning(
        "The feature selection process was automatically terminated as cml.kappa* of the currently selected features reached the 'kappa.star.cap' value."
      )
    }
    if (length(indexCASMI) >= floor(feature.num.cap)) {
      warning(
        "The feature selection process was automatically terminated as the number of selected features reached the 'feature.num.cap' value."
      )
    }

    if (length(step1Index) == 1) {
      return(generateResults(data, indexCASMI, kappaStarCASMI, alpha))
    }



    # select the 2nd, 3rd, ... best features by joint
    maxmaxKappaStar = 0
    while (maxKappaStar > maxmaxKappaStar &
           length(indexCASMI) < count &
           maxKappaStar < kappa.star.cap &
           length(indexCASMI) < floor(feature.num.cap)) {

      maxmaxKappaStar = maxKappaStar

      step1Index = step1Index[!step1Index == maxIndex]
      maxKappaStar = 0
      maxIndex = 0
      for (index in step1Index) {
        tmpIndexFeatures = c(indexCASMI, index) # Index of selected features + the new feature

        valueKappaStar = Kappas(data, tmpIndexFeatures)$kappa.star

        if (valueKappaStar > maxKappaStar) {
          maxKappaStar = valueKappaStar
          maxIndex = index
        }
      }

      if (maxKappaStar > maxmaxKappaStar + 10 ^ -14) { # +10^-14 is to solve the problem of precision
        indexCASMI = c(indexCASMI, maxIndex)
        kappaStarCASMI = c(kappaStarCASMI, maxKappaStar)

        if(intermediate.steps) {
          intermediateCount = intermediateCount + 1
          cat("Selected Feature ", intermediateCount, " - selected column (idx.):", maxIndex, ", cml.kappa*:", maxKappaStar, ", elapsed (sec.):", (proc.time() - start_time)["elapsed"], "\n")
        }
      }

      if (maxKappaStar >= kappa.star.cap) {
        warning(
          "The feature selection process was automatically terminated as cml.kappa* of the currently selected features reached the 'kappa.star.cap' value."
        )
      }

      if (length(indexCASMI) >= floor(feature.num.cap)) {
        warning(
          "The feature selection process was automatically terminated as the number of selected features reached the 'feature.num.cap' value."
        )
      }
    }

    if(intermediate.steps) {
      cat("---End of intermediate process.---\n")
      cat("Generating summary...\n\n")
    }

    list = generateResults(data, indexCASMI, kappaStarCASMI, alpha)

    return(list)
  }
}