R/utils.R

Defines functions saferSVD wilson_hilferty getChiSqStatHelper getChiSqStat getDesignMatrix getLDscores checkPriorAndMisClassCost getMode getNumFlag getDownSampleInd nonConstInd getDataInShape missingFix

Documented in checkPriorAndMisClassCost getChiSqStat getDataInShape getMode missingFix

#' Impute Missing Values and Add Missing Flags to a Data Frame
#'
#' This function imputes missing values in a data frame based on specified
#' methods for numerical and categorical variables. Additionally, it can add
#' flag columns to indicate missing values. For numerical variables, missing
#' values can be imputed using the mean or median. For categorical variables,
#' missing values can be imputed using the mode or a new level. This function
#' also removes constant columns (all NAs or all observed but the same value).
#'
#' @param data A data frame containing the data to be processed. Missing values
#'   (`NA`) will be imputed based on the methods provided in `missingMethod`.
#' @param missingMethod A character vector of length 2 specifying the methods
#'   for imputing missing values. The first element specifies the method for
#'   numerical variables (`"mean"`, `"median"`, `"meanFlag"`, or
#'   `"medianFlag"`), and the second element specifies the method for
#'   categorical variables (`"mode"`, `"modeFlag"`, or `"newLevel"`). If
#'   `"Flag"` is included, a flag column will be added for the corresponding
#'   variable type.
#'
#' @return A list with two elements: \item{data}{The original data frame with
#'   missing values imputed, and flag columns added if applicable.} \item{ref}{A
#'   reference row containing the imputed values and flag levels, which can be
#'   used for future predictions or reference.}
#'
#' @export
#'
#' @examples
#' dat <- data.frame(
#'   X1 = rep(NA, 5),
#'   X2 = factor(rep(NA, 5), levels = LETTERS[1:3]),
#'   X3 = 1:5,
#'   X4 = LETTERS[1:5],
#'   X5 = c(NA, 2, 3, 10, NA),
#'   X6 = factor(c("A", NA, NA, "B", "B"), levels = LETTERS[1:3])
#' )
#' missingFix(dat)
missingFix <- function(data, missingMethod = c("medianFlag", "newLevel")){

  # Preprocessing: Argument #
  if (length(missingMethod) != 2) stop("missingMethod must be a vector of length 2")
  numMethod <- match.arg(missingMethod[1], c("mean", "median", "meanFlag", "medianFlag"))
  catMethod <- match.arg(missingMethod[2], c("mode", "modeFlag", "newLevel"))

  # remove constant columns
  data <- data[, nonConstInd(data = data), drop = FALSE]
  if(any(dim(data) == 0)) return(list(data = data, ref = data)) # all columns are constant

  # Add Missing Columns #
  numOrNot <- getNumFlag(data); NAorNot <- sapply(data, anyNA)

  createFlagColumns <- function(addFlag, targetBool){
    if(addFlag & sum(targetBool) > 0){
      NAcols <- which(targetBool & NAorNot)
      if(length(NAcols) > 0){
        flagCols <- is.na(data[, NAcols, drop = FALSE]) + 0
        colnames(flagCols) <- paste(colnames(flagCols), "FLAG", sep = "_")
        data <- cbind(data, flagCols)
      }
    }
    return(data)
  }
  data <- createFlagColumns(addFlag = grepl("Flag", numMethod), targetBool = numOrNot)
  data <- createFlagColumns(addFlag = grepl("Flag", catMethod), targetBool = !numOrNot)

  # Imputation Starts #
  dataNRef <- rbind(data, NA) # add ref to the last row of data, initialize using NAs
  dataNRef[nrow(dataNRef), seq_len(ncol(dataNRef) - length(numOrNot)) + length(numOrNot)] <- 1 # flags

  for(i in seq_along(numOrNot)){
    #> This for loop is inevitable since we have to add factor levels to ref,
    #> and add the new level to the original data
    if(numOrNot[i]){ # numerical variables
      targetValue <- do.call(ifelse(grepl("mean", numMethod), "mean", "median"),
                             list(dataNRef[,i], na.rm = TRUE))
    } else{ # categorical vars

      if(is.factor(dataNRef[,i])){
        currentLevels <- levels(droplevels(dataNRef[,i])) # drop extra levels
      }else{
        currentLevels <- sort(unique(stats::na.omit(dataNRef[,i])))
      }

      if(grepl("newLevel", catMethod) & NAorNot[i]){
        currentLevels <- c(currentLevels, "new0_0Level")
        targetValue <- "new0_0Level"
      } else targetValue <- getMode(dataNRef[,i])

      dataNRef[, i] <- factor(dataNRef[,i], levels = currentLevels)
    }
    dataNRef[is.na(dataNRef[,i]), i] <- targetValue
  }

  # remove constant columns after imputation
  dataNRef <- dataNRef[, nonConstInd(data = dataNRef), drop = FALSE]
  missingRef <- dataNRef[nrow(dataNRef),,drop = FALSE]
  rownames(missingRef) <- NULL

  return(list(data = dataNRef[-nrow(dataNRef),,drop = FALSE],
              ref = missingRef))
}


#' Align Data with a Missing Reference
#'
#' This function aligns a given dataset (`data`) with a reference dataset
#' (`missingReference`). It ensures that the structure, column names, and factor
#' levels in `data` match the structure of `missingReference`. If necessary,
#' missing columns are initialized with `NA`, and factor levels are adjusted to
#' match the reference. Additionally, it handles the imputation of missing
#' values based on the reference and manages flag variables for categorical or
#' numerical columns.
#'
#' @param data A data frame to be aligned and adjusted according to the
#'   `missingReference`.
#' @param missingReference A reference data frame that provides the structure
#'   (column names, factor levels, and missing value reference) for aligning
#'   `data`.
#'
#' @return A data frame where the structure, column names, and factor levels of
#'   `data` are aligned with `missingReference`. Missing values in `data` are
#'   imputed based on the first row of the `missingReference`, and flag
#'   variables are updated accordingly.
#'
#' @export
#'
#' @examples
#' data <- data.frame(
#'   X1_FLAG = c(0, 0, 0),
#'   X1 = factor(c(NA, "C", "B"), levels = LETTERS[2:3]),
#'   X2_FLAG = c(NA, 0, 1),
#'   X2 = c(2, NA, 3)
#' )
#'
#' missingReference <- data.frame(
#'   X1_FLAG = 1,
#'   X1 = factor("A", levels = LETTERS[1:2]),
#'   X2 = 1,
#'   X2_FLAG = 1
#' )
#'
#' getDataInShape(data, missingReference)
getDataInShape <- function(data, missingReference){
  cname <- colnames(missingReference)
  refIdx <- match(cname, colnames(data))
  if(anyNA(refIdx)){ # initialize with NA if missing columns
    data[, cname[which(is.na(refIdx))]] <- NA
    refIdx <- match(cname, colnames(data))
  }

  for(currentIdx in seq_len(ncol(missingReference))){ # Main program starts
    #> The tricky part is the iterator is based on the missingReference, NOT the data.
    dataIdx <- refIdx[currentIdx]; currentVarName <- cname[currentIdx]

    ### New-level Fix for Categorical Variable ###
    numOrNot <- getNumFlag(missingReference[, currentIdx])
    if(!numOrNot & !identical(levels(data[, dataIdx]), levels(missingReference[, currentIdx])))
      data[, dataIdx] <- factor(data[, dataIdx], levels = levels(missingReference[, currentIdx]))

    missingOrNot <- is.na(data[, dataIdx])
    if(!any(missingOrNot)) next

    ## Scenario 1: It has a related flag variable in the data ##
    #> Only modify those flags where the original variable is missing
    #> Keep other parts still, since there could already be imputed values
    #> in the original variable that have been taken care of
    currentFlagIdx <- which(cname == paste(currentVarName,"FLAG",sep = "_"))
    if(length(currentFlagIdx) == 1) data[which(missingOrNot), refIdx[currentFlagIdx]] <- 1

    ## Scenario 2: It is a flag and it has an original variable in (or not in) the data ##
    #> Only impute those NAs in the flag, but keep the values that are already in the flag
    if(grepl("_FLAG$", currentVarName)){
      orginalVarIdx <- which(cname == sub("_FLAG$", "", currentVarName))
      if(length(orginalVarIdx) == 1){
        data[which(missingOrNot), dataIdx] <- is.na(data[which(missingOrNot), refIdx[orginalVarIdx]]) + 0
      } else data[which(missingOrNot), dataIdx] <- 1 # The original data is NOT found
      next
    }

    ### For numerical & categorical variables ###
    data[which(missingOrNot), dataIdx] <- missingReference[1, currentIdx]
  }

  return(data[, refIdx, drop = FALSE])
}


#' Identify Non-Constant Columns in a Data Frame
#'
#' @noRd
#'
#' @param data A data frame in which columns will be checked for constant
#'   values. Columns can be of any type (numeric, integer, logical, or factor).
#' @param tol A numeric tolerance value (default is `1e-8`) that applies to
#'   numerical columns.
#' @param na.rm A logical value. If `FALSE` (default), missing values are
#'   retained during the check.
#'
#' @return An integer vector containing the indices of the non-constant columns
#'   in the data frame. If all columns are constant, an empty vector is
#'   returned.
nonConstInd <- function(data, tol = 1e-8, na.rm = FALSE){

  if (!is.data.frame(data)) stop("data must be a data.frame")

  nonConstIndHelper <- function(x, tol){
    if(getNumFlag(x)) x <- round(x, digits = -log(tol, base = 10))
    if(na.rm) x <- stats::na.omit(x)
    return(length(unique(x)) > 1)
  }

  idxNotConst <- which(sapply(data, function(x) nonConstIndHelper(x = x, tol = tol)))

  return(idxNotConst)
}


#' Helper Function to Generate Training Set Indices Through Downsampling
#'
#' This function selects the indices for the training set based on the class
#' vector `response`. It allows for optional downsampling to balance the class
#' distribution by limiting the number of samples per class.
#'
#' @noRd
#'
#' @param response A factor vector representing the class labels.
#' @param downSampling A logical value indicating whether downsampling should be
#'   applied. If `TRUE`, downsampling is performed to limit the number of
#'   samples per class based on `kSample`. Note that this may not result in
#'   equal class frequencies, as `kSample` defines an upper limit for each
#'   class, not a lower limit.
#' @param kSample An integer specifying the maximum number of samples to be
#'   selected per class. If `NULL`, the number of samples is limited to the size
#'   of the smallest class.
#'
#' @return An integer vector of indices representing the selected samples from
#'   the original `response` vector.
getDownSampleInd <- function(response,
                             downSampling = FALSE,
                             kSample = NULL){
  if (!is.logical(downSampling) || length(downSampling) != 1) stop("downSampling must be either TRUE or FALSE")

  idxSelected <- which(!is.na(response)); response <- droplevels(response[idxSelected])

  if(downSampling){
    idxForEachClass <- split(seq_along(response), response)
    resTable <- sapply(idxForEachClass, length)
    if(is.null(kSample)) kSample <- min(resTable)
    nForEachClass <- pmin(resTable, kSample)
    finalIdx <- do.call(c, lapply(seq_along(resTable), function(i) idxForEachClass[[i]][sample(resTable[i], nForEachClass[i])]))
    idxSelected <- idxSelected[finalIdx]
  }

  return(idxSelected)
}


#' Identify Numeric, Integer, or Logical Columns in a Data Frame
#'
#' This function checks whether the columns in a data frame (or a vector) are of
#' type numeric, integer, or logical. It can return a logical vector indicating
#' whether each column matches these types, or, if `index = TRUE`, it returns
#' the indices of the matching columns.
#'
#' @noRd
#'
#' @param data A data frame or a vector. The function will check the data types
#'   of the columns (if `data` is a data frame) or the type of the vector.
#' @param index A logical value. If `FALSE` (default), the function returns a
#'   logical vector indicating which columns are numeric, integer, or logical.
#'   If `TRUE`, it returns the indices of these columns.
#' @return If `index = FALSE` (default), the function returns a logical vector
#'   with one element for each column (or the vector itself), where `TRUE`
#'   indicates that the column is of type numeric, integer, or logical, and
#'   `FALSE` indicates it is not. If `index = TRUE`, the function returns an
#'   integer vector containing the indices of the columns that are numeric,
#'   integer, or logical.
getNumFlag <- function(data, index = FALSE){
  #> This function should be replaced (or deleted) with caution,
  #> since the function in sapply framework output wrong classes.
  if(is.null(dim(data))) return(inherits(data, c("numeric", "integer", "logical")))

  if (!is.data.frame(data)) stop("data must be a data.frame")

  numOrNot <- sapply(data, function(col) inherits(col, c("numeric", "integer", "logical")))

  if(index){numOrNot <- which(numOrNot)}

  return(numOrNot)
}


#' Calculate the Mode of a Factor Variable with Optional Priors
#'
#' This function calculates the mode of a given factor or vector that can be
#' coerced into a factor. You can optionally provide prior weights for each
#' level of the factor.
#'
#' @param v A factor or vector that can be coerced into a factor. The mode will
#'   be calculated from the levels of this factor.
#' @param prior A numeric vector of prior weights for each level of the factor.
#'   If not provided, all levels will be given equal weight.
#'
#' @return The mode of the factor `v` as a character string. If all values are
#'   `NA`, the function returns `NA`.
#'
#' @export
#'
#' @examples
#' # Example 1: Mode without priors
#' v <- factor(c("apple", "banana", "apple", "orange", NA))
#' getMode(v)
#'
#' # Example 2: Mode with priors
#' v <- factor(c("apple", "banana", "apple", "orange", NA))
#' prior <- c(apple = 0.5, banana = 1.5, orange = 1)
#' getMode(v, prior)
getMode <- function(v, prior){
  #> NA will be ignored
  v <- as.factor(v)
  if(missing(prior)){
    prior = rep(1, nlevels(v)) # equal prior
  }else{
    if (is.null(names(prior))){
      stopifnot(length(prior) == nlevels(v))
      names(prior) <- levels(v)
    } else prior <- prior[match(levels(v), names(prior))]
  }

  summary_table <- table(v) * prior
  if(length(summary_table) == 0 | sum(summary_table) == 0) return(NA) # all NA
  return(names(which.max(summary_table)))
}


#' Check and Normalize Prior Probabilities and Misclassification Costs
#'
#' This function verifies and normalizes the provided prior probabilities and
#' misclassification cost matrix for a given response variable. It ensures that
#' the lengths of the prior and the dimensions of the misclassification cost
#' matrix match the number of levels in the response variable. If `prior` or
#' `misClassCost` are not provided, default values are used: the prior is set to
#' the observed frequencies of the response, and the misclassification cost
#' matrix is set to 1 for all misclassifications and 0 for correct
#' classifications.
#'
#' @param prior A numeric vector representing the prior probabilities for each
#'   class in the response variable. If `NULL`, the observed frequencies of the
#'   response are used as the default prior.
#' @param misClassCost A square matrix representing the misclassification costs
#'   for each pair of classes in the response variable. If `NULL`, a default
#'   misclassification matrix is created where all misclassifications have a
#'   cost of 1 and correct classifications have a cost of 0.
#' @param response A factor representing the response variable with multiple
#'   classes.
#'
#' @return A list containing: \item{prior}{A normalized vector of prior
#'   probabilities for each class.} \item{misClassCost}{A square matrix
#'   representing the misclassification costs, with rows and columns labeled by
#'   the levels of the response variable.}
#'
#' @export
#'
#' @examples
#' # Example 1: Using default prior and misClassCost
#' response <- factor(c('A', 'B', 'A', 'B', 'C', 'A'))
#' checkPriorAndMisClassCost(NULL, NULL, response)
#'
#' # Example 2: Providing custom prior and misClassCost
#' prior <- c(A = 1, B = 1, C = 2)
#' misClassCost <- matrix(c(0, 2, 10,
#'                          1, 0, 10,
#'                          1, 2, 0), nrow = 3, byrow = TRUE)
#' checkPriorAndMisClassCost(prior, misClassCost, response)
checkPriorAndMisClassCost <- function(prior, misClassCost, response){

  matchWrapper <- function(nameObj, nameTarget){
    if (length(nameObj) != length(nameTarget))
      stop("Length mismatch: expected ", length(nameTarget), " names but got ", length(nameObj))
    targetIndex <- match(nameTarget, nameObj)
    if (anyNA(targetIndex))
      stop("Names in '", paste(nameTarget[is.na(targetIndex)], collapse = ", "), "' do not match with response levels.")
    return(targetIndex)
  }

  freqObs <- table(response, dnn = NULL) / length(response) # Default: Estimated Prior

  if (is.null(prior)) { # prior fix
    prior <- as.vector(freqObs)
  } else {
    if (!is.numeric(prior) || any(prior < 0)) stop("prior must be non-negative numbers")
    if (length(prior) != nlevels(response))
      stop("The length of 'prior' must be equal to the number of response levels (", nlevels(response), ").")
    if (!is.null(names(prior))) prior <- prior[matchWrapper(names(prior), levels(response))]
    prior <- as.vector(prior) / sum(prior)
  }

  if (is.null(misClassCost)){ # fix misClassCost fix
    misClassCost <- matrix(1, nrow = nlevels(response), ncol = nlevels(response)) - diag(1, nrow = nlevels(response))
  } else{
    if (!is.matrix(misClassCost) || !getNumFlag(as.vector(misClassCost)))
      stop("'misClassCost' must be a numeric matrix.")

    if(length(unique(c(dim(misClassCost), nlevels(response)))) != 1)
      stop("misclassification costs matrix has wrong dimension")

    if(!all.equal(colnames(misClassCost), rownames(misClassCost)))
      stop("misClassCost: colnames should be the same as rownames")

    if (!is.null(colnames(misClassCost))){
      misClassCost <- misClassCost[matchWrapper(colnames(misClassCost), levels(response)),
                                   matchWrapper(colnames(misClassCost), levels(response))]
    }
  }

  names(prior) <- colnames(misClassCost) <- rownames(misClassCost) <- levels(response)
  return(list(prior = prior, misClassCost = misClassCost))
}


#' Compute Linear Discriminant Scores
#'
#' @noRd
#'
#' @param modelULDA A fitted LDA model object containing the scaling matrix and
#'   the reference structure for missing data.
#' @param data A data frame containing the predictor variables for which to
#'   compute the linear discriminant scores.
#' @param nScores An integer specifying the number of discriminant scores to
#'   compute. If `-1` (default), all scores are computed.
#' @return A matrix of linear discriminant scores, where rows correspond to
#'   observations and columns correspond to the computed discriminant scores.
#'   If `nScores > 0`, only the specified number of scores is returned; otherwise,
#'   all scores are computed and returned.
getLDscores <- function(modelULDA, data, nScores = -1){
  data <- getDataInShape(data = data, missingReference = modelULDA$misReference)
  modelX <- getDesignMatrix(modelULDA = modelULDA, data = data, scale = TRUE)
  if(nScores > 0){
    nScores <- min(nScores, ncol(modelULDA$scaling))
    modelULDA$scaling <- modelULDA$scaling[, seq_len(nScores), drop = FALSE]
  }
  LDscores <- modelX %*% modelULDA$scaling
  return(LDscores)
}


#' Generate the Design Matrix for LDA Model
#'
#' @noRd
#'
#' @param modelULDA A fitted LDA model object containing the terms, variable
#'   indices, variable centers, and scaling factors.
#' @param data A data frame containing the predictor variables that are used to
#'   create the design matrix.
#' @param scale A logical value indicating whether to scale the design matrix
#'   based on the mean and standard deviation of the variables (default is
#'   `FALSE`).
#' @return A design matrix where each row corresponds to an observation and
#'   each column to a predictor variable. If `scale = TRUE`, the variables are
#'   centered and scaled based on the means and standard deviations provided in
#'   the LDA model object.
getDesignMatrix <- function(modelULDA, data, scale = FALSE){
  Terms <- stats::delete.response(modelULDA$terms)
  modelX <- stats::model.matrix(Terms, data = data, xlev = modelULDA$xlevels)
  if(scale){ # Reserved for internal usage from getLDscores
    modelX <- sweep(modelX[, modelULDA$varIdx, drop = FALSE], 2, modelULDA$varCenter, "-")
    modelX <- sweep(modelX, 2, modelULDA$varSD, "/")
  }
  return(modelX)
}


#' Compute Chi-Squared Statistics for Variables
#'
#' This function calculates the chi-squared statistic for each column of `datX`
#' against the response variable `response`. It supports both numerical and
#' categorical predictors in `datX`. For numerical variables, it automatically
#' discretizes them into factor levels based on standard deviations and mean,
#' using different splitting criteria depending on the sample size.
#'
#' @param datX A matrix or data frame containing predictor variables. It can
#'   consist of both numerical and categorical variables.
#' @param response A factor representing the class labels. It must have at least
#'   two levels for the chi-squared test to be applicable.
#'
#' @return A vector of chi-squared statistics, one for each predictor variable
#'   in `datX`. For numerical variables, the chi-squared statistic is computed
#'   after binning the variable.
#'
#' @details For each variable in `datX`, the function first checks if the
#'   variable is numerical. If so, it is discretized into factor levels using
#'   either two or three split points, depending on the sample size and the
#'   number of levels in the `response`. Missing values are handled by assigning
#'   them to a new factor level.
#'
#'   The chi-squared statistic is then computed between each predictor and the
#'   `response`. If the chi-squared test has more than one degree of freedom,
#'   the Wilson-Hilferty transformation is applied to adjust the statistic to a
#'   1-degree-of-freedom chi-squared distribution.
#'
#' @export
#'
#' @references Loh, W. Y. (2009). Improving the precision of classification
#' trees. \emph{The Annals of Applied Statistics}, 1710–1737. JSTOR.
#'
#' @examples
#' datX <- data.frame(var1 = rnorm(100), var2 = factor(sample(letters[1:3], 100, replace = TRUE)))
#' y <- factor(sample(c("A", "B"), 100, replace = TRUE))
#' getChiSqStat(datX, y)
getChiSqStat <- function(datX, response){
  y <- droplevels(as.factor(response))
  if(is.null(dim(datX))) return(getChiSqStatHelper(datX, y))

  return(apply(datX, 2, function(x) getChiSqStatHelper(x, y)))
}

getChiSqStatHelper <- function(x,y){
  if(getNumFlag(x)){ # numerical variable: first change to factor
    m = mean(x,na.rm = T); s = stats::sd(x,na.rm = T)
    if(sum(!is.na(x)) >= 30 * nlevels(y)){
      splitNow = c(m - s *sqrt(3)/2, m, m + s *sqrt(3)/2)
    }else splitNow = c(m - s *sqrt(3)/3, m + s *sqrt(3)/3)

    if(length(unique(splitNow)) == 1) return(0) # No possible split
    x = cut(x, breaks = c(-Inf, splitNow, Inf), right = TRUE)
  }

  if(anyNA(x)){
    levels(x) = c(levels(x), 'newLevel')
    x[is.na(x)] <- 'newLevel'
  }
  if(length(unique(x)) == 1) return(0) # No possible split

  fit <- suppressWarnings(stats::chisq.test(x, y))

  #> Change to 1-df wilson_hilferty chi-squared stat unless
  #> the original df = 1 and p-value is larger than 10^(-16)
  ans = unname(ifelse(fit$parameter > 1L, ifelse(fit$p.value > 10^(-16),
                                                 stats::qchisq(1-fit$p.value, df = 1),
                                                 wilson_hilferty(fit$statistic,fit$parameter)), fit$statistic))
  return(ans)
}


wilson_hilferty = function(chi, df){ # change df = K to df = 1
  ans = max(0, (7/9 + sqrt(df) * ( (chi / df) ^ (1/3) - 1 + 2 / (9 * df) ))^3)
  return(ans)
}


# For users that does not have the latest RcppEigen module ----------------


saferSVD <- function(x, uFlag = TRUE) {
  fitSVD <- tryCatch({
    if (uFlag) {
      svdEigen(x)
    } else svdEigen(x, uFlag = FALSE)
  }, error = function(e) {NULL})

  if (!is.null(fitSVD) && !anyNA(fitSVD$v)) return(fitSVD)

  # if svdEigen is NULL / NA in the fitSVD -> fallback to base::svd
  fitSVD <- tryCatch({
    if (uFlag) {
      svd(x)
    } else svd(x, nu = 0L)
  }, error = function(e) {stop("Both svd modules failed, check your input data.")})

  return(fitSVD)
}

Try the folda package in your browser

Any scripts or data that you put into this service are public.

folda documentation built on Oct. 30, 2024, 9:14 a.m.