R/input-validation.R

Defines functions check_if_solver_is_available input_validation_matching legal_number_of_clusters argument_exists self_validation validate_data_matrix validate_input input_validation_anticlustering

#' Validating the arguments passed to `anticlustering`
#' 
#' @return NULL
#'
#' @noRd
input_validation_anticlustering <- function(x, K, objective, method,
                                          preclustering, categories,
                                          repetitions, standardize = FALSE, cannot_link = NULL,
                                          must_link = NULL) {
  
  ## Validate feature input
  validate_data_matrix(x)
  x <- as.matrix(x)
  N <- nrow(x)
  
  if (argument_exists(must_link)) {
    validate_input(must_link, "must_link", not_function = TRUE, len = N)
    must_link <- as.matrix(must_link)

    validate_input(objective, "objective", input_set = c("diversity"), not_na = TRUE, len = 1) 
    validate_input(method, "method", input_set = c("local-maximum", "exchange", "2PML"), not_na = TRUE, len = 1) 
    
    if (ncol(must_link) != 1) {
      stop("Argument must_link must be a vector.")
    }
    if (objective %in% c("dispersion", "kplus", "variance")) {
      stop("Currently, must-link constraints only work with objective = 'diversity'.")
    }
    if (argument_exists(categories)) {
      stop("\nCombining the `categories` argument together with must-link constraints \n",
           "is currently not supported; use the categorical variables as part of the first argument\n",
           "`x` instead (see the vignette on categorical variables).")
    }
    if (isTRUE(preclustering)) {
      stop("It is not possible to combine preclustering with must-link constraints.")
    }
    if (argument_exists(cannot_link)) {
      stop("Currently, it is not possible to use both cannot-link and must-link constraints.")
    }
  }
  
  if (argument_exists(cannot_link)) {
    cannot_link <- as.matrix(cannot_link)
    validate_input(cannot_link, "cannot_link", 
                   objmode = "numeric", must_be_integer = TRUE, 
                   greater_than = 0, smaller_than = NROW(x)+1, 
                   not_na = TRUE, not_function = TRUE)
    if (ncol(cannot_link) != 2) {
      stop("Argument cannot_link must have 2 columns.")
    }
    if (objective == "dispersion") {
      stop("objective = dispersion does not work with cannot_link constraints.")
    }
    if (argument_exists(categories)) {
      stop("\nCombining the `categories` argument together with cannot-link constraints \n",
           "is currently not supported; use the categorical variables as part of the first argument\n",
           "`x` instead (see the vignette on categorical variables).")
    }
    if (isTRUE(preclustering)) {
      stop("It is not possible to combine preclustering with cannot-link constraints.")
    }
  }
  
  validate_input(standardize, "standardize", objmode = "logical", len = 1,
                 input_set = c(TRUE, FALSE), not_na = TRUE, not_function = TRUE)
  
  if (argument_exists(repetitions)) {
    validate_input(repetitions, "repetitions", objmode = "numeric", len = 1, 
                   greater_than = 0, must_be_integer = TRUE, not_na = TRUE,
                   not_function = TRUE)
  }
  
  ## Merge categories variable so that `length` can be applied:
  categories <- merge_into_one_variable(categories)
  
  validate_input(preclustering, "preclustering", len = 1,
                 input_set = c(TRUE, FALSE), not_na = TRUE, not_function = TRUE)

  validate_input(
    method, "method", len = 1,
    input_set = c("ilp", "exchange", "heuristic", "centroid", "local-maximum", "brusco", "2PML"), 
    not_na = TRUE, not_function = TRUE
  )
  if (method == "2PML") {
    if (!argument_exists(must_link)) {
      stop("Method 2PML only works with must-link constraints.")
    }
  }
  
  if (method == "brusco") {
    if (argument_exists(categories)) {
      stop("It is not possible to use the algorithm by Brusco et al. with categorical restrictions.")
    }
    if (preclustering == TRUE) {
      stop("It is not possible to use the algorithm by Brusco et al. with preclustering restrictions.")
    }
  }
  
                 
  # Allow that K is an initial assignment of elements to clusters
  validate_input(K, "K", objmode = "numeric", must_be_integer = TRUE, not_na = TRUE, not_function = TRUE)
  if (length(K) == 1) {
    validate_input(K, "K", greater_than = 1, smaller_than = N)
  } else {
    if (length(K) != N && sum(K) != N) {
      stop("Argument `K` is misspecified.")
    }
    if (method == "ilp") {
      stop("Pass the number of groups via argument `K` when using method = 'ilp'.")
    }
    if (preclustering == TRUE) {
      stop("Cannot only use preclustering when argument `K` is the number of groups.")
    }
    if (method == "ilp") {
      stop("The argument `K` should indicate the number of groups when using `method = 'ilp'`.")
    }
    if (argument_exists(categories)) {
      if (length(categories) != length(K) && sum(K) != N) {
        stop("Length of arguments `categories` and `K` differ.")
      }
    }
  }

  if (argument_exists(categories) && length(categories) != N) {
    stop("The length of the `categories` argument is not equal to the the number of input elements.")
  }

  if (length(K) == 1 && N %% K != 0) {
    if (method == "ilp") {
      stop("K must be a divider of the number of elements when using the ILP method. ",
           "(Try out method = 'exchange'.)")
    }
  }



  if (method == "ilp") {
    check_if_solver_is_available()
    if (!objective %in% c("distance", "diversity", "dispersion")) {
      stop("The ILP method is only available for the diversity and dispersion objectives.")
    }
    if (argument_exists(repetitions)) {
      stop("Do not use argument `repetitions` when using the ILP method.")
    }
  }

  if (!inherits(objective, "function")) {
    validate_input(objective, "objective", input_set = c(
      "distance", 
      "diversity", 
      "average-diversity", 
      "dispersion",
      "variance", 
      "kplus"
      ), 
      len = 1, not_na = TRUE)
    if (objective %in% c("variance", "kplus") && method == "ilp") {
      stop("You cannot use this function to optimally maximize the kmeans or kplus criterion. Use optimal_anticlustering().")
    }
    if (objective %in% c("variance", "kplus") && is_distance_matrix(x)) {
        stop("You cannot use a distance matrix with the objective 'variance' or 'kplus'.")
    }
  }

  if (argument_exists(categories) && method == "ilp") {
    stop("The ILP method cannot incorporate categorical restrictions.")
  }
  return(invisible(NULL))
}


#' A function for input validation
#'
#' @param obj The object that undergoes validation
#' @param argument_name A string indicating the name of the object
#'   This name is used when an error is thrown so the user
#'   is informed on the cause of the error.
#' @param len Optional numeric vector for objects having a length
#'   (mostly for vectors). Tests via `NROW`, so can also test matrix-like 
#'   objects.
#' @param greater_than Optional scalar indicating if numeric input has
#'   to be greater than a specified number.
#' @param must_be_integer Optional logical vector indicating if numeric
#'   input has to be integer.
#' @param groupsize Optional argument how many groups a grouping variable
#'   consist of.
#' @param input_set Optional argument specifying a set of values an
#'   argument can take.
#' @param objmode The required mode of \code{obj}
#' @param not_na Boolean to indicate whether NA input is forbidden
#'   (TRUE means that NA is not allowed)
#' @param smaller_than A value for numeric input that must not be exceeded
#'
#' @return NULL
#'
#' @noRd

validate_input <- function(obj, argument_name, len = NULL, greater_than = NULL,
                           must_be_integer = FALSE, groupsize = NULL, input_set = NULL, 
                           objmode = NULL, not_na = FALSE, smaller_than = NULL, 
                           not_function = NULL) {
                           
                           
  self_validation(argument_name, len, greater_than,
                  must_be_integer, groupsize, input_set,
                  objmode, not_na, not_function)

  argument_name <- paste("Argument", argument_name)
  
  if (argument_exists(not_function) && not_function == TRUE) {
    if (inherits(obj, "function")) {
      stop(argument_name, "must not be a function")
    }
  }
   
  
  ## - Check length of input
  if (argument_exists(len)) {
    if (NROW(obj) != len) {
      stop(argument_name, " must have length ", len)
    }
  }

  ## - Check mode of input
  if (argument_exists(objmode)) {
    if (mode(obj) != objmode) {
      stop(argument_name, " must be ", objmode, ", but is ", mode(obj))
    }
  }
  
  ## - Check if input has to be greater than some value
  if (argument_exists(greater_than)) {
    if (any(obj <= greater_than)) {
      stop(argument_name, " must be greater than ", greater_than)
    }
  }
  
  ## - Check if input has to be greater than some value
  if (argument_exists(smaller_than)) {
    if (any(obj >= smaller_than)) {
      stop(argument_name, " must be smaller than ", smaller_than)
    }
  }
  
  if (not_na == TRUE) {
    if (sum(is.na(obj) >= 1)) {
      stop(argument_name, " must not be NA, but contains NA")
    }
  }
  
  ## - Check if input has to be integer
  if (must_be_integer == TRUE && any(obj %% 1 != 0)) {
    stop(argument_name, " must be integer")
  }
  ## - Check if correct number of groups is provided
  if (argument_exists(groupsize)) {
    if (length(table(obj)[table(obj) != 0]) != groupsize) {
      stop(argument_name, " must consist of exactly ", groupsize,
           " groups with more than 0 observations.")
    }
  }

  ## - Check if argument matches a predefined input set
  if (argument_exists(input_set)) {
    if (!obj %in% input_set) {
      stop(argument_name, " can either be set to '",
           paste(input_set, collapse = "' or '"), "'")
    }
  }

  return(invisible(NULL))
}

## Validate feature input
validate_data_matrix <- function(x) {
  x <- as.matrix(x)
  if (mode(x) != "numeric") {
    stop("Your data (the first argument `x`) should only contain numeric entries, but this is not the case.")
  }
  if (any(is.na(x))) {
    stop("Your data contains `NA`. I cannot proceed because ",
         "I cannot estimate similarity for data that has missing values. Sorry!")
  }
}

## Validate input for the `validate_input` function (these errors are
## not for users, but only for developers)
self_validation <- function(argument_name, len, greater_than,
                            must_be_integer, groupsize,
                            input_set, objmode, not_na, not_function) {

  if (argument_exists(not_function)) {
    stopifnot(not_function %in% c(TRUE, FALSE))
  }
                            
  if (argument_exists(len)) {
    stopifnot(class(len) %in% c("numeric", "integer"))
    stopifnot(length(len) == 1)
    stopifnot(len >= 0)
    stopifnot(len %% 1 == 0)
  }

  if (argument_exists(greater_than)) {
    stopifnot(length(greater_than) == 1)
    stopifnot(class(greater_than) %in% c("numeric", "integer"))
  }

  stopifnot(length(must_be_integer) == 1)
  stopifnot(must_be_integer %in% c(TRUE, FALSE))
  stopifnot(length(not_na) == 1)
  stopifnot(not_na %in% c(TRUE, FALSE))

  if (argument_exists(groupsize)) {
    stopifnot(mode(groupsize) == "numeric")
    stopifnot(length(groupsize) == 1)
  }

  if ((argument_exists(input_set) && !argument_exists(len)) ||
      (argument_exists(input_set) && len != 1))  {
    stop("If an input set is passed, argument len must be 1 ",
         "(this message should not be seen by users of the package).")
  }

  if (argument_exists(objmode)) {
    stopifnot(mode(objmode) == "character")
    stopifnot(length(objmode) == 1)
  }

  return(invisible(NULL))
}

argument_exists <- function(arg) {
  !is.null(arg)
}


# Test that the number of features is a multiplier of unique
# anticlusters. I think this function is only used in test cases.
legal_number_of_clusters <- function(features, clusters) {
  ## 1. correct number of clusters assignments?
  if (length(clusters) != nrow(features))
    stop("The number of cluster assignments and the number of features have to match.")
  n_anticlusters <- length(unique(clusters))
  ## 2. More than 1 disctinct cluster?
  if (n_anticlusters <= 1)
    stop("There have to be at least two different anticlusters")
  ## 3. Do all clusters occur equally often?
  if (!all(table(clusters) == table(clusters)[1]))
    stop("Each clusters must occur equally often")
  ## 4. Probably redundant to 3:
  if (nrow(features) %% n_anticlusters != 0)
    stop("The number of elements is not a multiplier of the number of anticlusters")
  invisible(NULL)
}


input_validation_matching <- function(
  x, p, 
  match_between, 
  match_within, 
  match_extreme_first, 
  target_group
) {
  validate_data_matrix(x)
  N <- nrow(as.matrix(x))
  validate_input(
    p, "p", 
    len = 1, 
    must_be_integer = TRUE, 
    not_na = TRUE
  )
  if (argument_exists(match_between)) {
    validate_input(
      match_between, 
      "match_between", 
      len = N
    )
  }
  if (argument_exists(match_within)) {
    validate_input(
      match_within, 
      "match_within", 
      len = N
    )
  }
  validate_input(
    match_extreme_first, 
    "match_extreme_first", 
    len = 1, 
    input_set = c(TRUE, FALSE), 
    not_na = TRUE
  )
  if (argument_exists(target_group)) {
    validate_input(
      target_group, 
      "target_group", 
      len = 1, 
      input_set = c("smallest", "diverse", "none"), 
      not_na = TRUE
    )
  }
}

# Check if a solver package can be used
check_if_solver_is_available <- function() {
  glpk_available <- requireNamespace("Rglpk", quietly = TRUE)
  symphony_available <- requireNamespace("Rsymphony", quietly = TRUE)
  lpSolve_available <- requireNamespace("lpSolve", quietly = TRUE)
  gurobi_available <- requireNamespace("gurobi", quietly = TRUE)
  no_solver_available <- !glpk_available && !symphony_available && !lpSolve_available && !gurobi_available
  
  if (no_solver_available) {
    stop("\n\nAn exact method was requested, but no ILP solver is ",
         "available. For example, you could install the GNU linear programming kit: \n\n",
         "- On windows, visit ",
         "http://gnuwin32.sourceforge.net/packages/glpk.htm \n\n",
         "- Use homebrew to install it on mac, 'brew install glpk' \n\n",
         "- 'sudo apt install libglpk-dev' on Ubuntu ",
         "\n\nThen, install the Rglpk package via ",
         "`install.packages('Rglpk')`.\n \n Another possibilty is to install the R package 'Rsymphony' ",
         "and the SYMPHONY ILP solver (https://github.com/coin-or/SYMPHONY), or the R package `lpSolve`.")
  }
  return(invisible(NULL))
}
m-Py/anticlust documentation built on April 13, 2025, 11:17 p.m.