R/utils-delete.R
In missMethods: Methods for Missing Data

## Indices for deleting -------------------------------------------------------

#' Get NA indices
#'
#' This function determines the (row) indices, for NA values.
#'
#' If `n_mis_stochastic` is false and `prob` is not NULL, the call to `sample()`
#' can be problematic. Details are documented in the test file (weighted
#' sampling without replacement).
#'
#' @param n_mis_stochastic Logical; as always (see for example `delete_MCAR()`).
#' @param n integer(1); number of objects
#' @param p probability for a value to be missing. 0 <= p <= 1
#' @param prob sampling weights for the indices. Probability for a index to be
#'   missing is proportional to `prob`. For too high values of `p` some of these
#'   can be scaled down. 0 and `Inf` are possible. Indices with prob == 0 will
#'   never be returned. Indices with infinite prob will be considered first.
#'   Only if there are less indices with infinite prob than `n_mis`, the other
#'   indices will be considered for missing values.
#' @param n_mis The number of missing values. Normally, `p` should be supplied
#'   instead of `n_mis`.
#' @param indices Indices, which could be set NA.
#' @param warn Should warnings be given?
#'
#' @return (row) indices for setting NA
#'
#' @noRd
get_NA_indices <- function(n_mis_stochastic, n = length(indices), p = n_mis / n,
                           prob = NULL,
                           n_mis = round(n * p),
                           indices = seq_len(n),
                           warn = getOption("missMethods.warn.too.high.p")) {
  stopifnot(
    is.logical(n_mis_stochastic), length(n_mis_stochastic) == 1L,
    missing(p) || (is.numeric(p) && length(p) == 1L),
    is.null(prob) || n == length(prob),
    is.numeric(n_mis), length(n_mis) == 1L,
    n == length(indices)
  )

  ## Check for too high p or n_mis (and just it) ------------------------------
  n_mis_max <- sum(prob > 0)
  if (!is.null(prob) && n_mis_max < n * p) {
    max_p <- n_mis_max / n
    if (warn) {
      warning(
        "p = ", p, " is too high for the chosen mechanims (and data);",
        "it will be reduced to ", max_p
      )
    }
    p <- max_p
    n_mis <- n_mis_max
  }

  ## Check for Inf in prob ----------------------------------------------------
  prob_inf <- is.infinite(prob)
  if (!is.null(prob) && any(prob_inf)) {
    n_inf <- sum(prob_inf)
    if (n_inf >= n_mis) {
      # Only indices with prob == Inf should have missing values
      p <- p * n / n_inf
      return(Recall(
        n_mis_stochastic,
        p = p, n_mis = n_mis,
        indices = which(prob_inf), prob = NULL
      ))
    } else { # Less prob_inf then missing values
      # All indices with prob == Inf should be NA
      na_indices_inf <- which(prob_inf)
      # Remove this indices and probs
      indices <- indices[!prob_inf]
      prob <- prob[!prob_inf]
      # Adjust p and n_mis
      p <- (n * p - n_inf) / (n - n_inf)
      n_mis <- n_mis - n_inf
      further_na_indices <- Recall(n_mis_stochastic, p = p, prob = prob, n_mis = n_mis, indices = indices)
      return(c(na_indices_inf, further_na_indices))
    }
  }

  ## Check for too high prob values -------------------------------------------
  prob_scaled <- prob / sum(prob)
  if (!is.null(prob) && any(prob_scaled > 1 / (n * p)) && warn) {
    warning(
      "p or some prob values are too high;",
      " the too high prob values will be scaled down."
    )
  }

  ## Get NA indices -----------------------------------------------------------
  if (n_mis_stochastic) {
    if (is.null(prob)) {
      na_indices <- sample(c(TRUE, FALSE), n, replace = TRUE, prob = c(p, 1 - p))
      # na_indices <- stats::runif(n) < p # old
    } else {
      # First: Normalize prob
      prob <- prob / sum(prob)
      # Do we need to scale prob?
      while (any(prob > 1 / (n * p))) {
        # Scale down to big probs
        to_big <- prob > 1 / (n * p)
        prob[to_big] <- 1 / (n * p)
        # Scale up other probs (if they are not 0 and not already scaled)
        scaled <- prob >= 1 / (n * p)
        neq_0_not_scaled <- prob > 0 & !scaled
        if (any(neq_0_not_scaled)) {
          prob[neq_0_not_scaled] <- prob[neq_0_not_scaled] *
            (1 - sum(scaled) * 1 / (n * p)) / sum(prob[neq_0_not_scaled])
        } else {
          # We would have to delete values from objects with prob == 0!
          # This should never happen (check for too high p!)!
          # prob[prob == 0] <- (1 - sum(scaled) * 1 / (n * p)) / sum(prob == 0)
          break()
        }
      }
      # After this loop all probs should be <= 1/(n*p) and sum(prob) == 1
      if (any(prob > 1 / (n * p)) || isFALSE(all.equal(sum(prob), 1))) {
        stop("We have a problem with prob; did you specify 'p' correctly?")
      }

      prob <- prob * n * p

      # get NA indices
      na_indices <- stats::runif(n) < prob
    }
    na_indices <- indices[na_indices]
  } else { # not n_mis_stochastic
    na_indices <- resample(indices, n_mis, prob = prob)
  }
  na_indices
}


## Finding groups -------------------------------------------------------------

find_groups <- function(x, cutoff_fun, prop, use_lpSolve,
                        ordered_as_unordered, ...) {
  unique_x <- unique(x)
  nr_unique <- length(unique_x)
  if (nr_unique == 1) {
    warning("grouping not possible, because x is constant")
    groups <- list(g1 = seq_along(x), g2 = NULL)
  } else if (is.numeric(x) || (is.ordered(x) && !ordered_as_unordered)) {
    cutoff_fun <- match.fun(cutoff_fun)
    groups <- find_groups_by_cutoff_val(x, cutoff_fun(x, ...))
  } else { # fallback for not (numeric or ordered factor)
    groups <- find_groups_by_prop(x, prop = prop, use_lpSolve)
  }
  groups
}


find_groups_by_cutoff_val <- function(x, cutoff_val) {
  # get rows below the cutoff value
  below <- x < cutoff_val
  # get rows greater than or equal to the cutoff value ("above")
  above <- which(!below)
  below <- which(below)
  # cure problem: no values below (but only if cols_ctrl[i] is not constant!)
  if ((length(below) == 0L) && length(unique(x)) != 1L) {
    # instead of only below, now accept below and equal
    below <- x <= cutoff_val
    above <- which(!below) # and above will be only above (not equal anymore)
    below <- which(below)
  }
  list(g1 = below, g2 = above)
}


find_groups_by_prop <- function(x, prop, use_lpSolve = TRUE) {
  ux <- unique(x) # order: first occurring in x
  nr_ux <- tabulate(match(x, ux))
  if (use_lpSolve) {
    if (requireNamespace("lpSolve", quietly = TRUE)) {
      solution <- lpSolve::lp(
        direction = "min", objective.in = nr_ux,
        const.mat = matrix(nr_ux, nrow = 1),
        const.dir = ">=",
        const.rhs = prop * length(x),
        all.bin = TRUE
      )
      if (solution$status == 0) {
        g1_ux <- ux[solution$solution > 0]
      } else {
        stop("lpSolve found no solution")
      }
    } else { # no lpSolve installed
      stop("Package \"lpSolve\" needed. Either install it or use \"use_lpSolve = FALSE\".")
    }
  } else {
    g1_ux <- cumsum(nr_ux) <= ceiling(prop * length(x))
    g1_ux <- ux[g1_ux]
  }
  # g1 should not be empty, but only if there is more than one different value in x
  if (length(g1_ux) == 0L && length(unique(x)) != 1) {
    g1_ux <- ux[1]
  }
  find_groups_by_values(x, g1_ux)
}


find_groups_by_values <- function(x, values) {
  g1 <- x %in% values
  g2 <- which(!g1)
  g1 <- which(g1)
  list(g1 = g1, g2 = g2)
}


## More helpers ---------------------------------------------------------------

calc_n_mis_g1 <- function(nr_g1, p_mis_g1,
                          nr_g2, n_mis, x) {
  if (n_mis == 0L) {
    n_mis_g1 <- 0L
  } else if (nr_g2 == 0L) {
    n_mis_g1 <- n_mis
  } else {
    n_mis_g1_ceil <- ceiling(nr_g1 * p_mis_g1)
    n_mis_g1_floor <- floor(nr_g1 * p_mis_g1)
    odds_ceil <- n_mis_g1_ceil / nr_g1 /
      ((n_mis - n_mis_g1_ceil) / nr_g2)
    odds_floor <- n_mis_g1_floor / nr_g1 /
      ((n_mis - n_mis_g1_floor) / nr_g2)
    if (abs(1 / x - odds_ceil) < abs(1 / x - odds_floor)) {
      n_mis_g1 <- n_mis_g1_ceil
    } else {
      n_mis_g1 <- n_mis_g1_floor
    }
  }
  n_mis_g1
}