R/fill_NA_N.R

Defines functions fill_NA_N.matrix fill_NA_N.data.table fill_NA_N.data.frame fill_NA_N

Documented in fill_NA_N fill_NA_N.data.frame fill_NA_N.data.table fill_NA_N.matrix

#' \code{fill_NA_N} function for the multiple imputations purpose
#'
#' @description
#' Multiple imputations to fill the missing data.
#' Non missing independent variables are used to approximate a missing observations for a dependent variable.
#' Quantitative models were built under Rcpp packages and the C++ library Armadillo.
#'
#' @param x a numeric matrix or data.frame/data.table (factor/character/numeric/logical) - variables
#' @param model a character - possible options ("lm_bayes","lm_noise","pmm")
#' @param posit_y an integer/character - a position/name of dependent variable
#' @param posit_x an integer/character vector - positions/names of independent variables
#' @param w  a numeric vector - a weighting variable - only positive values, Default: NULL
#' @param k an integer - a number of multiple imputations or for pmm a number of closest points from which a one random value is taken, Default:10
#' @param logreg a boolean - if dependent variable has log-normal distribution (numeric). If TRUE log-regression is evaluated and then returned exponential of results., Default: FALSE
#' @param ridge a numeric - a value added to diagonal elements of the x'x matrix, Default: 1e-6
#'
#' @return load imputations in a numeric/character/factor (similar to the input type) vector format
#'
#' @note
#' It is assumed that users add the intercept column themselves.
#' The miceFast module provides the most efficient environment; the second recommended option is data.table with a numeric matrix.
#' Only \code{"lm_bayes"}, \code{"lm_noise"}, and \code{"pmm"} models are supported.
#' The model is fitted only when the number of complete observations exceeds the number of independent variables.
#'
#' @seealso \code{\link{fill_NA}} \code{\link{VIF}}  \code{vignette("miceFast-intro", package = "miceFast")}
#'
#' @examples
#' library(miceFast)
#' library(dplyr)
#' library(data.table)
#'
#' data(air_miss)
#'
#' # dplyr: PMM with 20 draws
#' air_miss %>%
#'   mutate(Ozone_pmm = fill_NA_N(
#'     x = ., model = "pmm",
#'     posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp"),
#'     k = 20
#'   ))
#'
#' # dplyr: lm_noise with weights
#' air_miss %>%
#'   mutate(Ozone_imp = fill_NA_N(
#'     x = ., model = "lm_noise",
#'     posit_y = "Ozone",
#'     posit_x = c("Solar.R", "Wind", "Temp"),
#'     w = .[["weights"]],
#'     logreg = TRUE, k = 30
#'   ))
#'
#' # data.table: PMM grouped
#' data(air_miss)
#' setDT(air_miss)
#' air_miss[, Ozone_pmm := fill_NA_N(
#'   x = .SD, model = "pmm",
#'   posit_y = "Ozone",
#'   posit_x = c("Wind", "Temp", "Intercept"),
#'   k = 20
#' ), by = .(groups)]
#'
#' # See the vignette for full examples:
#' # vignette("miceFast-intro", package = "miceFast")
#'
#' @name fill_NA_N
#'
#' @export

fill_NA_N <- function(
  x,
  model,
  posit_y,
  posit_x,
  w = NULL,
  logreg = FALSE,
  k = 10,
  ridge = 1e-6
) {
  if (
    inherits(x, "data.frame") ||
      inherits(x, "matrix") ||
      inherits(x, "data.table")
  ) {
    if (posit_y %in% posit_x) {
      stop("the same variable is dependent and independent")
    }
    model <- match.arg(model, c("lm_bayes", "lm_noise", "pmm"))

    cols <- colnames(x)

    if (is.character(posit_x)) {
      posit_x <- pmatch(posit_x, cols)
      posit_x <- posit_x[!is.na(posit_x)]
      if (length(posit_x) == 0) stop("posit_x is empty")
    } else {
      stopifnot(posit_x %in% seq_along(x))
    }

    if (is.character(posit_y)) {
      posit_y <- pmatch(posit_y, cols)
      if (length(posit_y) == 0) stop("posit_y is empty")
    } else {
      stopifnot(posit_y %in% seq_along(x))
    }

    UseMethod("fill_NA_N", x)
  } else {
    stop("wrong data type - it should be data.frame, matrix or data.table")
  }
}

#' @describeIn fill_NA_N s3 method for data.frame

fill_NA_N.data.frame <- function(
  x,
  model,
  posit_y,
  posit_x,
  w = NULL,
  logreg = FALSE,
  k = 10,
  ridge = 1e-6
) {
  ww <- if (is.null(w)) vector() else w

  yy <- x[[posit_y]]

  yy_class <- class(yy)

  is_factor_y <- yy_class == "factor"
  is_character_y <- yy_class == "character"
  is_numeric_y <- (yy_class == "numeric") ||
    (yy_class == "integer") ||
    (yy_class == "logical")

  all_pos_y <- FALSE
  if (is_numeric_y) {
    all_pos_y <- !any(yy < 0, na.rm = TRUE)
  }

  if ((is_character_y || is_factor_y || (model == "lda")) && logreg) {
    stop(
      "logreg works only for a non-negative numeric dependent variable and lm models"
    )
  } else if (all_pos_y && logreg) {
    yy <- log(yy + 1e-8)
  }

  x_small <- x[, posit_x]
  types <- lapply(x_small, class)
  x_ncols <- length(posit_x)
  p_x_factor_character <- which(unlist(lapply(types, function(i) {
    !all(is.na(match(c("factor", "character"), i)))
  })))
  len_p_x_factor_character <- length(p_x_factor_character)

  xx <- vector("list", 2)

  if (len_p_x_factor_character > 0) {
    posit_fc <- posit_x[p_x_factor_character]
    x_fc <- x[, posit_fc, drop = FALSE]
    x_fc <- model.matrix.lm(~., x_fc, na.action = "na.pass")[, -1]
    xx[[1]] <- x_fc
  }

  if (x_ncols > len_p_x_factor_character) {
    posit_ni <- setdiff(posit_x, posit_x[p_x_factor_character])
    x_ni <- as.matrix(x[, posit_ni, drop = FALSE])
    xx[[2]] <- x_ni
  }

  xx <- do.call(cbind, xx[!is.null(xx)])

  if (is_factor_y) {
    l <- levels(yy)
    yy <- as.numeric(yy)
    f <- round(fill_NA_N_(
      cbind(yy, xx),
      model,
      1,
      2:(ncol(xx) + 1),
      ww,
      k,
      ridge
    ))
    f[f <= 0] <- 1
    f[f > length(l)] <- length(l)
    ff <- factor(l[f])
  } else if (is_character_y) {
    yy <- factor(yy)
    l <- levels(yy)
    yy <- as.numeric(yy)
    f <- round(fill_NA_N_(
      cbind(yy, xx),
      model,
      1,
      2:(ncol(xx) + 1),
      ww,
      k,
      ridge
    ))
    f[f <= 0] <- 1
    f[f > length(l)] <- length(l)
    ff <- l[f]
  } else if (is_numeric_y) {
    yy <- as.numeric(yy)
    ff <- fill_NA_N_(cbind(yy, xx), model, 1, 2:(ncol(xx) + 1), ww, k, ridge)
    if (logreg && (model != "lda")) {
      ff <- exp(ff)
    }
  }

  return(as.vector(ff))
}

#' @describeIn fill_NA_N S3 method for data.table

fill_NA_N.data.table <- function(
  x,
  model,
  posit_y,
  posit_x,
  w = NULL,
  logreg = FALSE,
  k = 10,
  ridge = 1e-6
) {
  ww <- if (is.null(w)) vector() else w

  yy <- x[[posit_y]]

  yy_class <- class(yy)

  is_factor_y <- yy_class == "factor"
  is_character_y <- yy_class == "character"
  is_numeric_y <- (yy_class == "numeric") ||
    (yy_class == "integer") ||
    (yy_class == "logical")

  all_pos_y <- FALSE
  if (is_numeric_y) {
    all_pos_y <- !any(yy < 0, na.rm = TRUE)
  }

  if ((is_character_y || is_factor_y || (model == "lda")) && logreg) {
    stop(
      "logreg works only for a non-negative numeric dependent variable and lm models"
    )
  } else if (all_pos_y && logreg) {
    yy <- log(yy + 1e-8)
  }

  x_small <- x[, posit_x, with = FALSE]
  types <- lapply(x_small, class)
  x_ncols <- length(posit_x)
  p_x_factor_character <- which(unlist(lapply(types, function(i) {
    !all(is.na(match(c("factor", "character"), i)))
  })))
  len_p_x_factor_character <- length(p_x_factor_character)

  xx <- vector("list", 2)

  if (len_p_x_factor_character > 0) {
    posit_fc <- posit_x[p_x_factor_character]
    x_fc <- x[, posit_fc, with = FALSE]
    x_fc <- model.matrix.lm(~., x_fc, na.action = "na.pass")[, -1]
    xx[[1]] <- x_fc
  }

  if (x_ncols > len_p_x_factor_character) {
    posit_ni <- setdiff(posit_x, posit_x[p_x_factor_character])
    x_ni <- as.matrix(x[, posit_ni, with = FALSE])
    xx[[2]] <- x_ni
  }

  xx <- do.call(cbind, xx[!is.null(xx)])

  if (is_factor_y) {
    l <- levels(yy)
    yy <- as.numeric(yy)
    f <- round(fill_NA_N_(
      cbind(yy, xx),
      model,
      1,
      2:(ncol(xx) + 1),
      ww,
      k,
      ridge
    ))
    f[f <= 0] <- 1
    f[f > length(l)] <- length(l)
    ff <- factor(l[f])
  } else if (is_character_y) {
    yy <- factor(yy)
    l <- levels(yy)
    yy <- as.numeric(yy)
    f <- round(fill_NA_N_(
      cbind(yy, xx),
      model,
      1,
      2:(ncol(xx) + 1),
      ww,
      k,
      ridge
    ))
    f[f <= 0] <- 1
    f[f > length(l)] <- length(l)
    ff <- l[f]
  } else if (is_numeric_y) {
    yy <- as.numeric(yy)
    ff <- fill_NA_N_(cbind(yy, xx), model, 1, 2:(ncol(xx) + 1), ww, k, ridge)
    if (logreg && (model != "lda")) {
      ff <- exp(ff)
    }
  }

  return(as.vector(ff))
}

#' @describeIn fill_NA_N S3 method for matrix

fill_NA_N.matrix <- function(
  x,
  model,
  posit_y,
  posit_x,
  w = NULL,
  logreg = FALSE,
  k = 10,
  ridge = 1e-6
) {
  ww <- if (is.null(w)) vector() else w

  all_pos_y <- !any(x[, posit_y] < 0, na.rm = TRUE)
  logreg_con <- logreg && all_pos_y && (model != "lda")

  if (logreg_con) {
    x[, posit_y] <- log(x[, posit_y] + 1e-8)
  }
  ff <- fill_NA_N_(x, model, posit_y, posit_x, ww, k, ridge)
  if (logreg_con) {
    ff <- exp(ff)
  }

  return(as.vector(ff))
}

Try the miceFast package in your browser

Any scripts or data that you put into this service are public.

miceFast documentation built on Feb. 26, 2026, 5:06 p.m.