R/fill_NA.R

Defines functions fill_NA.matrix fill_NA.data.table fill_NA.data.frame fill_NA

Documented in fill_NA fill_NA.data.frame fill_NA.data.table fill_NA.matrix

#' \code{fill_NA} function for the imputations purpose.
#'
#' @description
#' Regular imputations to fill the missing data.
#' Non missing independent variables are used to approximate a missing observations for a dependent variable.
#' Quantitative models were built under Rcpp packages and the C++ library Armadillo.
#'
#' @param x a numeric matrix or data.frame/data.table (factor/character/numeric/logical)  - variables
#' @param model a character - possible options ("lda","lm_pred","lm_bayes","lm_noise")
#' @param posit_y an integer/character - a position/name of dependent variable
#' @param posit_x an integer/character vector - positions/names of independent variables
#' @param w  a numeric vector - a weighting variable - only positive values, Default:NULL
#' @param logreg a boolean - if dependent variable has log-normal distribution (numeric). If TRUE log-regression is evaluated and then returned exponential of results., Default: FALSE
#' @param ridge a numeric - a value added to diagonal elements of the x'x matrix, Default: 1e-6
#'
#' @return load imputations in a numeric/logical/character/factor (similar to the input type) vector format
#'
#' @note
#' There is assumed that users add the intercept by their own.
#' The miceFast module provides the most efficient environment, the second recommended option is to use data.table and the numeric matrix data type.
#' The lda model is assessed only if there are more than 15 complete observations
#' and for the lms models if number of independent variables is smaller than number of observations.
#'
#' @seealso \code{\link{fill_NA_N}}  \code{\link{VIF}}  \code{vignette("miceFast-intro", package = "miceFast")}
#'
#' @examples
#' library(miceFast)
#' library(dplyr)
#' library(data.table)
#'
#' data(air_miss)
#'
#' # dplyr: continuous variable with Bayesian linear model
#' air_miss %>%
#'   mutate(Ozone_imp = fill_NA(
#'     x = ., model = "lm_bayes",
#'     posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp")
#'   ))
#'
#' # dplyr: categorical variable with LDA
#' air_miss %>%
#'   mutate(x_char_imp = fill_NA(
#'     x = ., model = "lda",
#'     posit_y = "x_character", posit_x = c("Wind", "Temp")
#'   ))
#'
#' # dplyr: grouped imputation with weights
#' air_miss %>%
#'   group_by(groups) %>%
#'   do(mutate(., Solar_R_imp = fill_NA(
#'     x = ., model = "lm_pred",
#'     posit_y = "Solar.R",
#'     posit_x = c("Wind", "Temp", "Intercept"),
#'     w = .[["weights"]]
#'   ))) %>%
#'   ungroup()
#'
#' # data.table
#' data(air_miss)
#' setDT(air_miss)
#' air_miss[, Ozone_imp := fill_NA(
#'   x = .SD, model = "lm_bayes",
#'   posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp")
#' )]
#'
#' # data.table: grouped
#' air_miss[, Solar_R_imp := fill_NA(
#'   x = .SD, model = "lm_pred",
#'   posit_y = "Solar.R",
#'   posit_x = c("Wind", "Temp", "Intercept"),
#'   w = .SD[["weights"]]
#' ), by = .(groups)]
#'
#' # See the vignette for full examples:
#' # vignette("miceFast-intro", package = "miceFast")
#'
#' @name fill_NA
#'
#' @export

fill_NA <- function(
  x,
  model,
  posit_y,
  posit_x,
  w = NULL,
  logreg = FALSE,
  ridge = 1e-6
) {
  if (
    inherits(x, "data.frame") ||
      inherits(x, "matrix") ||
      inherits(x, "data.table")
  ) {
    if (posit_y %in% posit_x) {
      stop("the same variable is dependent and independent")
    }
    model <- match.arg(model, c("lm_pred", "lda", "lm_bayes", "lm_noise"))

    cols <- colnames(x)

    if (is.character(posit_x)) {
      posit_x <- pmatch(posit_x, cols)
      posit_x <- posit_x[!is.na(posit_x)]
      if (length(posit_x) == 0) stop("posit_x is empty")
    } else {
      stopifnot(posit_x %in% seq_len(dim(x)[2]))
    }

    if (is.character(posit_y)) {
      posit_y <- pmatch(posit_y, cols)
      posit_y <- posit_y[!is.na(posit_y)]
      if (length(posit_y) == 0) stop("posit_y is empty")
    } else {
      stopifnot(posit_y %in% seq_len(dim(x)[2]))
    }

    UseMethod("fill_NA", x)
  } else {
    stop("wrong data type - it should be data.frame, matrix or data.table")
  }
}

#' @describeIn fill_NA S3 method for data.frame
fill_NA.data.frame <- function(
  x,
  model,
  posit_y,
  posit_x,
  w = NULL,
  logreg = FALSE,
  ridge = 1e-6
) {
  ww <- if (is.null(w)) vector() else w

  yy <- x[[posit_y]]

  yy_class <- class(yy)

  is_factor_y <- yy_class == "factor"
  is_character_y <- yy_class == "character"
  is_numeric_y <- (yy_class == "numeric") ||
    (yy_class == "integer") ||
    (yy_class == "logical")

  all_pos_y <- FALSE
  if (is_numeric_y) {
    all_pos_y <- !any(yy < 0, na.rm = TRUE)
  }

  if ((is_character_y || is_factor_y || (model == "lda")) && logreg) {
    stop(
      "logreg works only for a non-negative numeric dependent variable and lm models"
    )
  } else if (all_pos_y && logreg) {
    yy <- log(yy + 1e-8)
  }

  x_small <- x[, posit_x]
  types <- lapply(x_small, class)
  x_ncols <- length(posit_x)
  p_x_factor_character <- which(unlist(lapply(types, function(i) {
    !all(is.na(match(c("factor", "character"), i)))
  })))
  len_p_x_factor_character <- length(p_x_factor_character)
  xx <- vector("list", 2)

  if (len_p_x_factor_character > 0) {
    posit_fc <- posit_x[p_x_factor_character]
    x_fc <- x[, posit_fc, drop = FALSE]
    x_fc <- model.matrix.lm(~., x_fc, na.action = "na.pass")[, -1]
    xx[[1]] <- x_fc
  }

  if (x_ncols > len_p_x_factor_character) {
    posit_ni <- setdiff(posit_x, posit_x[p_x_factor_character])
    x_ni <- as.matrix(x[, posit_ni, drop = FALSE])
    xx[[2]] <- x_ni
  }

  xx <- do.call(cbind, xx[!is.null(xx)])

  if (is_factor_y) {
    l <- levels(yy)
    yy <- as.numeric(yy)
    f <- round(fill_NA_(cbind(yy, xx), model, 1, 2:(ncol(xx) + 1), ww, ridge))
    f[f <= 0] <- 1
    f[f > length(l)] <- length(l)
    ff <- factor(l[f])
  } else if (is_character_y) {
    yy <- factor(yy)
    l <- levels(yy)
    yy <- as.numeric(yy)
    f <- round(fill_NA_(cbind(yy, xx), model, 1, 2:(ncol(xx) + 1), ww, ridge))
    f[f <= 0] <- 1
    f[f > length(l)] <- length(l)
    ff <- l[f]
  } else if (is_numeric_y) {
    yy <- as.numeric(yy)
    ff <- fill_NA_(cbind(yy, xx), model, 1, 2:(ncol(xx) + 1), ww, ridge)
    if (logreg && (model != "lda")) {
      ff <- exp(ff)
    }
  }

  return(as.vector(ff))
}

#' @describeIn fill_NA s3 method for data.table
fill_NA.data.table <- function(
  x,
  model,
  posit_y,
  posit_x,
  w = NULL,
  logreg = FALSE,
  ridge = 1e-6
) {
  ww <- if (is.null(w)) vector() else w

  yy <- x[[posit_y]]

  yy_class <- class(yy)

  is_factor_y <- yy_class == "factor"
  is_character_y <- yy_class == "character"
  is_numeric_y <- (yy_class == "numeric") ||
    (yy_class == "integer") ||
    (yy_class == "logical")

  all_pos_y <- FALSE
  if (is_numeric_y) {
    all_pos_y <- !any(yy < 0, na.rm = TRUE)
  }

  if ((is_character_y || is_factor_y || (model == "lda")) && logreg) {
    stop(
      "logreg works only for a non-negative numeric dependent variable and lm models"
    )
  } else if (all_pos_y && logreg) {
    yy <- log(yy + 1e-8)
  }

  x_small <- x[, posit_x, with = FALSE]
  types <- lapply(x_small, class)
  x_ncols <- length(posit_x)
  p_x_factor_character <- which(unlist(lapply(types, function(i) {
    !all(is.na(match(c("factor", "character"), i)))
  })))
  len_p_x_factor_character <- length(p_x_factor_character)
  xx <- vector("list", 2)

  if (len_p_x_factor_character > 0) {
    posit_fc <- posit_x[p_x_factor_character]
    x_fc <- x[, posit_fc, with = FALSE]
    x_fc <- model.matrix.lm(~., x_fc, na.action = "na.pass")[, -1]
    xx[[1]] <- x_fc
  }

  if (x_ncols > len_p_x_factor_character) {
    posit_ni <- setdiff(posit_x, posit_x[p_x_factor_character])
    x_ni <- as.matrix(x[, posit_ni, with = FALSE])
    xx[[2]] <- x_ni
  }

  xx <- do.call(cbind, xx[!is.null(xx)])

  if (is_factor_y) {
    l <- levels(yy)
    yy <- as.numeric(yy)
    f <- round(fill_NA_(cbind(yy, xx), model, 1, 2:(ncol(xx) + 1), ww, ridge))
    f[f <= 0] <- 1
    f[f > length(l)] <- length(l)
    ff <- factor(l[f])
  } else if (is_character_y) {
    yy <- factor(yy)
    l <- levels(yy)
    yy <- as.numeric(yy)
    f <- round(fill_NA_(cbind(yy, xx), model, 1, 2:(ncol(xx) + 1), ww, ridge))
    f[f <= 0] <- 1
    f[f > length(l)] <- length(l)
    ff <- l[f]
  } else if (is_numeric_y) {
    yy <- as.numeric(yy)
    ff <- fill_NA_(cbind(yy, xx), model, 1, 2:(ncol(xx) + 1), ww, ridge)
    if (logreg && (model != "lda")) {
      ff <- exp(ff)
    }
  }

  return(as.vector(ff))
}

#' @describeIn fill_NA S3 method for matrix
fill_NA.matrix <- function(
  x,
  model,
  posit_y,
  posit_x,
  w = NULL,
  logreg = FALSE,
  ridge = 1e-6
) {
  ww <- if (is.null(w)) vector() else w

  all_pos_y <- !any(x[, posit_y] < 0, na.rm = TRUE)
  logreg_con <- logreg && all_pos_y && (model != "lda")

  if (logreg_con) {
    x[, posit_y] <- log(x[, posit_y] + 1e-8)
  }
  ff <- fill_NA_(x, model, posit_y, posit_x, ww, ridge)
  if (logreg_con) {
    ff <- exp(ff)
  }

  return(as.vector(ff))
}

Try the miceFast package in your browser

Any scripts or data that you put into this service are public.

miceFast documentation built on Feb. 26, 2026, 5:06 p.m.