Nothing
#' \code{fill_NA} function for the imputations purpose.
#'
#' @description
#' Regular imputations to fill the missing data.
#' Non missing independent variables are used to approximate a missing observations for a dependent variable.
#' Quantitative models were built under Rcpp packages and the C++ library Armadillo.
#'
#' @param x a numeric matrix or data.frame/data.table (factor/character/numeric/logical) - variables
#' @param model a character - possible options ("lda","lm_pred","lm_bayes","lm_noise")
#' @param posit_y an integer/character - a position/name of dependent variable
#' @param posit_x an integer/character vector - positions/names of independent variables
#' @param w a numeric vector - a weighting variable - only positive values, Default:NULL
#' @param logreg a boolean - if dependent variable has log-normal distribution (numeric). If TRUE log-regression is evaluated and then returned exponential of results., Default: FALSE
#' @param ridge a numeric - a value added to diagonal elements of the x'x matrix, Default: 1e-6
#'
#' @return load imputations in a numeric/logical/character/factor (similar to the input type) vector format
#'
#' @note
#' There is assumed that users add the intercept by their own.
#' The miceFast module provides the most efficient environment, the second recommended option is to use data.table and the numeric matrix data type.
#' The lda model is assessed only if there are more than 15 complete observations
#' and for the lms models if number of independent variables is smaller than number of observations.
#'
#' @seealso \code{\link{fill_NA_N}} \code{\link{VIF}} \code{vignette("miceFast-intro", package = "miceFast")}
#'
#' @examples
#' library(miceFast)
#' library(dplyr)
#' library(data.table)
#'
#' data(air_miss)
#'
#' # dplyr: continuous variable with Bayesian linear model
#' air_miss %>%
#' mutate(Ozone_imp = fill_NA(
#' x = ., model = "lm_bayes",
#' posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp")
#' ))
#'
#' # dplyr: categorical variable with LDA
#' air_miss %>%
#' mutate(x_char_imp = fill_NA(
#' x = ., model = "lda",
#' posit_y = "x_character", posit_x = c("Wind", "Temp")
#' ))
#'
#' # dplyr: grouped imputation with weights
#' air_miss %>%
#' group_by(groups) %>%
#' do(mutate(., Solar_R_imp = fill_NA(
#' x = ., model = "lm_pred",
#' posit_y = "Solar.R",
#' posit_x = c("Wind", "Temp", "Intercept"),
#' w = .[["weights"]]
#' ))) %>%
#' ungroup()
#'
#' # data.table
#' data(air_miss)
#' setDT(air_miss)
#' air_miss[, Ozone_imp := fill_NA(
#' x = .SD, model = "lm_bayes",
#' posit_y = "Ozone", posit_x = c("Solar.R", "Wind", "Temp")
#' )]
#'
#' # data.table: grouped
#' air_miss[, Solar_R_imp := fill_NA(
#' x = .SD, model = "lm_pred",
#' posit_y = "Solar.R",
#' posit_x = c("Wind", "Temp", "Intercept"),
#' w = .SD[["weights"]]
#' ), by = .(groups)]
#'
#' # See the vignette for full examples:
#' # vignette("miceFast-intro", package = "miceFast")
#'
#' @name fill_NA
#'
#' @export
fill_NA <- function(
x,
model,
posit_y,
posit_x,
w = NULL,
logreg = FALSE,
ridge = 1e-6
) {
if (
inherits(x, "data.frame") ||
inherits(x, "matrix") ||
inherits(x, "data.table")
) {
if (posit_y %in% posit_x) {
stop("the same variable is dependent and independent")
}
model <- match.arg(model, c("lm_pred", "lda", "lm_bayes", "lm_noise"))
cols <- colnames(x)
if (is.character(posit_x)) {
posit_x <- pmatch(posit_x, cols)
posit_x <- posit_x[!is.na(posit_x)]
if (length(posit_x) == 0) stop("posit_x is empty")
} else {
stopifnot(posit_x %in% seq_len(dim(x)[2]))
}
if (is.character(posit_y)) {
posit_y <- pmatch(posit_y, cols)
posit_y <- posit_y[!is.na(posit_y)]
if (length(posit_y) == 0) stop("posit_y is empty")
} else {
stopifnot(posit_y %in% seq_len(dim(x)[2]))
}
UseMethod("fill_NA", x)
} else {
stop("wrong data type - it should be data.frame, matrix or data.table")
}
}
#' @describeIn fill_NA S3 method for data.frame
fill_NA.data.frame <- function(
x,
model,
posit_y,
posit_x,
w = NULL,
logreg = FALSE,
ridge = 1e-6
) {
ww <- if (is.null(w)) vector() else w
yy <- x[[posit_y]]
yy_class <- class(yy)
is_factor_y <- yy_class == "factor"
is_character_y <- yy_class == "character"
is_numeric_y <- (yy_class == "numeric") ||
(yy_class == "integer") ||
(yy_class == "logical")
all_pos_y <- FALSE
if (is_numeric_y) {
all_pos_y <- !any(yy < 0, na.rm = TRUE)
}
if ((is_character_y || is_factor_y || (model == "lda")) && logreg) {
stop(
"logreg works only for a non-negative numeric dependent variable and lm models"
)
} else if (all_pos_y && logreg) {
yy <- log(yy + 1e-8)
}
x_small <- x[, posit_x]
types <- lapply(x_small, class)
x_ncols <- length(posit_x)
p_x_factor_character <- which(unlist(lapply(types, function(i) {
!all(is.na(match(c("factor", "character"), i)))
})))
len_p_x_factor_character <- length(p_x_factor_character)
xx <- vector("list", 2)
if (len_p_x_factor_character > 0) {
posit_fc <- posit_x[p_x_factor_character]
x_fc <- x[, posit_fc, drop = FALSE]
x_fc <- model.matrix.lm(~., x_fc, na.action = "na.pass")[, -1]
xx[[1]] <- x_fc
}
if (x_ncols > len_p_x_factor_character) {
posit_ni <- setdiff(posit_x, posit_x[p_x_factor_character])
x_ni <- as.matrix(x[, posit_ni, drop = FALSE])
xx[[2]] <- x_ni
}
xx <- do.call(cbind, xx[!is.null(xx)])
if (is_factor_y) {
l <- levels(yy)
yy <- as.numeric(yy)
f <- round(fill_NA_(cbind(yy, xx), model, 1, 2:(ncol(xx) + 1), ww, ridge))
f[f <= 0] <- 1
f[f > length(l)] <- length(l)
ff <- factor(l[f])
} else if (is_character_y) {
yy <- factor(yy)
l <- levels(yy)
yy <- as.numeric(yy)
f <- round(fill_NA_(cbind(yy, xx), model, 1, 2:(ncol(xx) + 1), ww, ridge))
f[f <= 0] <- 1
f[f > length(l)] <- length(l)
ff <- l[f]
} else if (is_numeric_y) {
yy <- as.numeric(yy)
ff <- fill_NA_(cbind(yy, xx), model, 1, 2:(ncol(xx) + 1), ww, ridge)
if (logreg && (model != "lda")) {
ff <- exp(ff)
}
}
return(as.vector(ff))
}
#' @describeIn fill_NA s3 method for data.table
fill_NA.data.table <- function(
x,
model,
posit_y,
posit_x,
w = NULL,
logreg = FALSE,
ridge = 1e-6
) {
ww <- if (is.null(w)) vector() else w
yy <- x[[posit_y]]
yy_class <- class(yy)
is_factor_y <- yy_class == "factor"
is_character_y <- yy_class == "character"
is_numeric_y <- (yy_class == "numeric") ||
(yy_class == "integer") ||
(yy_class == "logical")
all_pos_y <- FALSE
if (is_numeric_y) {
all_pos_y <- !any(yy < 0, na.rm = TRUE)
}
if ((is_character_y || is_factor_y || (model == "lda")) && logreg) {
stop(
"logreg works only for a non-negative numeric dependent variable and lm models"
)
} else if (all_pos_y && logreg) {
yy <- log(yy + 1e-8)
}
x_small <- x[, posit_x, with = FALSE]
types <- lapply(x_small, class)
x_ncols <- length(posit_x)
p_x_factor_character <- which(unlist(lapply(types, function(i) {
!all(is.na(match(c("factor", "character"), i)))
})))
len_p_x_factor_character <- length(p_x_factor_character)
xx <- vector("list", 2)
if (len_p_x_factor_character > 0) {
posit_fc <- posit_x[p_x_factor_character]
x_fc <- x[, posit_fc, with = FALSE]
x_fc <- model.matrix.lm(~., x_fc, na.action = "na.pass")[, -1]
xx[[1]] <- x_fc
}
if (x_ncols > len_p_x_factor_character) {
posit_ni <- setdiff(posit_x, posit_x[p_x_factor_character])
x_ni <- as.matrix(x[, posit_ni, with = FALSE])
xx[[2]] <- x_ni
}
xx <- do.call(cbind, xx[!is.null(xx)])
if (is_factor_y) {
l <- levels(yy)
yy <- as.numeric(yy)
f <- round(fill_NA_(cbind(yy, xx), model, 1, 2:(ncol(xx) + 1), ww, ridge))
f[f <= 0] <- 1
f[f > length(l)] <- length(l)
ff <- factor(l[f])
} else if (is_character_y) {
yy <- factor(yy)
l <- levels(yy)
yy <- as.numeric(yy)
f <- round(fill_NA_(cbind(yy, xx), model, 1, 2:(ncol(xx) + 1), ww, ridge))
f[f <= 0] <- 1
f[f > length(l)] <- length(l)
ff <- l[f]
} else if (is_numeric_y) {
yy <- as.numeric(yy)
ff <- fill_NA_(cbind(yy, xx), model, 1, 2:(ncol(xx) + 1), ww, ridge)
if (logreg && (model != "lda")) {
ff <- exp(ff)
}
}
return(as.vector(ff))
}
#' @describeIn fill_NA S3 method for matrix
fill_NA.matrix <- function(
x,
model,
posit_y,
posit_x,
w = NULL,
logreg = FALSE,
ridge = 1e-6
) {
ww <- if (is.null(w)) vector() else w
all_pos_y <- !any(x[, posit_y] < 0, na.rm = TRUE)
logreg_con <- logreg && all_pos_y && (model != "lda")
if (logreg_con) {
x[, posit_y] <- log(x[, posit_y] + 1e-8)
}
ff <- fill_NA_(x, model, posit_y, posit_x, ww, ridge)
if (logreg_con) {
ff <- exp(ff)
}
return(as.vector(ff))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.