Nothing
#' \code{fill_NA} function for the imputations purpose.
#'
#' @description
#' Regular imputations to fill the missing data.
#' Non missing independent variables are used to approximate a missing observations for a dependent variable.
#' Quantitative models were built under Rcpp packages and the C++ library Armadillo.
#'
#' @param x a numeric matrix or data.frame/data.table (factor/character/numeric/logical) - variables
#' @param model a character - posibble options ("lda","lm_pred","lm_bayes","lm_noise")
#' @param posit_y an integer/character - a position/name of dependent variable
#' @param posit_x an integer/character vector - positions/names of independent variables
#' @param w a numeric vector - a weighting variable - only positive values, Default:NULL
#' @param logreg a boolean - if dependent variable has log-normal distribution (numeric). If TRUE log-regression is evaluated and then returned exponential of results., Default: FALSE
#' @param ridge a numeric - a value added to diagonal elements of the x'x matrix, Default:1e-5
#'
#' @return load imputations in a numeric/logical/character/factor (similar to the input type) vector format
#'
#' @note
#' There is assumed that users add the intercept by their own.
#' The miceFast module provides the most efficient environment, the second recommended option is to use data.table and the numeric matrix data type.
#' The lda model is assessed only if there are more than 15 complete observations
#' and for the lms models if number of independent variables is smaller than number of observations.
#'
#' @seealso \code{\link{fill_NA_N}} \code{\link{VIF}}
#'
#' @examples
#' library(miceFast)
#' library(dplyr)
#' library(data.table)
#' ### Data
#' # airquality dataset with additional variables
#' data(air_miss)
#' ### Intro: dplyr
#' # IMPUTATIONS
#' air_miss <- air_miss %>%
#' # Imputations with a grouping option (models are separately assessed for each group)
#' # taking into account provided weights
#' group_by(groups) %>%
#' do(mutate(., Solar_R_imp = fill_NA(
#' x = .,
#' model = "lm_pred",
#' posit_y = "Solar.R",
#' posit_x = c("Wind", "Temp", "Intercept"),
#' w = .[["weights"]]
#' ))) %>%
#' ungroup() %>%
#' # Imputations - discrete variable
#' mutate(x_character_imp = fill_NA(
#' x = .,
#' model = "lda",
#' posit_y = "x_character",
#' posit_x = c("Wind", "Temp")
#' )) %>%
#' # logreg was used because almost log-normal distribution of Ozone
#' # imputations around mean
#' mutate(Ozone_imp1 = fill_NA(
#' x = .,
#' model = "lm_bayes",
#' posit_y = "Ozone",
#' posit_x = c("Intercept"),
#' logreg = TRUE
#' )) %>%
#' # imputations using positions - Intercept, Temp
#' mutate(Ozone_imp2 = fill_NA(
#' x = .,
#' model = "lm_bayes",
#' posit_y = 1,
#' posit_x = c(4, 6),
#' logreg = TRUE
#' )) %>%
#' # multiple imputations (average of x30 imputations)
#' # with a factor independent variable, weights and logreg options
#' mutate(Ozone_imp3 = fill_NA_N(
#' x = .,
#' model = "lm_noise",
#' posit_y = "Ozone",
#' posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
#' w = .[["weights"]],
#' logreg = TRUE,
#' k = 30
#' )) %>%
#' mutate(Ozone_imp4 = fill_NA_N(
#' x = .,
#' model = "lm_bayes",
#' posit_y = "Ozone",
#' posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
#' w = .[["weights"]],
#' logreg = TRUE,
#' k = 30
#' )) %>%
#' group_by(groups) %>%
#' do(mutate(., Ozone_imp5 = fill_NA(
#' x = .,
#' model = "lm_pred",
#' posit_y = "Ozone",
#' posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
#' w = .[["weights"]],
#' logreg = TRUE
#' ))) %>%
#' do(mutate(., Ozone_imp6 = fill_NA_N(
#' x = .,
#' model = "pmm",
#' posit_y = "Ozone",
#' posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
#' w = .[["weights"]],
#' logreg = TRUE,
#' k = 20
#' ))) %>%
#' ungroup() %>%
#' # Average of a few methods
#' mutate(Ozone_imp_mix = rowMeans(select(., starts_with("Ozone_imp")))) %>%
#' # Protecting against collinearity or low number of observations - across small groups
#' # Be carful when using a grouping option
#' # because of lack of protection against collinearity or low number of observations.
#' # There could be used a tryCatch(fill_NA(...),error=function(e) return(...))
#' group_by(groups) %>%
#' do(mutate(., Ozone_chac_imp = tryCatch(
#' fill_NA(
#' x = .,
#' model = "lda",
#' posit_y = "Ozone_chac",
#' posit_x = c(
#' "Intercept",
#' "Month",
#' "Day",
#' "Temp",
#' "x_character_imp"
#' ),
#' w = .[["weights"]]
#' ),
#' error = function(e) .[["Ozone_chac"]]
#' ))) %>%
#' ungroup()
#'
#' # Sample of results
#' air_miss[which(is.na(air_miss[, 1]))[1:5], ]
#'
#' ### Intro: data.table
#' # IMPUTATIONS
#' # Imputations with a grouping option (models are separately assessed for each group)
#' # taking into account provided weights
#' data(air_miss)
#' setDT(air_miss)
#' air_miss[, Solar_R_imp := fill_NA_N(
#' x = .SD,
#' model = "lm_bayes",
#' posit_y = "Solar.R",
#' posit_x = c("Wind", "Temp", "Intercept"),
#' w = .SD[["weights"]],
#' k = 100
#' ), by = .(groups)] %>%
#' # Imputations - discrete variable
#' .[, x_character_imp := fill_NA(
#' x = .SD,
#' model = "lda",
#' posit_y = "x_character",
#' posit_x = c("Wind", "Temp", "groups")
#' )] %>%
#' # logreg was used because almost log-normal distribution of Ozone
#' # imputations around mean
#' .[, Ozone_imp1 := fill_NA(
#' x = .SD,
#' model = "lm_bayes",
#' posit_y = "Ozone",
#' posit_x = c("Intercept"),
#' logreg = TRUE
#' )] %>%
#' # imputations using positions - Intercept, Temp
#' .[, Ozone_imp2 := fill_NA(
#' x = .SD,
#' model = "lm_bayes",
#' posit_y = 1,
#' posit_x = c(4, 6),
#' logreg = TRUE
#' )] %>%
#' # model with a factor independent variable
#' # multiple imputations (average of x30 imputations)
#' # with a factor independent variable, weights and logreg options
#' .[, Ozone_imp3 := fill_NA_N(
#' x = .SD,
#' model = "lm_noise",
#' posit_y = "Ozone",
#' posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
#' w = .SD[["weights"]],
#' logreg = TRUE,
#' k = 30
#' )] %>%
#' .[, Ozone_imp4 := fill_NA_N(
#' x = .SD,
#' model = "lm_bayes",
#' posit_y = "Ozone",
#' posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
#' w = .SD[["weights"]],
#' logreg = TRUE,
#' k = 30
#' )] %>%
#' .[, Ozone_imp5 := fill_NA(
#' x = .SD,
#' model = "lm_pred",
#' posit_y = "Ozone",
#' posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
#' w = .SD[["weights"]],
#' logreg = TRUE
#' ), .(groups)] %>%
#' .[, Ozone_imp6 := fill_NA_N(
#' x = .SD,
#' model = "pmm",
#' posit_y = "Ozone",
#' posit_x = c("Intercept", "x_character_imp", "Wind", "Temp"),
#' w = .SD[["weights"]],
#' logreg = TRUE,
#' k = 10
#' ), .(groups)] %>%
#' # Average of a few methods
#' .[, Ozone_imp_mix := apply(.SD, 1, mean), .SDcols = Ozone_imp1:Ozone_imp6] %>%
#' # Protecting against collinearity or low number of observations - across small groups
#' # Be carful when using a data.table grouping option
#' # because of lack of protection against collinearity or low number of observations.
#' # There could be used a tryCatch(fill_NA(...),error=function(e) return(...))
#'
#' .[, Ozone_chac_imp := tryCatch(
#' fill_NA(
#' x = .SD,
#' model = "lda",
#' posit_y = "Ozone_chac",
#' posit_x = c(
#' "Intercept",
#' "Month",
#' "Day",
#' "Temp",
#' "x_character_imp"
#' ),
#' w = .SD[["weights"]]
#' ),
#' error = function(e) .SD[["Ozone_chac"]]
#' ), .(groups)]
#'
#' # Sample of results
#' air_miss[which(is.na(air_miss[, 1]))[1:5], ]
#'
#' @name fill_NA
#'
#' @export
fill_NA <- function(x, model, posit_y, posit_x, w = NULL, logreg = FALSE, ridge = 1e-6) {
if (inherits(x, "data.frame") || inherits(x, "matrix") || inherits(x, "data.table")) {
if (posit_y %in% posit_x) {
stop("the same variable is dependent and indepentent")
}
model <- match.arg(model, c("lm_pred", "lda", "lm_bayes", "lm_noise"))
cols <- colnames(x)
if (is.character(posit_x)) {
posit_x <- pmatch(posit_x, cols)
posit_x <- posit_x[!is.na(posit_x)]
if (length(posit_x) == 0) stop("posit_x is empty")
} else {
stopifnot(posit_x %in% seq_len(dim(x)[2]))
}
if (is.character(posit_y)) {
posit_y <- pmatch(posit_y, cols)
posit_y <- posit_y[!is.na(posit_y)]
if (length(posit_y) == 0) stop("posit_y is empty")
} else {
stopifnot(posit_y %in% seq_len(dim(x)[2]))
}
UseMethod("fill_NA", x)
} else {
stop("wrong data type - it should be data.frame, matrix or data.table")
}
}
#' @describeIn fill_NA S3 method for data.frame
fill_NA.data.frame <- function(x, model, posit_y, posit_x, w = NULL, logreg = FALSE, ridge = 1e-6) {
ww <- if (is.null(w)) vector() else w
yy <- x[[posit_y]]
yy_class <- class(yy)
is_factor_y <- yy_class == "factor"
is_character_y <- yy_class == "character"
is_numeric_y <- (yy_class == "numeric") || (yy_class == "integer") || (yy_class == "logical")
all_pos_y <- FALSE
if (is_numeric_y) {
all_pos_y <- !any(yy < 0, na.rm = TRUE)
}
if ((is_character_y || is_factor_y || (model == "lda")) && logreg) {
stop("logreg works only for a non-negative numeric dependent variable and lm models")
} else if (all_pos_y && logreg) {
yy <- log(yy + 1e-8)
}
x_small <- x[, posit_x]
types <- lapply(x_small, class)
x_ncols <- length(posit_x)
p_x_factor_character <- which(unlist(lapply(types, function(i) !all(is.na(match(c("factor", "character"), i))))))
len_p_x_factor_character <- length(p_x_factor_character)
xx <- vector("list", 2)
if (len_p_x_factor_character > 0) {
posit_fc <- posit_x[p_x_factor_character]
x_fc <- x[, posit_fc, drop = FALSE]
x_fc <- model.matrix.lm(~., x_fc, na.action = "na.pass")[, -1]
xx[[1]] <- x_fc
}
if (x_ncols > len_p_x_factor_character) {
posit_ni <- setdiff(posit_x, posit_x[p_x_factor_character])
x_ni <- as.matrix(x[, posit_ni, drop = FALSE])
xx[[2]] <- x_ni
}
xx <- do.call(cbind, xx[!is.null(xx)])
if (is_factor_y) {
l <- levels(yy)
yy <- as.numeric(yy)
f <- round(fill_NA_(cbind(yy, xx), model, 1, 2:(ncol(xx) + 1), ww, ridge))
f[f <= 0] <- 1
f[f > length(l)] <- length(l)
ff <- factor(l[f])
} else if (is_character_y) {
yy <- if (model != "lda") factor(yy, levels = sort(as.numeric(unique(yy)))) else factor(yy)
l <- levels(yy)
yy <- as.numeric(yy)
f <- round(fill_NA_(cbind(yy, xx), model, 1, 2:(ncol(xx) + 1), ww, ridge))
f[f <= 0] <- 1
f[f > length(l)] <- length(l)
ff <- l[f]
} else if (is_numeric_y) {
yy <- as.numeric(yy)
ff <- fill_NA_(cbind(yy, xx), model, 1, 2:(ncol(xx) + 1), ww, ridge)
if (logreg && (model != "lda")) {
ff <- exp(ff)
}
}
return(as.vector(ff))
}
#' @describeIn fill_NA s3 method for data.table
fill_NA.data.table <- function(x, model, posit_y, posit_x, w = NULL, logreg = FALSE, ridge = 1e-6) {
ww <- if (is.null(w)) vector() else w
yy <- x[[posit_y]]
yy_class <- class(yy)
is_factor_y <- yy_class == "factor"
is_character_y <- yy_class == "character"
is_numeric_y <- (yy_class == "numeric") || (yy_class == "integer") || (yy_class == "logical")
all_pos_y <- FALSE
if (is_numeric_y) {
all_pos_y <- !any(yy < 0, na.rm = TRUE)
}
if ((is_character_y || is_factor_y || (model == "lda")) && logreg) {
stop("logreg works only for a non-negative numeric dependent variable and lm models")
} else if (all_pos_y && logreg) {
yy <- log(yy + 1e-8)
}
x_small <- x[, posit_x, with = FALSE]
types <- lapply(x_small, class)
x_ncols <- length(posit_x)
p_x_factor_character <- which(unlist(lapply(types, function(i) !all(is.na(match(c("factor", "character"), i))))))
len_p_x_factor_character <- length(p_x_factor_character)
xx <- vector("list", 2)
if (len_p_x_factor_character > 0) {
posit_fc <- posit_x[p_x_factor_character]
x_fc <- x[, posit_fc, with = FALSE]
x_fc <- model.matrix.lm(~., x_fc, na.action = "na.pass")[, -1]
xx[[1]] <- x_fc
}
if (x_ncols > len_p_x_factor_character) {
posit_ni <- setdiff(posit_x, posit_x[p_x_factor_character])
x_ni <- as.matrix(x[, posit_ni, with = FALSE])
xx[[2]] <- x_ni
}
xx <- do.call(cbind, xx[!is.null(xx)])
if (is_factor_y) {
l <- levels(yy)
yy <- as.numeric(yy)
f <- round(fill_NA_(cbind(yy, xx), model, 1, 2:(ncol(xx) + 1), ww, ridge))
f[f <= 0] <- 1
f[f > length(l)] <- length(l)
ff <- factor(l[f])
} else if (is_character_y) {
yy <- if (model != "lda") factor(yy, levels = sort(as.numeric(unique(yy)))) else factor(yy)
l <- levels(yy)
yy <- as.numeric(yy)
f <- round(fill_NA_(cbind(yy, xx), model, 1, 2:(ncol(xx) + 1), ww, ridge))
f[f <= 0] <- 1
f[f > length(l)] <- length(l)
ff <- l[f]
} else if (is_numeric_y) {
yy <- as.numeric(yy)
ff <- fill_NA_(cbind(yy, xx), model, 1, 2:(ncol(xx) + 1), ww, ridge)
if (logreg && (model != "lda")) {
ff <- exp(ff)
}
}
return(as.vector(ff))
}
#' @describeIn fill_NA S3 method for matrix
fill_NA.matrix <- function(x, model, posit_y, posit_x, w = NULL, logreg = FALSE, ridge = 1e-6) {
ww <- if (is.null(w)) vector() else w
all_pos_y <- !any(x[, posit_y] < 0, na.rm = TRUE)
logreg_con <- logreg && all_pos_y && (model != "lda")
if (logreg_con) {
x[, posit_y] <- log(x[, posit_y] + 1e-8)
}
ff <- fill_NA_(x, model, posit_y, posit_x, ww, ridge)
if (logreg_con) {
ff <- exp(ff)
}
return(as.vector(ff))
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.