#' Impute Missing Values
#'
#' @description
#' Missing values are imputed with some representative values and
#' statistical methods.
#'
#' @details
#' imputate_na() creates an imputation class.
#' The `imputation` class includes missing value position, imputed value,
#' and method of missing value imputation, etc.
#' The `imputation` class compares the imputed value with the original value
#' to help determine whether the imputed value is used in the analysis.
#'
#' See vignette("transformation") for an introduction to these concepts.
#'
#' @param .data a data.frame or a \code{\link{tbl_df}}.
#' @param xvar variable name to replace missing value.
#' @param yvar target variable.
#' @param method method of missing values imputation.
#' @param seed integer. the random seed used in mice. only used "mice" method.
#' @param print_flag logical. If TRUE, mice will print running log on console.
#' Use print_flag=FALSE for silent computation. Used only when method is "mice".
#' @param no_attrs logical. If TRUE, return numerical variable or categorical variable.
#' else If FALSE, imputation class.
#' @return An object of imputation class. or numerical variable or categorical variable.
#' if no_attrs is FALSE then return imputation class, else no_attrs is TRUE then return
#' numerical vector or factor.
#' Attributes of imputation class is as follows.
#' \itemize{
#' \item var_type : the data type of predictor to replace missing value.
#' \item method : method of missing value imputation.
#' \itemize{
#' \item predictor is numerical variable.
#' \itemize{
#' \item "mean" : arithmetic mean.
#' \item "median" : median.
#' \item "mode" : mode.
#' \item "knn" : K-nearest neighbors.
#' \item "rpart" : Recursive Partitioning and Regression Trees.
#' \item "mice" : Multivariate Imputation by Chained Equations.
#' }
#' \item predictor is categorical variable.
#' \itemize{
#' \item "mode" : mode.
#' \item "rpart" : Recursive Partitioning and Regression Trees.
#' \item "mice" : Multivariate Imputation by Chained Equations.
#' }
#' }
#' \item na_pos : position of missing value in predictor.
#' \item seed : the random seed used in mice. only used "mice" method.
#' \item type : "missing values". type of imputation.
#' \item message : a message tells you if the result was successful.
#' \item success : Whether the imputation was successful.
#' }
#' @seealso \code{\link{imputate_outlier}}.
#' @examples
#' # Generate data for the example
#' heartfailure2 <- heartfailure
#' heartfailure2[sample(seq(NROW(heartfailure2)), 20), "platelets"] <- NA
#' heartfailure2[sample(seq(NROW(heartfailure2)), 5), "smoking"] <- NA
#'
#' # Replace the missing value of the platelets variable with median
#' imputate_na(heartfailure2, platelets, method = "median")
#'
#' # Replace the missing value of the platelets variable with rpart
#' # The target variable is death_event.
#' # Require rpart package
#' imputate_na(heartfailure2, platelets, death_event, method = "rpart")
#'
#' # Replace the missing value of the smoking variable with mode
#' imputate_na(heartfailure2, smoking, method = "mode")
#'
#' ## using dplyr -------------------------------------
#' library(dplyr)
#'
#' # The mean before and after the imputation of the platelets variable
#' heartfailure2 %>%
#' mutate(platelets_imp = imputate_na(heartfailure2, platelets, death_event,
#' method = "knn", no_attrs = TRUE)) %>%
#' group_by(death_event) %>%
#' summarise(orig = mean(platelets, na.rm = TRUE),
#' imputation = mean(platelets_imp))
#'
#' # If the variable of interest is a numerical variable
#' # Require rpart package
#' platelets <- imputate_na(heartfailure2, platelets, death_event, method = "rpart")
#' platelets
#'
#' @export
#'
imputate_na <- function(.data, xvar, yvar, method, seed, print_flag, no_attrs) {
UseMethod("imputate_na")
}
#' @method imputate_na data.frame
#' @importFrom tidyselect vars_select
#' @importFrom rlang enquo
#' @export
imputate_na.data.frame <- function(.data, xvar, yvar = NULL,
method = c("mean", "median", "mode", "rpart", "knn", "mice"), seed = NULL,
print_flag = TRUE, no_attrs = FALSE) {
tryCatch(vars <- tidyselect::vars_select(names(.data), !! rlang::enquo(xvar)),
error = function(e) {
pram <- as.character(substitute(xvar))
stop(sprintf("Column %s is unknown", pram))
}, finally = NULL)
tryCatch(target <- tidyselect::vars_select(names(.data), !! rlang::enquo(yvar)),
error = function(e) {
pram <- as.character(substitute(yvar))
stop(sprintf("Column %s is unknown", pram))
}, finally = NULL)
method <- match.arg(method)
imputate_na_impl(.data, vars, target, method, seed, print_flag, no_attrs)
}
#' @import tibble
#' @import dplyr
#' @importFrom stats predict
#' @importFrom methods is
imputate_na_impl <- function(df, xvar, yvar, method, seed = NULL,
print_flag = TRUE, no_attrs = FALSE) {
type <- ""
if (is(pull(df, xvar))[1] %in% c("integer", "numeric")) {
type <- "numerical"
} else if (is(pull(df, xvar))[1] %in% c("factor", "ordered")) {
if (method %in% c("mean", "median", "knn")) {
stop(sprintf("Categorical variable(%s) not support %s method",
xvar, method))
}
type <- "categorical"
}
data <- pull(df, xvar)
na_pos <- which(is.na(data))
na_flag <- length(na_pos) > 0
if (!na_flag) {
warning(sprintf("There are no missing values in %s.", xvar))
}
get_mean <- function() {
ifelse(is.na(data), mean(data, na.rm = TRUE), data)
}
get_median <- function() {
ifelse(is.na(data), median(data, na.rm = TRUE), data)
}
get_mode <- function() {
tab <- table(data)
if (length(tab) == 0) {
return(rep(NA, length(data)))
}
if (type == "numerical")
mode_value <- as.numeric(names(tab)[which.max(tab)])
else if (type == "categorical") {
mode_value <- levels(data)[which.max(tab)]
}
data[is.na(data)] <- mode_value
data
}
get_knn <- function(x, y) {
complete_order <- names(sort(apply(df[, setdiff(names(df), y)], 2,
function(x) sum(complete.cases(x)) / length(x))))
complete_cnt <- length(which(complete.cases(df)))
while(complete_cnt <= 10) {
complete_order <- complete_order[-1]
complete_cnt <- length(which(complete.cases(df[, complete_order])))
}
if (!x %in% complete_order) {
return(rep(NA, length(data)))
}
impute <- imputation_knn(df[, complete_order])
pred <- impute[, x]
ifelse(is.na(data), pred, data)
}
get_rpart <- function(x, y) {
if (type == "numerical") {
method <- "anova"
pred_type <- "vector"
} else if (type == "categorical") {
method <- "class"
pred_type <- "class"
}
complete_flag <- apply(df, 2, function(x) sum(complete.cases(x)) != 0)
complete_flag <- names(complete_flag[complete_flag])
if (!x %in% complete_flag) {
return(rep(NA, length(data)))
}
if (requireNamespace("rpart", quietly = TRUE)) {
model <- rpart::rpart(sprintf("`%s` ~ .", x),
data = df[!is.na(pull(df, x)), setdiff(intersect(names(df), complete_flag), y)],
method = method, na.action = na.omit)
} else {
warning("Package 'rpart' needed for this function to work. Please install it.",
call. = FALSE)
return(NULL)
}
pred <- predict(model, df[is.na(pull(df, x)), !names(df) %in% y],
type = pred_type)
data[is.na(data)] <- pred
data
}
get_mice <- function(x, y, seed = NULL, print_flag = TRUE) {
if (is.null(seed))
seed <<- sample(seq(1e5), size = 1)
if (!na_flag) {
data <- pull(df, x)
} else {
suppressWarnings(RNGversion("3.5.0"))
set.seed(seed = seed)
if (requireNamespace("mice", quietly = TRUE)) {
model <- mice::mice(df[, !names(df) %in% y], method = "rf", printFlag = print_flag)
} else {
warning("Package 'mice' needed for this function to work. Please install it.",
call. = FALSE)
return(NULL)
}
if (all(is.na(model$imp[[x]]))) {
return(rep(NA, length(data)))
}
if (type == "numerical") {
pred <- apply(model$imp[[x]], 1, mean)
data[is.na(data)] <- pred
} else if (type == "categorical") {
pred <- apply(model$imp[[x]], 1,
function(x) unique(x)[which.max(table(x))])
data[is.na(data)] <- pred
}
}
data
}
if (method == "mean")
result <- get_mean()
else if (method == "median")
result <- get_median()
else if (method == "mode")
result <- get_mode()
else if (method == "knn")
result <- get_knn(xvar, yvar)
else if (method == "rpart")
result <- get_rpart(xvar, yvar)
else if (method == "mice")
result <- get_mice(xvar, yvar, seed, print_flag)
if (!no_attrs) {
attr(result, "var_type") <- type
attr(result, "method") <- method
attr(result, "na_pos") <- na_pos
attr(result, "seed") <- seed
attr(result, "type") <- "missing values"
if (all(is.na(result))) {
msg <- "All values returned as NA. The data is not good enough for a imputation."
warning(msg)
attr(result, "message") <- msg
attr(result, "success") <- FALSE
} else {
attr(result, "message") <- "complete imputation"
attr(result, "success") <- TRUE
}
class(result) <- append("imputation", class(result))
}
result
}
#' Impute Outliers
#'
#' @description
#' Outliers are imputed with some representative values and statistical methods.
#'
#' @details
#' imputate_outlier() creates an imputation class.
#' The `imputation` class includes missing value position, imputed value,
#' and method of missing value imputation, etc.
#' The `imputation` class compares the imputed value with the original value
#' to help determine whether the imputed value is used in the analysis.
#'
#' See vignette("transformation") for an introduction to these concepts.
#'
#' @param .data a data.frame or a \code{\link{tbl_df}}.
#' @param xvar variable name to replace missing value.
#' @param method method of missing values imputation.
#' @param cap_ntiles numeric. Only used when method is "capping".
#' Specifies the value of percentiles replaced by the values of lower outliers
#' and upper outliers. The default is c(0.05, 0.95).
#' @param no_attrs logical. If TRUE, return numerical variable or categorical variable.
#' else If FALSE, imputation class.
#' @return An object of imputation class. or numerical variable.
#' if no_attrs is FALSE then return imputation class, else no_attrs is TRUE then return
#' numerical vector.
#' Attributes of imputation class is as follows.
#' \itemize{
#' \item method : method of missing value imputation.
#' \itemize{
#' \item predictor is numerical variable
#' \itemize{
#' \item "mean" : arithmetic mean
#' \item "median" : median
#' \item "mode" : mode
#' \item "capping" : Impute the upper outliers with 95 percentile,
#' and Impute the lower outliers with 5 percentile.
#' \itemize{
#' \item You can change this criterion with the cap_ntiles argument.
#' }
#' }
#' }
#' \item outlier_pos : position of outliers in predictor.
#' \item outliers : outliers. outliers corresponding to outlier_pos.
#' \item type : "outliers". type of imputation.
#' }
#' @seealso \code{\link{imputate_na}}.
#' @examples
#' \donttest{
#' # Replace the outliers of the sodium variable with median.
#' imputate_outlier(heartfailure, sodium, method = "median")
#'
#' # Replace the outliers of the sodium variable with capping.
#' imputate_outlier(heartfailure, sodium, method = "capping")
#' imputate_outlier(heartfailure, sodium, method = "capping",
#' cap_ntiles = c(0.1, 0.9))
#'
#' ## using dplyr -------------------------------------
#' library(dplyr)
#'
#' # The mean before and after the imputation of the sodium variable
#' heartfailure %>%
#' mutate(sodium_imp = imputate_outlier(heartfailure, sodium,
#' method = "capping", no_attrs = TRUE)) %>%
#' group_by(death_event) %>%
#' summarise(orig = mean(sodium, na.rm = TRUE),
#' imputation = mean(sodium_imp, na.rm = TRUE))
#'
#' # If the variable of interest is a numerical variables
#' sodium <- imputate_outlier(heartfailure, sodium)
#' sodium
#' summary(sodium)
#'
#' plot(sodium)
#' }
#'
#' @export
imputate_outlier <- function(.data, xvar, method, no_attrs, cap_ntiles) {
UseMethod("imputate_outlier")
}
#' @method imputate_outlier data.frame
#' @importFrom tidyselect vars_select
#' @importFrom rlang enquo
#' @export
imputate_outlier.data.frame <- function(.data, xvar,
method = c("capping", "mean", "median", "mode"), no_attrs = FALSE,
cap_ntiles = c(0.05, 0.95)) {
tryCatch(vars <- tidyselect::vars_select(names(.data), !! rlang::enquo(xvar)),
error = function(e) {
pram <- as.character(substitute(xvar))
stop(sprintf("Column %s is unknown", pram))
}, finally = NULL)
method <- match.arg(method)
imputate_outlier_impl(.data, vars, method, no_attrs, cap_ntiles)
}
#' @import dplyr
#' @importFrom methods is
#' @importFrom grDevices boxplot.stats
imputate_outlier_impl <- function(df, xvar, method, no_attrs = FALSE,
cap_ntiles = c(0.05, 0.95)) {
if (!is(pull(df, xvar))[1] %in% c("integer", "numeric")) {
stop(sprintf("Categorical variable(%s) not support imputate_outlier()",
xvar))
}
if (method %in% "capping" & length(cap_ntiles) != 2) {
stop("The cap_ntiles argument must be a numeric vector of length 2.")
}
data <- pull(df, xvar)
outliers <- boxplot.stats(data)$out
outlier_pos <- which(data %in% outliers)
outliers <- data[outlier_pos]
outlier_flag <- length(outlier_pos) > 0
get_mean <- function(x) {
data[outlier_pos] <- mean(data, na.rm = TRUE)
data
}
get_median <- function() {
data[outlier_pos] <- median(data, na.rm = TRUE)
data
}
get_mode <- function() {
tab <- table(data)
mode_value <- as.numeric(names(tab)[which.max(tab)])
data[outlier_pos] <- mode_value
data
}
get_capping <- function(cap_ntiles = c(0.05, 0.95)) {
hinges <- quantile(data, probs = c(0.25, 0.75), na.rm = TRUE)
caps <- quantile(data, probs = cap_ntiles, na.rm = TRUE)
whisker <- 1.5 * diff(hinges)
data[data < (hinges[1] - whisker)] <- caps[1]
data[data > (hinges[2] + whisker)] <- caps[2]
data
}
if (method == "mean")
result <- get_mean()
else if (method == "median")
result <- get_median()
else if (method == "mode")
result <- get_mode()
else if (method == "capping")
result <- get_capping(cap_ntiles)
if (!no_attrs) {
attr(result, "method") <- method
attr(result, "var_type") <- "numerical"
attr(result, "outlier_pos") <- outlier_pos
attr(result, "outliers") <- outliers
attr(result, "type") <- "outliers"
if (!outlier_flag) {
msg <- sprintf("There are no outliers in %s.", xvar)
warning(msg)
attr(result, "message") <- msg
attr(result, "success") <- FALSE
} else {
attr(result, "message") <- "complete imputation"
attr(result, "success") <- TRUE
}
class(result) <- append("imputation", class(result))
}
result
}
#' Summarizing imputation information
#'
#' @description print and summary method for "imputation" class.
#' @param object an object of class "imputation", usually, a result of a call to imputate_na() or
#' imputate_outlier().
#' @param ... further arguments passed to or from other methods.
#' @details
#' summary.imputation() tries to be smart about formatting two kinds of imputation.
#'
#' @seealso \code{\link{imputate_na}}, \code{\link{imputate_outlier}}, \code{\link{summary.imputation}}.
#' @examples
#' # Generate data for the example
#' heartfailure2 <- heartfailure
#' heartfailure2[sample(seq(NROW(heartfailure2)), 20), "platelets"] <- NA
#' heartfailure2[sample(seq(NROW(heartfailure2)), 5), "smoking"] <- NA
#'
#' # Impute missing values -----------------------------
#' # If the variable of interest is a numerical variables
#' platelets <- imputate_na(heartfailure2, platelets, yvar = death_event, method = "rpart")
#' summary(platelets)
#'
#' # If the variable of interest is a categorical variables
#' smoking <- imputate_na(heartfailure2, smoking, yvar = death_event, method = "rpart")
#' summary(smoking)
#'
#' # Impute outliers ----------------------------------
#' # If the variable of interest is a numerical variable
#' platelets <- imputate_outlier(heartfailure2, platelets, method = "capping")
#' summary(platelets)
#' @method summary imputation
#' @importFrom tidyr gather
#' @export
summary.imputation <- function(object, ...) {
success <- attr(object, "success")
if (!success) {
message("imputation object isn't success.")
return()
}
type <- attr(object, "type")
method <- attr(object, "method")
var_type <- attr(object, "var_type")
original <- object
if (type == "missing values") {
na_pos <- attr(object, "na_pos")
seed <- attr(object, "seed")
original[na_pos] <- NA
} else if (type == "outliers") {
outlier_pos <- attr(object, "outlier_pos")
outliers <- attr(object, "outliers")
original[outlier_pos] <- outliers
}
if (var_type == "numerical") {
original <- as.numeric(original)
object <- as.numeric(object)
} else if (var_type == "categorical") {
original <- factor(original)
object <- factor(object)
}
dframe <- data.frame(original = original,
imputation = object) %>%
tidyr::gather()
if (var_type == "numerical") {
smmry <- dframe %>%
group_by(key) %>%
describe("value") %>%
select(which(!names(.) %in% c("variable", "key"))) %>%
t
smmry <- smmry[, 2:1]
colnames(smmry) <- c("Original", "Imputation")
} else if (var_type == "categorical") {
tab_freq <- xtabs(~ value + key, dframe, addNA = TRUE)
tab_relat <- round(prop.table(tab_freq, 2) * 100, 2)
smmry <- cbind(tab_freq, tab_relat)
smmry <- smmry[, c(2, 1, 4, 3)]
colnames(smmry) <- c("original", "imputation",
"original_percent", "imputation_percent")
}
if (method %in% c("mean", "median", "mode", "capping")) {
cat(sprintf("Impute %s with %s\n\n", type, method))
} else if (method %in% c("knn", "rpart", "mice")) {
if (method == "knn") {
met <- "K-Nearest Neighbors"
met <- sprintf("%s\n - method : knn", met)
} else if (method == "rpart") {
met <- "Recursive Partitioning and Regression Trees"
met <- sprintf("%s\n - method : rpart", met)
} else if (method == "mice") {
met <- "Multivariate Imputation by Chained Equations"
met <- sprintf("%s\n - method : mice", met)
met <- sprintf("%s\n - random seed : %s", met, seed)
}
cat(sprintf("* Impute %s based on %s\n\n", type, met))
}
cat("* Information of Imputation (before vs after)\n")
print(smmry)
invisible(smmry)
}
#' Visualize Information for an "imputation" Object
#'
#' @description
#' Visualize two kinds of plot by attribute of `imputation` class.
#' The imputation of a numerical variable is a density plot,
#' and the imputation of a categorical variable is a bar plot.
#'
#' @details The base_family is selected from "Roboto Condensed", "Liberation Sans Narrow",
#' "NanumSquare", "Noto Sans Korean". If you want to use a different font,
#' use it after loading the Google font with import_google_font().
#'
#' @param x an object of class "imputation", usually, a result of a call to imputate_na()
#' or imputate_outlier().
#' @param typographic logical. Whether to apply focuses on typographic elements to ggplot2 visualization.
#' The default is TRUE. if TRUE provides a base theme that focuses on typographic elements using hrbrthemes package.
#' @param base_family character. The name of the base font family to use
#' for the visualization. If not specified, the font defined in dlookr is applied. (See details)
#' @param ... arguments to be passed to methods, such as graphical parameters (see par).
#' only applies when the model argument is TRUE, and is used for ... of the plot.lm() function.
#' @return A ggplot2 object.
#' @seealso \code{\link{imputate_na}}, \code{\link{imputate_outlier}}, \code{\link{summary.imputation}}.
#' @examples
#' # Generate data for the example
#' heartfailure2 <- heartfailure
#' heartfailure2[sample(seq(NROW(heartfailure2)), 20), "platelets"] <- NA
#' heartfailure2[sample(seq(NROW(heartfailure2)), 5), "smoking"] <- NA
#'
#' # Impute missing values -----------------------------
#' # If the variable of interest is a numerical variables
#' platelets <- imputate_na(heartfailure2, platelets, yvar = death_event, method = "rpart")
#' plot(platelets)
#'
#' # If the variable of interest is a categorical variables
#' smoking <- imputate_na(heartfailure2, smoking, yvar = death_event, method = "rpart")
#' plot(smoking)
#'
#' # Impute outliers ----------------------------------
#' # If the variable of interest is a numerical variable
#' platelets <- imputate_outlier(heartfailure2, platelets, method = "capping")
#' plot(platelets)
#' @method plot imputation
#' @import ggplot2
#' @import hrbrthemes
#' @importFrom tidyr gather
#' @export
plot.imputation <- function(x, typographic = TRUE, base_family = NULL, ...) {
type <- attr(x, "type")
var_type <- attr(x, "var_type")
method <- attr(x, "method")
original <- x
if (type == "missing values") {
na_pos <- attr(x, "na_pos")
seed <- attr(x, "seed")
original[na_pos] <- NA
} else if (type == "outliers") {
outlier_pos <- attr(x, "outlier_pos")
outliers <- attr(x, "outliers")
original[outlier_pos] <- outliers
}
if (method == "mice") {
method <- sprintf("%s (seed = %s)", method, seed)
}
if (var_type == "numerical") {
suppressWarnings({p <- data.frame(original = original, imputation = x) %>%
tidyr::gather() %>%
ggplot(aes(x = value, color = key)) +
geom_density(na.rm = TRUE) +
labs(title = sprintf("imputation method : %s", method))}) +
theme_grey(base_family = base_family)
if (typographic) {
p <- p +
theme_typographic(base_family) +
scale_color_ipsum() +
theme(
axis.title.x = element_text(size = 13),
axis.title.y = element_text(size = 13)
)
}
suppressWarnings(p)
} else if (var_type == "categorical") {
suppressWarnings({p <- data.frame(original = original, imputation = x) %>%
tidyr::gather() %>%
ggplot(aes(x = value, fill = key)) +
geom_bar(position = "dodge") +
labs(title = sprintf("imputation method : %s", method),
x = "level", y = "frequency")}) +
theme_grey(base_family = base_family)
if (typographic) {
p <- p +
theme_typographic(base_family) +
scale_fill_ipsum() +
theme(
axis.title.x = element_text(size = 13),
axis.title.y = element_text(size = 13)
)
}
suppressWarnings(p)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.