#' Missing Imputation
#'
#' This function completes regression-based missing imputation. The function
#' accepts a data frame containing variables with missing values. The data
#' frame can contain variables with no missing values if you would not like to
#' include them in replacing the values with missing variables. However the
#' data frame must have at least one column with at least one missing value.
#' The function will look for strictly positive values and percentage values so
#' those same patterns hold with replaced values. If you want to override this
#' you need to include at least one observation where that logic is broken. In
#' addition the function can also use a percentage of values to build the
#' predictive models using \code{sample_frac} option. This is to speed up the
#' missing imputations however not all fractions are faster than just using all
#' observations. I would recomend using a value less than .5.
#' @param missing_df A data frame containing at least some columns with missing values
#' @param num_iter Number of iterations to perform
#' @param progress A logical indicator to print the number of completed interactions
#' @param sample_frac A number between 0 and 1 indicating the fraction of observations to use to build the predictive models. Default is 1 which will use all observations.
#' @keywords eda
#' @return A list containing
#'
#' \code{complete_obs} a data frame with the missing values replaced through regression imputation
#'
#' \code{change} The differences between each iteration for each variable
#' @examples
#' iris_df <- iris
#' set.seed(123456)
#' na1 <- runif(nrow(iris_df)) < .25
#' na2 <- runif(nrow(iris_df)) < .33
#' iris_df[["Species"]][na1] <- NA
#' iris_df[["Sepal.Length"]][na2] <- NA
#' iris_complete <- MissingImputation(iris_df, num_iter = 5)
#' difs <- data.frame(Species.Orig = as.character(iris[["Species"]][na1]), Species.Replace = iris_complete$complete_obs[["Species"]][na1], Sepal.Orig = iris[["Sepal.Length"]][na1], Sepal.Replace = iris_complete$complete_obs[["Sepal.Length"]][na1], stringsAsFactors = F)
#' \dontrun{View(difs[[1]], difs[[2]])}
#' \dontrun{ggplot2::qplot(x = Sepal.Orig, y = Sepal.Replace, data = difs)}
MissingImputation <- function(missing_df, num_iter = 10, progress = F, sample_frac = 1){
if(!("data.frame" %in% class(missing_df))) stop("missing_df must be a data frame")
if(num_iter < 1) stop("number of iterations must be strictly positive")
missing_log <- unlist(lapply(missing_df, function(x) any(is.na(x))))
colorder <- names(missing_df)
if(sum(missing_log) == 0) stop("data frame must contain some missing values")
if(all(missing_log)) complete_df <- NULL
if(!all(missing_log)) complete_df <- missing_df[, !missing_log]
missing_df <- missing_df[, missing_log]
factrs <- unlist(lapply(missing_df, is.factor))
if(any(factrs)) missing_df[, factrs] <- lapply(missing_df[, factrs, drop = F], as.character)
positive_values <- unlist(lapply(missing_df, function(x) ifelse(class(x) == "numeric" | class(x) == "integer", all(x[!is.na(x)] >= 0), F)))
pct_values <- unlist(lapply(missing_df, function(x) ifelse(class(x) == "numeric" | class(x) == "integer", all(x[!is.na(x)] <= 1 & x[!is.na(x)] >= 0), F)))
replace_df <- as.data.frame(lapply(missing_df, function(x){
na_log <- is.na(x)
x_na <- sample(x[!na_log], size = sum(na_log), replace = T)
x[na_log] <- x_na
return(x)
}), stringsAsFactors = F)
cols <- ncol(missing_df)
change <- matrix(0, nrow = num_iter, ncol = cols)
colnames(change) <- names(missing_df)
rownames(change) <- 1:num_iter
for(i in seq_len(num_iter)){
if(progress) print(i)
for(j in seq_len(cols)){
na_log <- is.na(missing_df[[j]])
n_unique <- length(unique(missing_df[[j]][!na_log]))
if(is.null(complete_df)) reg_data <- replace_df
if(!is.null(complete_df)) reg_data <- cbind(replace_df, complete_df)
if(class(missing_df[[j]]) == "character"){
reg_data[[names(missing_df)[j]]] <- factor(reg_data[[names(missing_df)[j]]])
level <- levels(reg_data[[names(missing_df)[j]]])
if(n_unique == 1){
replace_df[[j]][na_log] <- unique(missing_df[[j]][!na_log])
} else if(n_unique == 2){
if(sample_frac == 1){
missing_glm <- glm(formula = paste(names(missing_df)[j], "~ ."), data = reg_data, family = "binomial")
preds <- vapply(predict(missing_glm, type = "response"), function(x) sample(level, size = 1, prob = c(1 - x, x)), character(1))
} else {
samp <- runif(nrow(reg_data)) <= sample_frac
reg_matr <- data.frame(model.matrix(formula(paste0(names(missing_df)[j], " ~ .-1")), data = reg_data), stringsAsFactors = F)
reg_matr$Y <- reg_data[[names(missing_df)[j]]]
missing_glm <- glm(Y ~ ., data = reg_matr[samp, ], family = "binomial")
preds <- suppressWarnings(vapply(predict(missing_glm, newdata = reg_matr, type = "response"), function(x) sample(level, size = 1, prob = c(1 - x, x)), character(1)))
}
change[i, j] <- sum(replace_df[[j]][na_log] != preds[na_log])
replace_df[[j]][na_log] <- preds[na_log]
} else {
pred_matr <- matrix(0, nrow = nrow(reg_data), ncol = length(level) - 1)
if(sample_frac < 1){
reg_matr <- data.frame(model.matrix(formula(paste0(names(missing_df)[j], " ~ .-1")), data = reg_data), stringsAsFactors = F)
samp <- runif(nrow(reg_data)) <= sample_frac
}
for(l in seq_len(length(level) - 1)){
if(sample_frac == 1){
reg_data$Y <- (reg_data[[names(missing_df)[j]]] == level[l])*1
factor_glm <- glm(paste0("Y ~ .-", names(missing_df)[j]), data = reg_data, family = "binomial")
pred_matr[, l] <- predict(factor_glm)
} else {
reg_matr$Y <- (reg_data[[names(missing_df)[j]]] == level[l])*1
factor_glm <- glm(Y ~ ., data = reg_matr[samp, ], family = "binomial")
pred_matr[, l] <- suppressWarnings(predict(factor_glm, newdata = reg_matr))
}
}
probs <- cbind(exp(pred_matr), 1) / (1 + rowSums(exp(pred_matr)))
probs[is.na(probs)] <- 1
preds <- apply(probs, 1, function(x) sample(level, size = 1, prob = x/sum(x)))
change[i, j] <- sum(replace_df[[j]][na_log] != preds[na_log])
replace_df[[j]][na_log] <- preds[na_log]
}
} else {
if(n_unique == 1){
replace_df[[j]][na_log] <- unique(missing_df[[j]][!na_log])
} else {
if(sample_frac == 1){
missing_lm <- lm(formula = paste(names(missing_df)[j], "~ ."), data = reg_data)
preds <- rnorm(length(na_log), mean = predict(missing_lm), sd = summary(missing_lm)$sigma)
} else {
samp <- runif(nrow(reg_data)) <= sample_frac
reg_matr <- data.frame(model.matrix(formula(paste0(names(missing_df)[j], " ~ .-1")), data = reg_data), stringsAsFactors = F)
reg_matr$Y <- reg_data[[names(missing_df)[j]]]
missing_lm <- lm(Y ~ ., data = reg_matr[samp, ])
preds <- suppressWarnings(rnorm(length(na_log), mean = predict(missing_lm), sd = summary(missing_lm)$sigma))
}
if(positive_values[j]) preds <- ifelse(preds < 0, 0, preds)
if(pct_values[j]) preds <- ifelse(preds > 1, 1, preds)
change[i, j] <- mean(abs(replace_df[[j]][na_log] - preds[na_log]))
replace_df[[j]][na_log] <- preds[na_log]
}
}
}
}
if(is.null(complete_df)) return(list(complete_obs = replace_df, change_amounts = change))
if(!is.null(complete_df)) return(list(complete_obs = cbind(replace_df, complete_df)[, colorder], change_amounts = change))
}
# load("~/Documents/Data/Magic/sth_attn_1415.rdata")
# complete_df <- sth_attn_1415[, !unlist(lapply(sth_attn_1415, function(x) any(is.na(x))))]
# missing_df <- sth_attn_1415[, unlist(lapply(sth_attn_1415, function(x) any(is.na(x))))]
# complete_df <- complete_df[, c("Section", "row_name", "seat_number", "Seats", "full_price", "ticket_type", "ticket_status", "sold_status", "plan_event_name", "Price_Name")]
# missing_df <- missing_df[, -c(1, 5, 6, 8:11, 13, 15:19, 22:24, 30)]
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.