# dens_comp
# @description Some imputation methods come with their own density-comparison
# functions; here we define a single function that can be used with every
# method.
# @param df_comp One dataset.
# @param df_imp Another dataset with the same columns as \code{df_comp}.
# @return Density plots comparing the two datasets, one per column.
# dens_comp <- function(df_comp, df_imp) {
# ls_col_name <- colnames(df_comp)
# ls_p <- list()
# for (ycol in ls_col_name) {
# dat <- data.frame(
# "y" = c(df_comp[[ycol]], df_imp[[ycol]]),
# "lines" = rep(c("complete", "imputed"), each = nrow(df_comp))
# )
# p <- ggplot2::ggplot(dat, ggplot2::aes(x = y, fill = lines)) +
# ggplot2::geom_density(alpha = 0.3) +
# ggplot2::labs(x = ycol)
# print(p)
# }
# }
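# Example call (kept as a comment because dens_comp itself is commented out);
# df_complete and df_imputed are hypothetical data frames that share the same
# column names:
# dens_comp(df_complete, df_imputed)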
#' List of MSE
#' @description \code{ls_MSE} returns a list of MSE values, one per imputed
#' dataset in the given list.
#' \code{resample_method} is needed because with the 'bootstrap' method the
#' imputed datasets may contain repeated rows, and with both 'jackknife' and
#' 'bootstrap' they may not cover all the rows.
#' To give every variable the same weight, each variable is also rescaled with
#' the minimum and range (max - min) computed from the complete dataset.
#'
#' If the complete and imputed datasets are of mixed type, only the numerical
#' columns are taken into account.
#' @param df_comp The original complete dataset.
#' @param ls_df_imp List of imputed datasets.
#' @param mask Mask of missingness (1 means missing value and 0 means
#' observed value).
#' @param resample_method Resampling method used to generate the imputed
#' datasets: 'bootstrap' (default), 'jackknife', or 'none'.
#' @param col_num_comp Indices of the numerical columns in the complete
#' dataset.
#' @export
#' @return \code{list_MSE} List of MSE corresponding to the given list of
#' imputed datasets.
#' @return \code{Mean_MSE} Mean value of MSE.
#' @return \code{Variance_MSE} Variance of MSE.
#' @return \code{list_MSE_scale} List of scaled MSE corresponding to the given
#' list of imputed datasets. Before the MSE is computed, the imputed and
#' complete datasets are both Min-Max scaled using parameters estimated from
#' the complete dataset.
#' @return \code{Mean_MSE_scale} Mean value of scaled MSE.
#' @return \code{Variance_MSE_scale} Variance of scaled MSE.
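#' @examples
#' # A hedged toy sketch (hypothetical data, not taken from the package): two
#' # numeric columns, a mask flagging the artificially missing cells, and two
#' # "imputed" copies of the data (a mean imputation and a perfect one).
#' \donttest{
#' set.seed(42)
#' df_comp <- data.frame(x = rnorm(20), y = rnorm(20))
#' mask <- data.frame(x = c(rep(1, 5), rep(0, 15)), y = c(rep(0, 15), rep(1, 5)))
#' imp1 <- df_comp
#' imp1$x[mask$x == 1] <- mean(df_comp$x)
#' imp1$y[mask$y == 1] <- mean(df_comp$y)
#' ls_MSE(df_comp, list(imp1, df_comp), mask,
#'   col_num_comp = 1:2, resample_method = "none"
#' )
#' }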
ls_MSE <- function(df_comp,
ls_df_imp,
mask,
col_num_comp,
resample_method = "bootstrap") {
resample_method <- match.arg(resample_method, c(
"bootstrap",
"jackknife", "none"
))
ls_mse_result <- c()
ls_mse_result_scale <- c()
mask_num <- mask[, col_num_comp] * 1
df_comp_num <- df_comp[, col_num_comp]
col_name_num <- colnames(df_comp_num)
n_sample <- length(ls_df_imp)
i <- 1
for (df_imp in ls_df_imp) {
df_imp_num <- df_imp[col_name_num]
if (resample_method == "bootstrap") {
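      # With bootstrap resampling the imputed data can contain duplicated rows;
      # R names such duplicates with fractional row names ("3.1", "3.2", ...),
      # so flooring the numeric row name recovers the original row index and
      # the duplicates are then averaged per index below.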
df_imp_num[["index"]] <- as.numeric(row.names(df_imp_num))
df_imp_num$index <- floor(df_imp_num$index)
df_num <- stats::aggregate(. ~ index, data = df_imp_num[
c("index", col_name_num)
], mean)
df_imp_i <- df_num[col_name_num]
df_comp_i <- df_comp_num[df_num$index, ]
mask_num_i <- mask_num[df_num$index, ]
} else if (resample_method == "jackknife") {
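      # With jackknife resampling rows are left out rather than duplicated, so
      # the row names are the original row indices and no aggregation is needed.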
df_imp_num[["index"]] <- as.numeric(row.names(df_imp_num))
df_imp_i <- df_imp_num[col_name_num]
df_comp_i <- df_comp_num[df_imp_num$index, ]
mask_num_i <- mask_num[df_imp_num$index, ]
} else {
df_imp_i <- df_imp_num
df_comp_i <- df_comp_num
mask_num_i <- mask_num
}
mse_result <- sum((df_comp_i - df_imp_i)^2) / sum(mask_num_i)
ls_mse_result[i] <- mse_result
    # Min-Max scale each variable with parameters (column minimum and range)
    # estimated from the corresponding complete dataset. sweep() applies each
    # parameter column-wise; subtracting the plain vector from the data frame
    # would instead recycle it in column-major order.
    min_comp <- apply(df_comp_i, 2, min)
    max_min_comp <- apply(df_comp_i, 2, function(x) {
      max(x) - min(x)
    })
    df_comp_i_scale <- sweep(
      sweep(as.matrix(df_comp_i), 2, min_comp, "-"), 2, max_min_comp, "/"
    )
    df_imp_i_scale <- sweep(
      sweep(as.matrix(df_imp_i), 2, min_comp, "-"), 2, max_min_comp, "/"
    )
mse_result_scale <- sum((df_comp_i_scale - df_imp_i_scale)^2) / sum(
mask_num_i
)
ls_mse_result_scale[i] <- mse_result_scale
i <- i + 1
}
return(list(
list_MSE = ls_mse_result,
Mean_MSE = mean(ls_mse_result),
    Variance_MSE = stats::var(ls_mse_result),
list_MSE_scale = ls_mse_result_scale,
Mean_MSE_scale = mean(ls_mse_result_scale),
    Variance_MSE_scale = stats::var(ls_mse_result_scale)
))
}
# F1
# Input: original dataset, list of imputed datasets, mask of missingness, and
# the column indices of the categorical columns
# Output: a list of F1-scores (one per imputation result), the average
# F1-score, and the variance of the F1-scores
#' List of F1-Scores
#' @description \code{ls_F1} is a function that returns a list of F1-scores,
#' one per imputed dataset in the given list.
#' \code{resample_method} is needed because with the 'bootstrap' method the
#' imputed datasets may contain repeated rows,
#' and with both 'jackknife' and 'bootstrap' they may not cover all the rows.
#' @param df_comp The original complete dataset.
#' @param ls_df_imp List of imputed datasets.
#' @param mask Mask of missingness (1 means missing value and 0 means observed
#' value).
#' @param col_cat_comp Indices of categorical columns in the complete dataset.
#' @param col_cat_imp Indices of categorical columns in the imputed dataset.
#' @param resample_method Resampling method used to generate the imputed
#' datasets: 'bootstrap' (default), 'jackknife', or 'none'.
#' @param combine_method When \code{resample_method} = 'bootstrap',
#' \code{combine_method} can be 'factor' or 'onehot'.
#' When \code{combine_method} = 'onehot', \code{ls_F1} averages the one-hot
#' probability vectors for each observation and then chooses the position of
#' maximum probability as the predicted category. When \code{combine_method} =
#' 'factor', for each observation \code{ls_F1} chooses the mode over the
#' imputed dataframes as the predicted category.
#' @param dict_cat The dictionary of categorical column names when the "onehot"
#' method is applied. For example, it could be list("Y7"=c("Y7_1","Y7_2"),
#' "Y8"=c("Y8_1","Y8_2","Y8_3")).
#' @export
#' @return \code{list_F1} List of F1 corresponding to the given list of
#' imputed datasets.
#' @return \code{Mean_F1} Mean value of F1.
#' @return \code{Variance_F1} Variance of F1.
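#' @examples
#' # A hedged toy sketch (hypothetical data): two factor columns, imputed
#' # copies kept in factor form, so resample_method = "none" and
#' # combine_method = "factor"; dict_cat is not needed in this case.
#' \donttest{
#' df_comp <- data.frame(
#'   g1 = factor(c("a", "b", "a", "b", "a", "b")),
#'   g2 = factor(c("x", "x", "y", "y", "x", "y"))
#' )
#' mask <- data.frame(g1 = c(1, 1, 0, 0, 0, 0), g2 = c(0, 0, 1, 1, 0, 0))
#' imp1 <- df_comp
#' imp1$g1[1] <- "b" # one wrong categorical imputation
#' ls_F1(df_comp, list(imp1, df_comp), mask,
#'   col_cat_comp = 1:2, col_cat_imp = 1:2,
#'   resample_method = "none", combine_method = "factor"
#' )
#' }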
ls_F1 <- function(df_comp,
ls_df_imp,
mask,
col_cat_comp,
col_cat_imp,
resample_method = "bootstrap",
combine_method = "onehot",
dict_cat = NULL) {
resample_method <- match.arg(resample_method, c(
"bootstrap",
"jackknife", "none"
))
combine_method <- match.arg(combine_method, c("onehot", "factor"))
# if (resample_method == "jackknife" && is.null(df_imp_full)) {
# stop("With jackknife resampling method, df_imp_full is required.\n")
# }
if (resample_method == "jackknife" && combine_method == "factor") {
stop("With jackknife resampling method, combine_method could only be
'onehot'.\n")
}
if (resample_method != "none" && combine_method == "onehot" && is.null(
dict_cat
)) {
stop("With onehot combining method, dict_cat is needed.\n")
}
dict_lev <- dict_level(df_comp, col_cat_comp)
ls_f1_result <- c()
i <- 1
n_sample <- length(ls_df_imp)
for (df_imp in ls_df_imp) {
# Take only the categorical part
df_imp_cat <- df_imp[, col_cat_imp]
col_name_cat <- colnames(df_imp_cat)
if (resample_method == "bootstrap") {
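      # As in ls_MSE, bootstrap duplicates carry fractional row names
      # ("3.1", "3.2", ...); flooring recovers the original row index.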
df_imp_cat[["index"]] <- as.numeric(row.names(df_imp_cat))
df_imp_cat$index <- floor(df_imp_cat$index)
if (combine_method == "onehot") { # df_imp_cat in form of onehot
# take average of the onehot probabilities for the doublons
df_cat <- stats::aggregate(. ~ index, data = df_imp_cat[
c("index", col_name_cat)
], mean)
# convert onehot to factor form
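        # which_max_cat (internal helper) is assumed to return the name of the
        # one-hot column with the largest averaged probability (e.g. "Y7_2");
        # the strsplit below strips the "Y7_" prefix to recover the level.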
names_cat <- names(dict_cat)
for (name in names_cat) {
df_cat[[name]] <- apply(
df_cat[dict_cat[[name]]], 1,
which_max_cat, name, dict_cat
)
df_cat[[name]] <- unlist(df_cat[[name]])
df_cat[[name]] <- apply(as.array(df_cat[[name]]), 1, function(x) {
unlist(strsplit(x, "_"))[-1]
})
df_cat[[name]] <- factor(df_cat[[name]])
}
df_imp_i <- df_cat[names_cat]
df_comp_i <- df_comp[df_cat$index, ][names_cat]
mask_cat_i <- mask[df_cat$index, ][names_cat] * 1
} else { # df_imp_cat in form of factor
df_cat <- df_imp_cat[c("index", col_name_cat)] %>%
dplyr::group_by(.data$index) %>%
dplyr::summarise(dplyr::across(dplyr::all_of(col_name_cat), Mode_cat))
names_cat <- names(dict_lev)
for (name in names_cat) {
df_cat[[name]] <- apply(as.array(df_cat[[name]]), 1, function(x) {
unlist(strsplit(x, "_"))[-1]
})
df_cat[[name]] <- factor(df_cat[[name]])
}
df_imp_i <- df_cat[col_name_cat]
df_comp_i <- df_comp[df_cat$index, col_cat_comp]
mask_cat_i <- mask[df_cat$index, col_cat_comp] * 1
}
} else if (resample_method == "jackknife") {
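      # With jackknife resampling rows are unique, so no aggregation is needed;
      # the one-hot probabilities are converted back to factors directly.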
# convert onehot to factor form
df_imp_cat[["index"]] <- as.numeric(row.names(df_imp_cat))
names_cat <- names(dict_cat)
for (name in names_cat) {
df_imp_cat[[name]] <- apply(
df_imp_cat[dict_cat[[name]]], 1,
which_max_cat, name, dict_cat
)
df_imp_cat[[name]] <- unlist(df_imp_cat[[name]])
df_imp_cat[[name]] <- apply(
as.array(df_imp_cat[[name]]), 1,
function(x) {
unlist(strsplit(x, "_"))[-1]
}
)
df_imp_cat[[name]] <- factor(df_imp_cat[[name]])
}
df_imp_i <- df_imp_cat[names_cat]
df_comp_i <- df_comp[df_imp_cat$index, ][names_cat]
mask_cat_i <- mask[df_imp_cat$index, ][names_cat] * 1
} else {
df_imp_i <- df_imp_cat
df_comp_i <- df_comp[, col_cat_comp]
mask_cat_i <- data.frame(mask[, col_cat_comp] * 1)
}
    # For each categorical column, flatten to a vector and compute the micro
    # F1 on the masked (missing) cells only; the scores are averaged below.
ls_f1 <- c()
for (col in colnames(df_comp_i)) {
mask_concat <- as.vector(mask_cat_i[[col]])
df_comp_concat <- as.vector(df_comp_i[[col]])
df_comp_true <- df_comp_concat[mask_concat == 1]
df_imp_concat <- as.vector(df_imp_i[[col]])
df_imp_predict <- df_imp_concat[mask_concat == 1]
ls_f1 <- c(ls_f1, F1_Score_micro(df_comp_true, df_imp_predict))
}
# micro f1 score:
# https://sebastianraschka.com/faq/docs/multiclass-metric.html
ls_f1_result[i] <- mean(ls_f1)
i <- i + 1
}
return(list(
list_F1 = ls_f1_result,
Mean_F1 = mean(ls_f1_result),
    Variance_F1 = stats::var(ls_f1_result)
))
}
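# For reference, the micro-averaged F1 used above pools true and false
# positives over all classes, so with single-label predictions it reduces to
# the overall accuracy on the masked cells. A minimal sketch of that formula
# (an illustration only, not necessarily identical to the internal
# F1_Score_micro helper):
# f1_micro_sketch <- function(y_true, y_pred) {
#   tp <- sum(y_true == y_pred) # every correct prediction is a true positive
#   err <- sum(y_true != y_pred) # every error is one FP and one FN
#   2 * tp / (2 * tp + err + err) # equals tp / (tp + err), i.e. accuracy
# }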
# Alternative version: jackknife estimate that also uses the estimate obtained
# on the full incomplete dataset
# ls_MSE_bis <- function(df_comp,
# ls_df_imp,
# df_full,
# mask,
# col_num_comp,
# resample_method = "bootstrap") {
# resample_method <- match.arg(resample_method, c("bootstrap",
# "jackknife", "none"))
# ls_mse_result <- c()
# ls_mse_result_scale <- c()
# mask_num <- mask[, col_num_comp] * 1
# col_names <- colnames(df_comp)
# df_comp_num <- df_comp[, col_num_comp]
# col_name_num <- colnames(df_comp_num)
# df_full_num <- df_full[col_name_num]
# n_sample <- length(ls_df_imp)
# i <- 1
# for (df_imp in ls_df_imp) {
# df_imp_num <- df_imp[col_name_num]
# df_imp_num[["index"]] <- as.numeric(row.names(df_imp_num))
# if (resample_method == "bootstrap") {
# df_imp_num$index <- floor(df_imp_num$index)
# df_num <- stats::aggregate(. ~ index, data = df_imp_num[c("index",
# col_name_num)], mean)
# df_imp_i <- df_num[col_name_num]
# df_comp_i <- df_comp_num[df_num$index, ]
# mask_num_i <- mask_num[df_num$index, ]
# } else if (resample_method == "jackknife") {
# df_full_num$index <- as.numeric(row.names(df_full_num))
# df_imp_i <- n_sample * df_full_num[df_imp_num$index, ] - (
# n_sample - 1) * df_imp_num
# df_imp_i <- df_imp_i[col_name_num]
# df_comp_i <- df_comp_num[df_imp_num$index, ]
# mask_num_i <- mask_num[df_imp_num$index, ]
# } else {
# df_imp_i <- df_imp_num
# df_comp_i <- df_comp_num
# mask_num_i <- mask_num
# }
#
# mse_result <- sum((df_comp_i - df_imp_i)^2) / sum(mask_num_i)
# ls_mse_result[i] <- mse_result
#
# # MinMax Scale each variable based on estimated parameters from
# # corresponding complete dataset
# min_comp <- apply(df_comp_i, 2, min)
# max_min_comp <- apply(df_comp_i, 2, function(x) {
# max(x) - min(x)
# })
# df_comp_i_scale <- as.matrix(df_comp_i - min_comp) / t(
# matrix(rep(max_min_comp, nrow(df_comp_i)), nrow = length(max_min_comp)))
# df_imp_i_scale <- as.matrix(df_imp_i - min_comp) / t(
# matrix(rep(max_min_comp, nrow(df_comp_i)), nrow = length(max_min_comp)))
# mse_result_scale <- sum((df_comp_i_scale - df_imp_i_scale)^2) /
# sum(mask_num_i)
# ls_mse_result_scale[i] <- mse_result_scale
#
# i <- i + 1
# }
# return(list(
# list_MSE = ls_mse_result,
# Mean_MSE = mean(ls_mse_result),
# Variance_MSE = var(ls_mse_result),
# list_MSE_scale = ls_mse_result_scale,
# Mean_MSE_scale = mean(ls_mse_result_scale),
# Variance_MSE_scale = var(ls_mse_result_scale)
# ))
# }