#' bootsample: create several dataframes by bootstrap resampling
#'
#' @description
#' \code{bootsample} function generates several dataframes by drawing samples of
#' size n by simple random sampling with replacement.
#'
#' In average, if we want to cover all the n rows in the original dataframe,
#' the number of samples should be greater than log(n).
#'
#' In this function, to avoid that some rows are uncovered during resampling
#' process (even if the number of samples is smaller than log(n)),
#' the last bootstrap sample will make sure to cover all those uncovered rows.
#' @param df A complete or incomplete dataframe
#' @param num_sample Number of bootstrapped samples. We suggest that num_sample
#' > Log(n), where n is the number of rows.
#' @return A list of bootstrapped dataframes.
#' @export
#' @references Statistical Analysis with Missing Data, by Little and Rubin, 2002
#' @examples
#' n <- 10000
#' mu.X <- c(1, 2, 3)
#' Sigma.X <- matrix(c(9, 3, 2, 3, 4, 0, 2, 0, 1), nrow = 3)
#' X.complete.cont <- MASS::mvrnorm(n, mu.X, Sigma.X)
#' rs <- generate_miss(X.complete.cont, 0.5, mechanism = "MNAR2")
#' ls_boot <- bootsample(rs$X.incomp, 4)
bootsample <- function(df, num_sample) {
is_gdata_package_installed()
is_rlang_package_installed()
num_row <- nrow(df)
# if (num_sample < log(num_row)) {
# warning(
# "Warning: We suggest that the bootstrap number > log(number of rows),
# if not, all the rows may not be covered in the bootstrap samples.\n"
# )
# }
ls_df_new <- list()
i <- 1
ls_idx <- c()
while (i <= num_sample - 1) {
chosen_idx <- gdata::resample(c(1:num_row), num_row, replace = TRUE)
ls_idx <- c(ls_idx, chosen_idx)
ls_df_new[[i]] <- df[chosen_idx, ]
i <- i + 1
}
# last sample
ls_idx_miss <- c(1:num_row)
ls_idx_miss <- ls_idx_miss[!ls_idx_miss %in% unique(ls_idx)]
chosen_idx <- gdata::resample(c(1:num_row), num_row - length(ls_idx_miss),
replace = TRUE
)
chosen_idx <- c(chosen_idx, ls_idx_miss)
ls_df_new[[i]] <- df[chosen_idx, ]
return(ls_df_new)
}
#' combine_boot: combine imputed bootstrap datasets
#'
#' @description
#' \code{combine_boot} function combines several imputed bootstrapped dataframes
#' into the final imputed dataframe and provide the variance for each imputed
#' value.
#' @param ls_df A list of imputed bootstrapped dataframes.
#' @param col_con Continous columns index.
#' @param col_dis Discret columns index.
#' @param col_cat Categorical columns index.
#' @param num_row_origin Number of rows in the original incomplete dataframe
#' before bootstrapping.
#' @param method The encoded method of categorical columns in the imputed
#' dataframes.
#' This function is coded for both "onehot" and "factor" encoded situations.
#' When \code{method} = 'onehot', \code{combine_boot} averages the probability
#' vectors over the $B$ imputed datasets for the same observation,
#' then choose the position of maximum probability as the predicted category in
#' the final result.
#' When \code{method} = 'factor', for each observation, \code{combine_boot}
#' choose the mode value over the imputed dataframes as the predicted category.
#' @param dict_cat The dictionary of categorical columns names if "onehot"
#' method is applied.
#' For example, it could be list("Y7"=c("Y7_1","Y7_2"),
#' "Y8"=c("Y8_1","Y8_2","Y8_3")).
#' @param var_cat The method of variance calculation for the categorical
#' columns.
#' "unalike" will lead to the calculation of unalikeability, while "wilcox_va"
#' will lead to the calculation of Wilcox index: VarNC.
#' @return \code{df_result_disj} The final imputed dataframe with the
#' categorical columns in onehot form.
#' @return \code{df_result_var_disj} The variance matrix for the final
#' imputation dataframe with the categorical columns in onehot form.
#' @return \code{df_result} The final imputed dataframe with the categorical
#' columns in factor form.
#' @return \code{df_result_var} The variance matrix for the final imputation
#' dataframe with the categorical columns in factor form.
#' @export
#' @importFrom dplyr %>%
#' @importFrom rlang .data
#' @references Statistical Analysis with Missing Data, by Little and Rubin, 2002
combine_boot <- function(ls_df,
col_con,
col_dis = c(),
col_cat = c(),
num_row_origin,
method = "onehot",
dict_cat = NULL,
var_cat = "unalike") {
method <- match.arg(method, c("onehot", "factor"))
var_cat <- match.arg(var_cat, c("unalike", "wilcox_va"))
is_unalike <- (var_cat == "unalike")
is_onehot <- (method == "onehot")
if (is_onehot & is.null(dict_cat)) {
stop("dict_cat is needed when the method is onehot.")
}
ls_df_new <- list()
ls_col_name <- colnames(ls_df[[1]])
col_num <- c(col_con, col_dis)
exist_dis <- !all(c(0, col_dis) == c(0))
if (exist_dis) {
col_name_dis <- ls_col_name[col_dis]
}
exist_cat <- !all(c(0, col_cat) == c(0))
if (exist_cat) {
col_name_cat <- ls_col_name[col_cat]
if (is_onehot) {
col_name_cat <- ls_col_name[col_cat]
# We treat everything as numeric if the categorical variables are encoded
# by onehot probability
col_num <- c(col_num, col_cat)
}
if (is_unalike) {
is_uwo4419_package_installed()
} else {
is_qualvar_package_installed()
}
}
col_name_num <- ls_col_name[col_num]
# Deal with doublets in each imputed dataset
i <- 1
for (df in ls_df) {
df[["index"]] <- as.numeric(row.names(df))
df[["index"]] <- floor(df[["index"]])
df_num <- stats::aggregate(. ~ df[c("index", col_name_num)]$index,
data = df[c("index", col_name_num)], mean
)
df_num <- df_num[-c(1)]
if (exist_cat && !is_onehot) {
df_cat <- df[c("index", col_name_cat)] %>%
dplyr::group_by(.data$index) %>%
dplyr::summarise(dplyr::across(dplyr::all_of(col_name_cat), Mode_cat))
ls_df_new[[i]] <- merge(df_num, df_cat, by = "index")
} else {
ls_df_new[[i]] <- df_num
}
# the expected number of rows is n(1-(n-1)^k/n^k) where k = n
# expectation of unique rows for sampling with replacement
# We need a variance matrix here to pass ?
# No, see https://stats.stackexchange.com/questions/399382/bootstrap-rubins-rules-and-uncertainty-of-sub-estimates
i <- i + 1
}
# Combine all imputed datasets together
df_new_merge <- Reduce(function(dtf1, dtf2) {
rbind(dtf1, dtf2)
}, ls_df_new)
# Add the combined result for categorical variables deducted from the onehot
# result
if (exist_cat && is_onehot) {
names_cat <- names(dict_cat)
for (name in names_cat) {
df_new_merge[[name]] <- apply(
df_new_merge[dict_cat[[name]]], 1,
which_max_cat, name, dict_cat
)
df_new_merge[[name]] <- unlist(df_new_merge[[name]])
df_new_merge[[name]] <- factor(df_new_merge[[name]])
}
}
# By Bootstrap combining rules
df_new_mean_num <- stats::aggregate(. ~ df_new_merge[c(
"index",
col_name_num
)]$index,
data = df_new_merge[c(
"index",
col_name_num
)],
mean
)
df_new_mean_num <- df_new_mean_num[-c(1)]
df_new_num_var <- stats::aggregate(. ~ df_new_merge[
c("index", col_name_num)
]$index, data = df_new_merge[c(col_name_num)], var)
colnames(df_new_num_var)[1] <- "index"
if (exist_dis) {
# for the discret variables
df_new_mean_num[col_name_dis] <- round(df_new_mean_num[col_name_dis])
}
if (exist_cat && !is_onehot) {
df_new_merge <- factor_encode(
df_new_merge,
which(colnames(
df_new_merge
) %in% col_name_cat)
)
df_new_mode_cat <- df_new_merge[c("index", col_name_cat)] %>%
dplyr::group_by(.data$index) %>%
dplyr::summarise(dplyr::across(dplyr::all_of(col_name_cat), Mode_cat))
df_new <- merge(df_new_mean_num, df_new_mode_cat, by = "index")
df_new <- factor_encode(df_new, which(
colnames(df_new_merge) %in% col_name_cat
))
if (is_unalike) {
df_new_cat_var <- stats::aggregate(. ~ df_new_merge[
c("index", col_name_cat)
]$index,
data = df_new_merge[c("index", col_name_cat)], uwo4419::unalike
)
df_new_cat_var <- df_new_cat_var[-c(1)]
} else {
df_new_cat_var <- df_new_merge[c("index", col_name_cat)] %>%
dplyr::group_by(.data$index) %>%
dplyr::summarise(dplyr::across(dplyr::all_of(col_name_cat), VA_fact))
}
df_new_var <- merge(df_new_num_var, df_new_cat_var, by = "index")
# We need to use unalikeability to mesure the uncertainty of the
# categorical variables
} else {
df_new <- df_new_mean_num
df_new_var <- df_new_num_var
}
# Add the combined result for categorical variables deducted from the
# one-hot result to df_new
if (is_onehot && exist_cat) {
for (name in names_cat) {
df_new[[name]] <- apply(
df_new[dict_cat[[name]]], 1,
which_max_cat, name, dict_cat
)
df_new[[name]] <- unlist(df_new[[name]])
}
if (is_unalike) {
df_new_cat_var <- stats::aggregate(. ~ df_new_merge[
c("index", names_cat)
]$index,
data = df_new_merge[c("index", names_cat)], uwo4419::unalike
)
df_new_cat_var <- df_new_cat_var[-c(1)]
} else {
df_new_cat_var <- df_new_merge[c("index", names_cat)] %>%
dplyr::group_by(.data$index) %>%
dplyr::summarise(dplyr::across(dplyr::all_of(names_cat), VA_fact))
# df_new_cat_var = aggregate(.~index , simplify=FALSE,
# data = df_new_merge[c("index",names_cat)], VA_fact)
}
df_new_var <- merge(df_new_num_var, df_new_cat_var, by = "index")
}
# Add the uncovered rows, fill in with NA
num_row_new <- nrow(df_new)
if (num_row_new < num_row_origin) {
print(paste0("Covered number of rows: ", num_row_new))
print(paste0("Original number of rows: ", num_row_origin))
warning(
"Warning: All the rows in the original dataset is not selected in the
bootstrap samples. An increase on the number of bootstrap samples is
suggested.\n"
)
}
df_idx <- data.frame("index" = c(1:num_row_origin))
df_result_with_idx <- merge(
x = df_idx,
y = df_new,
by = "index",
all = TRUE
)
df_result_var_with_idx <- merge(
x = df_idx,
y = df_new_var,
by = "index",
all = TRUE
)
# remove index column and the combined categorical columns
if (is_onehot && exist_cat) {
df_result_disj <-
df_result_with_idx[, !(names(df_result_with_idx) %in% c(
names_cat, "index"
))]
df_result_var_disj <-
df_result_var_with_idx[, !(names(df_result_with_idx) %in% c(
names_cat, "index"
))]
} else {
df_result_disj <- df_result_with_idx[-c(1)]
df_result_var_disj <- df_result_var_with_idx[-c(1)]
}
# Final result for the categorical variables
if (is_onehot) {
names_num <- colnames(ls_df[[1]])[c(col_con, col_dis)]
# remove index and onehot columns, +1 is for the adjustment
df_result <- df_result_with_idx[, (
names(df_result_with_idx) %in% c(names_cat, names_num))]
df_result_var <- df_result_var_with_idx[, (
names(df_result_var_with_idx) %in% c(names_cat, names_num))]
} else {
df_result <- df_result_disj
df_result_var <- df_result_var_disj
}
return(
list(
"imp.disj" = df_result_disj,
"uncertainty.disj" = df_result_var_disj,
"imp" = df_result,
"uncertainty" = df_result_var
)
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.