#' jacksample: create several dataframes by jackknife resampling
#'
#' @description
#' \code{jacksample} function creates \code{num_sample} samples without the
#' group of observation k (k=1,...,\code{num_sample}).
#' We regroup the original data into \code{num_sample} groups by their index.
#' @param df A complete or incomplete dataframe
#' @param num_sample Number of jackknifed samples.
#' @return A list of jackknifed dataframes.
#' @export
#' @references Statistical Analysis with Missing Data, by Little and Rubin, 2002
#' @examples
#' n <- 10000
#' mu.X <- c(1, 2, 3)
#' Sigma.X <- matrix(c(9, 3, 2, 3, 4, 0, 2, 0, 1), nrow = 3)
#' X.complete.cont <- MASS::mvrnorm(n, mu.X, Sigma.X)
#' rs <- generate_miss(X.complete.cont, 0.5, mechanism = "MNAR2")
#' ls_boot <- jacksample(rs$X.incomp, 4)
jacksample <- function(df, num_sample) {
num_row <- nrow(df)
row_tranch <- num_row %/% num_sample
rest <- num_row %% num_sample
ls_df_new <- list()
i <- 1
start <- 1
while (i <= num_sample - rest) {
# num_sample-rest samples with row_tranch rows
end <- start + row_tranch - 1
if (start == 1) {
df_new <- df[(end + 1):num_row, ]
} else if (end == num_row) {
df_new <- df[1:(start - 1), ]
} else {
df_new <- df[c(1:(start - 1), (end + 1):num_row), ]
}
# df_new[["old_idx"]] = c(start:end)
ls_df_new[[i]] <- df_new
i <- i + 1
start <- end + 1
}
while (i <= num_sample) {
# rest samples with row_tranch+1 rows
end <- start + (row_tranch + 1) - 1
if (end >= num_row) {
df_new <- df[c(1:(start - 1)), ]
} else {
df_new <- df[c(1:(start - 1), (end + 1):num_row), ]
}
# df_new[["old_idx"]] = c(start:end)
ls_df_new[[i]] <- df_new
i <- i + 1
start <- end + 1
}
return(ls_df_new)
}
#' combine_jack: combine imputed jackknife datasets
#'
#' @description
#' \code{combine_jack} function combines several imputed jackknifed dataframes
#' into the final imputed dataframe and provide the variance for each imputed
#' value.
#' @param ls_df A list of imputed jackknifed dataframes.
#' @param col_con Continous columns index.
#' @param col_dis Discret columns index.
#' @param col_cat Categorical columns index.
#' @param method The encoded method of categorical columns in the imputed
#' dataframes.
#' This function is only coded for "onehot" situation.
#' @param dict_cat The dictionary of categorical columns names if "onehot"
#' method is applied.
#' For example, it could be list("Y7"=c("Y7_1","Y7_2"),
#' "Y8"=c("Y8_1","Y8_2","Y8_3")).
#' @param var_cat The method of variance calculation for the categorical
#' columns.
#' "unalike" will lead to the calculation of unalikeability, while "wilcox_va"
#' will lead to the calculation of Wilcox index: VarNC.
#' @return \code{df_result_disj} The final imputed dataframe with the
#' categorical columns in onehot form.
#' @return \code{df_result_var_disj} The variance matrix for the final
#' imputation dataframe with the categorical columns in onehot form.
#' @return \code{df_result} The final imputed dataframe with the categorical
#' columns in factor form.
#' @return \code{df_result_var} The variance matrix for the final imputation
#' dataframe with the categorical columns in factor form.
#' @export
#' @importFrom dplyr %>%
#' @references Statistical Analysis with Missing Data, by Little and Rubin, 2002
combine_jack <- function(ls_df,
col_con,
col_dis = c(),
col_cat = c(),
method = "onehot",
dict_cat = NULL,
var_cat = "unalike") {
ls_df_minus <- ls_df
ls_col_name <- colnames(ls_df[[1]])
exist_dis <- !all(c(0, col_dis) == c(0))
if (exist_dis) {
col_name_dis <- ls_col_name[col_dis]
}
exist_cat <- !all(c(0, col_cat) == c(0))
if (exist_cat) {
col_name_cat <- ls_col_name[col_cat]
var_cat <- match.arg(var_cat, c("unalike", "wilcox_va"))
is_unalike <- (var_cat == "unalike")
if (is_unalike) {
is_uwo4419_package_installed()
} else {
is_qualvar_package_installed()
}
if (is.null(dict_cat)) {
stop("dict_cat is needed when there are onehot categorical columns")
}
}
i <- 1
n_sample <- length(ls_df)
# df_full$index <- as.numeric(row.names(df_full))
while (i <= n_sample) {
# ls_df[[i]]$index <- as.numeric(row.names(ls_df[[i]]))
# # only for numeric columns
# ls_df_minus[[i]] <- n_sample * df_full[ls_df[[i]]$index, ] - (n_sample -
# 1) * ls_df[[i]]
# if (exist_cat) {
# ls_df_minus[[i]][col_name_cat][ls_df_minus[[i]][col_name_cat] < 0] <- 0
# }
ls_df_minus[[i]][["index"]] <- as.numeric(row.names(ls_df_minus[[i]]))
i <- i + 1
}
# Put all imputed datasets together
df_new_merge <- Reduce(function(dtf1, dtf2) {
rbind(dtf1, dtf2)
}, ls_df_minus)
# Add the combined result for categorical variables deducted from the onehot
# result
if (exist_cat) {
names_cat <- names(dict_cat)
for (name in names_cat) {
df_new_merge[[name]] <- apply(
df_new_merge[dict_cat[[name]]],
1, which_max_cat, name, dict_cat
)
df_new_merge[[name]] <- unlist(df_new_merge[[name]])
df_new_merge[[name]] <- factor(df_new_merge[[name]])
}
}
# By Jackknife combining rules for disjunctive part
# Here for each indice, there are n_sample-1 values
df_new <- stats::aggregate(. ~ index, data = df_new_merge[
c("index", ls_col_name)
], mean)
if (exist_dis) {
# round for the discret variables
df_new[col_name_dis] <- round(df_new[col_name_dis])
}
df_new_var_disj <- stats::aggregate(. ~ index,
data = df_new_merge[c(
"index",
ls_col_name
)], var
)
# df_new_var_disj[c(ls_col_name)] <- df_new_var_disj[c(ls_col_name)] / (
# n_sample - 1)
# Final result according to the mean of onehot probability
if (exist_cat) {
for (name in names_cat) {
df_new[[name]] <- apply(
df_new[dict_cat[[name]]],
1, which_max_cat, name, dict_cat
)
df_new[[name]] <- unlist(df_new[[name]])
df_new[[name]] <- factor(df_new[[name]])
}
# remove the index and the onehot columns
df_result <- df_new[-c(1, col_cat + 1)]
if (is_unalike) {
df_new_cat_var <- stats::aggregate(. ~ index, data = df_new_merge[
c("index", names_cat)
], uwo4419::unalike)
} else {
df_new_cat_var <- df_new_merge[c("index", names_cat)] %>%
dplyr::group_by(.data$index) %>%
dplyr::summarise(dplyr::across(dplyr::all_of(names_cat), VA_fact))
}
df_new_var <- merge(df_new_var_disj, df_new_cat_var, by = "index")
# remove the index and the one-hot columns
df_result_var <- df_new_var[-c(1, col_cat + 1)]
# remove index column and the combined categorical columns
df_result_disj <- df_new[, !(names(df_new) %in% c(names_cat, "index"))]
df_result_var_disj <- df_new_var[, !(
names(df_new) %in% c(names_cat, "index"))]
return(
list(
imp.disj = df_result_disj,
uncertainty.disj = df_result_var_disj,
imp = df_result,
uncertainty = df_result_var
)
)
} else {
return(list(
imp = df_new[-c(1)],
imp.disj = df_new[-c(1)],
uncertainty = df_new_var_disj[-c(1)],
uncertainty.disj = df_new_var_disj[-c(1)]
))
}
}
# Version jackknife estimate using estimation on the full incomplete dataset
# combine_jack_bis <- function(ls_df,
# df_full,
# col_con,
# col_dis = c(),
# col_cat = c(),
# method = "onehot",
# dict_cat = NULL,
# var_cat = "unalike") {
# ls_df_minus <- ls_df
# ls_col_name <- colnames(ls_df[[1]])
# exist_dis <- !all(c(0, col_dis) == c(0))
# if (exist_dis) {
# col_name_dis <- ls_col_name[col_dis]
# }
# exist_cat <- !all(c(0, col_cat) == c(0))
# if (exist_cat) {
# col_name_cat <- ls_col_name[col_cat]
# var_cat <- match.arg(var_cat, c("unalike", "wilcox_va"))
# is_unalike <- (var_cat == "unalike")
# if (is_unalike) {
# is_uwo4419_package_installed()
# } else {
# is_qualvar_package_installed()
# }
# if (is.null(dict_cat)) {
# stop("dict_cat is needed when there are onehot categorical columns")
# }
# }
# i <- 1
# n_sample <- length(ls_df)
# df_full$index <- as.numeric(row.names(df_full))
# while (i <= n_sample) {
# ls_df[[i]]$index <- as.numeric(row.names(ls_df[[i]]))
# # only for numeric columns
# ls_df_minus[[i]] <- n_sample * df_full[ls_df[[i]]$index, ] - (n_sample -
# 1) * ls_df[[i]]
# if (exist_cat) {
# ls_df_minus[[i]][col_name_cat][ls_df_minus[[i]][col_name_cat] < 0] <- 0
# }
# ls_df_minus[[i]][["index"]] <- as.numeric(row.names(ls_df_minus[[i]]))
# i <- i + 1
# }
# # Put all imputed datasets together
# df_new_merge <- Reduce(function(dtf1, dtf2) {
# rbind(dtf1, dtf2)
# }, ls_df_minus)
# # Add the combined result for categorical variables deducted from the
# one-hot result
# if (exist_cat) {
# names_cat <- names(dict_cat)
# for (name in names_cat) {
# df_new_merge[[name]] <- apply(df_new_merge[dict_cat[[name]]], 1,
# which_max_cat, name, dict_cat)
# df_new_merge[[name]] <- unlist(df_new_merge[[name]])
# df_new_merge[[name]] <- factor(df_new_merge[[name]])
# }
# }
#
#
#
# # By Jackknife combining rules for disjunctive part
# df_new <- stats::aggregate(. ~ index, data = df_new_merge[c("index",
# ls_col_name)], mean) # Here for each indice, there are n_sample-1 values
# if (exist_dis) {
# round for the discret variables
# df_new[col_name_dis] <- round(df_new[col_name_dis])
# }
# df_new_var_disj <- stats::aggregate(. ~ index,
# data = df_new_merge[c("index", ls_col_name)], var)
# # df_new_var_disj[c(ls_col_name)] <- df_new_var_disj[c(ls_col_name)] / (
# n_sample - 1)
#
# # Final result according to the mean of onehot probability
# if (exist_cat) {
# for (name in names_cat) {
# df_new[[name]] <- apply(df_new[dict_cat[[name]]], 1, which_max_cat,
# name, dict_cat)
# df_new[[name]] <- unlist(df_new[[name]])
# df_new[[name]] <- factor(df_new[[name]])
# }
# df_result <- df_new[-c(1, col_cat + 1)] # remove the index and the
# onehot columns
# if (is_unalike) {
# df_new_cat_var <- stats::aggregate(. ~ index,
# data = df_new_merge[c("index", names_cat)], uwo4419::unalike)
# } else {
# df_new_cat_var <- df_new_merge[c("index", names_cat)] %>%
# dplyr::group_by(.data$index) %>%
# dplyr::summarise(dplyr::across(dplyr::all_of(names_cat), VA_fact))
# }
#
# df_new_var <- merge(df_new_var_disj, df_new_cat_var, by = "index")
# df_result_var <- df_new_var[-c(1, col_cat + 1)] # remove the index and
# the one-hot columns
#
#
# # remove index column and the combined categorical columns
# df_result_disj <- df_new[, !(names(df_new) %in% c(names_cat, "index"))]
# df_result_var_disj <-
# df_new_var[, !(names(df_new) %in% c(names_cat, "index"))]
# return(
# list(
# imp.disj = df_result_disj,
# uncertainty.disj = df_result_var_disj,
# imp = df_result,
# uncertainty = df_result_var
# )
# )
# } else {
# return(list(
# imp = df_new[-c(1)],
# imp.disj = df_new[-c(1)],
# uncertainty = df_new_var_disj[-c(1)],
# uncertainty.disj = df_new_var_disj[-c(1)]
# ))
# }
# }
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.