#' Generate `n_boot` bootstrap samples.
#' @details
#' The bootstrap samples are generated so that it is very unlikely that they
#' will be associated with a 0 variance variable. However such situation can
#' appear in very rare cases.
#'
#' The simplest procedure to avoid such situation would be to compute the
#' variance of each variable in each bootstrap sample. However, in
#' high-dimensional cases, this is time consuming. Thus, the function starts
#' by identifying variables that present a risk of being of null variance in
#' at least one bootstrap sample. A variable is detected as such if the
#' probability of sampling `N` times (where `N` is the number of observations)
#' its most frequent observed value is higher than a specific threshold
#' named `pval` (by default set to `1e-15`) corrected by the number of bootstrap
#' sample `n_boot` and the number of variables in the whole data-set
#' (\eqn{\sum_{j=1}^Jp_{j}}). In the end, a variable is defined as risky if the
#' proportion of its most frequent observed value is strictly higher than:
#' \deqn{risky\textunderscore threshold = \left(\frac{pval}{n_{boot}*
#' \left(\sum_{j=1}^J p_{j}\right)}\right)^{1/N}.}
#' This value is necessarily lower than `1` as `pval` is lower than `1`.
#' However, it could be lower than \eqn{1/N}, which means that all variables
#' are defined as risky. That is why the maximum value between
#' \eqn{risky\textunderscore threshold} and \eqn{1/N} is taken.
#'
#' Once risky variables have been identified, the function computes the
#' variance of each risky variable in each bootstrap sample. If there isn't any
#' 0 variance risky variable, the bootstrap samples are returned as is.
#' Otherwise, three cases are possible:
#' \itemize{
#' \item `keep_all_variables = F`, then the risky variables that are of 0
#' variance in at least one bootstrap sample are removed. If they correspond to
#' a whole block, an error message is generated.
#' \item `keep_all_variables = T`, two possibilities :
#' \itemize{
#' \item If `balance = T`, the procedure is repeated at most `5` times until all
#' bootstrap samples do not present any 0 variance variable.
#' \item If `balance = F`, an heuristic procedure is used to modify the sampling
#' probability of each observation in order to keep all variables.
#' }
#' }
#'
#' A short detail of this lastly mentioned heuristic consist in replacing each
#' observed value of the risky variables by \eqn{1-\frac{N_k}{N}}
#' (where \eqn{N_k} corresponds to the number of times this value is observed),
#' normalized so that the sum through all the observations (for each variable)
#' equals `1`. Then, if \eqn{prob_i} defines the sampling probability for the
#' observation associated with line \eqn{i}, it is defined as the maximum value
#' of the previous matrix through all risky variables (again normalized so that
#' \eqn{\sum_{i=1}^N prob_i = 1}). Thus the sampling probability of an
#' observation is more of less associated with `1 - its proportion in the
#' variable where this observation is in the lowest frequent group
#' (through all risky variables)`.
#' @inheritParams rgcca_bootstrap
#' @param pval For all the variables, a threshold for the proportion of the most
#' frequent value of this variable is computed. This threshold is evaluated so
#' that the probability to sample only this value is below `pval`. This
#' probability is corrected by the number of bootstrap samples and the number
#' of variables (default is `1e-15`).
#' @return \item{full_idx}{A list of size n_boot containing the observations
#' kept for each bootstrap sample.}
#' @return \item{sd_null}{A list of size the number of block
#' containing the variables that were removed from each block for all the
#' bootstrap samples. Variables are removed if they appear to be of null
#' variance in at least one bootstrap sample. If no variable is removed,
#' return NULL.}
#' @title Generate bootstrap samples.
#' @noRd
generate_resampling <- function(rgcca_res, n_boot, balanced = TRUE,
keep_all_variables = FALSE, pval = 1e-15,
verbose = TRUE) {
if (verbose) {
packageStartupMessage("Bootstrap samples sanity check...",
appendLF = FALSE
)
}
# Initialization
pval <- min(pval, 1)
NO_null_sd_var <- FALSE
iter <- 0
raw_blocks <- rgcca_res$call$blocks
N <- NROW(raw_blocks[[1]])
prob <- rep(1 / N, N)
# For any variable, threshold for the proportion of the most frequent value
# of a variable. This threshold is computed so that the probability to sample
# only this value is below `pval`. This probability is corrected by the number
# of bootstrap samples and the number of variables.
risky_threshold <- max(
1 / N,
(pval / (n_boot * sum(vapply(raw_blocks, NCOL, FUN.VALUE = 1L))))^(1 / N)
)
# Identify variables with value having an observed proportion higher than
# risky_threshold.
risky_var <- lapply(
raw_blocks,
function(block) {
which(apply(
block, 2,
function(x) {
max(table(x) / N) > risky_threshold
}
))
}
)
# Keep only risky variables for each block.
raw_blocks_filtered <- Map(
function(x, y) x[, y, drop = FALSE],
raw_blocks, risky_var
)
# While there are variables with null variance among the risky variables.
while (!NO_null_sd_var) {
if (balanced) { # Balanced bootstrap sampling.
full_idx <- rep(x = seq(N), each = n_boot)
full_idx <- sample(x = full_idx, size = N * n_boot, replace = FALSE)
full_idx <- split(full_idx, ceiling(seq_along(full_idx) / N))
} else { # Unbalanced bootstrap sampling.
full_idx <- lapply(
seq(n_boot),
function(x) {
sample(x = seq(N), replace = TRUE, prob = prob)
}
)
}
# Compute blocks for each bootstrap sample (only with risky variables).
boot_blocks_filtered <-
lapply(
full_idx,
function(idx) {
lapply(
raw_blocks_filtered,
function(x) {
y <- x[idx, , drop = FALSE]
rownames(y) <- paste("S", seq_along(idx))
return(y)
}
)
}
)
# For each sample, identify variables with a single unique value.
boot_column_sd_null <-
lapply(
boot_blocks_filtered,
function(boot) {
lapply(boot, function(boot_bl) {
which(apply(boot_bl, 2, function(x) length(unique(x)) == 1))
})
}
)
# Summarize through all the samples.
eval_boot_sample <- vapply(
boot_column_sd_null,
function(x) sum(vapply(x, length, FUN.VALUE = 1L)),
FUN.VALUE = 1L
)
NO_null_sd_var <- (sum(eval_boot_sample) == 0)
if (NO_null_sd_var) {
# through all samples, all variables have non null variances.
sd_null <- NULL
} else {
# If at least one sample have been identified with a null
# variance variable.
if (!keep_all_variables) {
# It is allowed to remove variables.
# Extract the troublesome variables.
sd_null <- Reduce("rbind", boot_column_sd_null)
rownames(sd_null) <- NULL
sd_null <- apply(sd_null, 2, function(x) unique(names(Reduce("c", x))))
sd_null <- Map(function(x, y) {
z <- match(x, y)
names(z) <- x
return(z)
}, sd_null, lapply(raw_blocks, colnames))
# Check if a whole block is troublesome
is_full_block_removed <- unlist(Map(
function(x, y) dim(x)[2] == length(y),
raw_blocks,
sd_null
))
if (sum(is_full_block_removed) == 0) {
# A whole block is NOT troublesome
NO_null_sd_var <- TRUE
warning(paste(
"Variables: ",
paste(names(Reduce("c", sd_null)), collapse = " - "),
"appear to be of null variance in some bootstrap samples",
"and thus were removed from all samples. \n",
"==> RGCCA is run again without these variables."
))
} else {
# If whole block IS troublesome then STOP
stop_rgcca(paste(
"The variance of all the variables from blocks: ",
paste(names(raw_blocks)[is_full_block_removed], collapse = " - "),
"appear to be null in some bootstrap samples.",
"Please consider removing them."
))
}
} else { # It is NOT allowed to remove variables.
# Generate at most five different re-sampling until not a single
# variable has a null variance.
if (iter > 5) { # Otherwise STOP.
# Extract the troublesome variables.
sd_null <- Reduce("rbind", boot_column_sd_null)
rownames(sd_null) <- NULL
sd_null <- apply(
sd_null, 2,
function(x) unique(names(Reduce("c", x)))
)
sd_null <- Map(function(x, y) {
z <- match(x, y)
names(z) <- x
return(z)
}, sd_null, lapply(raw_blocks, colnames))
error_message <- paste(
"Impossible to define all bootstrap samples",
"without variables with null variance. Please",
"consider removing these variables: ",
paste(names(Reduce("c", sd_null)), collapse = " - ")
)
# In the balanced case, you CANNOT play with the sampling probability
# of the different observations as it is unbalanced.
if (balanced) {
error_message <- paste0(
error_message,
". Please, consider unbalanced bootstrap by",
" setting 'balanced' to FALSE."
)
}
stop_rgcca(error_message)
}
# In the unbalanced case, you CAN play with the sampling probability
# of the different observations.
if (!balanced) {
if (iter == 0) { # The first time, you define your unbalancedness.
# Each observed value of the risky variables is replaced by
# `1 - the proportion of this observed value`, normalized so that
# the sum through all the observations (for each variable)
# equals `1`.
prob <- lapply(
raw_blocks_filtered,
function(block) {
apply(block, 2, function(var) {
occurences <- table(var, useNA = "ifany") / length(var)
new_idx <- match(as.character(var), names(occurences))
new_var <- as.matrix(occurences[new_idx])
new_var <- (1 - new_var) / sum(1 - new_var)
return(new_var)
})
}
)
# The sampling probability for each observation is associated with
# the maximum value of the previous matrix through all risky
# variables (again normalize so that `sum(prob) = 1`). Thus
# the sampling probability of an observation is more of less
# associated with `1 - its proportion in the variable where this
# observation is in the lowest frequent group (through all
# risky variables)`.
prob <- apply(Reduce("cbind", prob), 1, max) / sum(
apply(Reduce("cbind", prob), 1, max)
)
}
}
iter <- iter + 1
}
}
}
if (verbose) packageStartupMessage("OK")
return(list(full_idx = full_idx, sd_null = sd_null))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.