Nothing
#' Create indices for nonparametric bootstrap
#'
#' \code{nonparametric} is used for nonparametric resampling, for example
#' nonparametric case or error/residual resampling. The function takes a vector
#' of indices that correspond to the indices of observations that should be used
#' in the resampling procedure.
#'
#' @param indices A vector of indices (integer) from which to sample.
#' @param R An integer specifying the number of resamples.
#' @param size An integer specifying the size of the resample. Standard
#' bootstrap suggests to resample as many datapoints as in the original sample,
#' which is set as the default.
#' @param replacement A logical value whether to sample with (TRUE) or without
#' (FALSE) replacement. Standard bootstrap suggests to resample with
#' replacement, which is set as the default.
#' @param seed \code{NULL} if seed should not be set explicitly or an integer to
#' which the seed is set. Since this function is usually used inside other
#' functions, it might not be desirable to set a seed explicitly.
#'
#' @return \code{nonparametric} returns a list of length \code{R} containing
#' vectors with the resampled indices.
#'
#' @export
nonparametric <- function(indices, R, size = length(indices),
replacement = TRUE, seed = NULL) {
if (!is.null(seed)) {
if (is.numeric(seed)){
if (!(seed %% 1 == 0)) {
stop(strwrap("Argument 'seed' must either be NULL or an integer",
prefix = " ", initial = ""))
} else {
set.seed(seed = seed)
}
} else { # non-numeric
stop(strwrap("Argument 'seed' must either be NULL or an integer",
prefix = " ", initial = ""))
}
}
resamples <- vector("list", length = R)
for (i in 1:R) {
resamples[[i]] <- sample(x = indices, size = size, replace = replacement)
}
return(resamples)
}
#' Counts the number of times each index was sampled
#'
#' \code{count_indices} takes a list of indices for resampling and counts how
#' often each index was sampled in each resample. The result is returned in two
#' versions of a matrix where each row corresponds to a different resample and
#' each column to one index.
#'
#' @param resamples A list of resamples, as created by \link{nonparametric}.
#' @param indices The vector of original indices from which the resamples were
#' drawn.
#'
#' @return \code{count_indices} returns a list with two names elements. Each
#' element is a matrix that stores how often each observation/index was
#' resampled (column) for each resample (row). \code{$count_clean} only has
#' columns for observations that were available in the indices.
#' \code{$count_all} counts the occurrence of all indices in the range of
#' indices that were provided, even if the index was actually not available in
#' the given indices. These are of course zero since they were not available for
#' resampling. If the given indices do not skip any numbers, the two coincide.
#'
#' @export
count_indices <- function(resamples, indices) {
# take max value as the number of indices so each index gets its own bin
bins <- max(indices)
index_counts <- NULL
for (i in 1:length(resamples)) {
tab <- tabulate(resamples[[i]], nbins = bins)
index_counts <- rbind(index_counts, tab)
}
rownames(index_counts) <- NULL
index_counts_clean <- index_counts[, indices]
colnames(index_counts) <- paste("o", 1:bins, sep = "")
colnames(index_counts_clean) <- paste("o", indices, sep = "")
out <- list(count_all = index_counts, count_clean = index_counts_clean)
return(out)
}
#' Nonparametric resampling from a data frame
#'
#' @param df Data frame containing observations to be sampled from.
#' @param resample A vector of indices that extract the observations from the
#' data frame.
#'
#' @return \code{nonparametric_resampling} returns a data frame containing the
#' observations of the resample.
#'
#' @details
#' The input to the \code{resample} argument could for example be generated as
#' one of the elements in the list generated by the command
#' \link{nonparametric}.
#'
#' The input to the \code{df} argument would be the original data frame for case
#' resampling. For error/residual resampling, it would be a data frame
#' containing the residuals from the model.
#'
#' @export
nonparametric_resampling <- function(df, resample) {
new_sample <- df[resample, ]
return(new_sample)
}
#' Uses nonparametric case resampling for standard errors of parameters and
#' gauge
#'
#' @param robust2sls_object An object of class \code{"robust2sls"}.
#' @param R An integer specifying the number of resamples.
#' @param coef A numeric or character vector specifying which structural
#' coefficient estimates should be recorded across bootstrap replications.
#' \code{NULL} means all coefficients are recorded.
#' @param m A single numeric or vector of integers specifying for which
#' iterations the bootstrap statistics should be calculated. \code{NULL} means
#' they are calculated for all iterations that were also done in the original
#' robust2sls_object. Character \code{"convergence"} means all bootstrap samples
#' are run until they converge and the statistics of the first convergent
#' iteration is recorded.
#' @param parallel A logical value indicating whether to run the bootstrap
#' sampling in parallel or sequentially. See Details.
#'
#' @details
#' Argument \code{parallel} allows for parallel computing using the
#' \link{foreach} package, so the user has to register a parallel backend before
#' invoking this command.
#'
#' Argument \code{coef} is useful if the model includes many controls whose
#' parameters are not of interest. This can reduce the memory space needed to
#' store the bootstrap results.
#'
#' @return \code{case_resampling} returns an object of class
#' \code{"r2sls_boot"}. This is a list with three named elements. \code{$boot}
#' stores the bootstrap results as a data frame. The columns record the
#' different test statistics, the iteration \code{m}, and the number of the
#' resample, \code{r}. The values corresponding to the original data is stored
#' as \code{r = 0}. \code{$resamples} is a list of length \code{R} that stores
#' the indices for each specific resample. \code{$original} stores the original
#' \code{robust2sls_object} based on which the bootstrapping was done.
#'
#' @export
case_resampling <- function(robust2sls_object, R, coef = NULL, m = NULL,
parallel = FALSE) {
# extract all the function settings
formula <- robust2sls_object$cons$formula
ref_dist <- robust2sls_object$cons$reference
sign_level <- robust2sls_object$cons$sign_level
initial_est <- robust2sls_object$cons$initial$estimator
user_model <- robust2sls_object$cons$initial$user
iterations <- robust2sls_object$cons$iterations$setting
convergence_criterion <- robust2sls_object$cons$convergence$criterion
shuffle <- robust2sls_object$cons$initial$shuffle
shuffle_seed <- robust2sls_object$cons$initial$shuffle_seed
split <- robust2sls_object$cons$initial$split
verbose <- robust2sls_object$cons$verbose
# extract coefficient numbers (if given names)
full <- ivreg::ivreg(formula = formula, data = robust2sls_object$cons$data)
if (is.character(coef)) {
coef.num <- which(coef == names(full$coefficients))
} else if (is.numeric(coef)) {
coef.num <- coef
} else if (is.null(coef)) {
coef.num <- 1:length(full$coefficients)
}
# extract original data and prepare index resampling
orig_data <- robust2sls_object$cons$data
nonmiss <- nonmissing(data = orig_data, formula = robust2sls_object$cons$formula)
num.nonmiss <- sum(nonmiss)
indices <- 1:NROW(orig_data) # if every observation was useable
indices <- indices[nonmiss] # exclude the missing ones
resamples <- nonparametric(indices = indices, R = R)
# re-build function call for the resamples; will be the same as the original
# call except that it uses "data = resample"
a <- paste("new_model <- outlier_detection(data = resample, formula = formula,
ref_dist = ref_dist, sign_level = sign_level,
initial_est = initial_est, user_model = user_model,
iterations = iterations,
convergence_criterion = convergence_criterion, shuffle = shuffle,
shuffle_seed = shuffle_seed, split = split, verbose = verbose)")
expr <- parse(text = a)
# which iterations to extract?
if (is.null(m)) { # extract all
n.iterations <- robust2sls_object$cons$iterations$actual
# e.g. if had two iterations, access coefficients of models 0, 1, 2
# these correspond to the first, second, and third elements of $model
mb <- 1:(n.iterations + 1)
# e.g. if had two iterations, calculate gauge for 0, 1, 2
# fun outliers_prop() refers to iteration directly, not number of element
mg <- 0:n.iterations
o.mb <- mb
o.mg <- mg
} else if (is.numeric(m)) { # specified as vector of iterations
mg <- m
mb <- m + 1
o.mb <- mb
o.mg <- mg
} else { # m is set to "convergence" -> need to determine m in each resample
mg <- NULL
mb <- NULL
} # end determine iterations
# extract the original sample results (call it r = 0)
if (identical(m, "convergence")) {
# converged at which iteration?
o.conv <- robust2sls_object$cons$convergence$iter
o.mb <- o.conv + 1
o.mg <- o.conv
}
if (identical(initial_est, "saturated")) {
o.NA_coef <- robust2sls_object$model$m0$split1$coefficients
o.NA_names <- names(o.NA_coef)
o.NA_coef <- rep(NA, length(o.NA_coef))
names(o.NA_coef) <- o.NA_names
robust2sls_object$model$m0$split1 <- NULL
robust2sls_object$model$m0$split2 <- NULL
robust2sls_object$model$m0$coefficients <- o.NA_coef
}
if (is.null(coef)) { # extract all
o.betas <- sapply(X = o.mb, FUN = function(robust2sls_object, iteration)
robust2sls_object$model[[iteration]][["coefficients"]],
robust2sls_object = robust2sls_object)
} else { # same code works for indices and names
o.betas <- sapply(X = o.mb, FUN = function(robust2sls_object, iteration, c)
robust2sls_object$model[[iteration]][["coefficients"]][c],
robust2sls_object = robust2sls_object, c = coef)
}
if (is.vector(o.betas)) { # if only one coefficient (either in whole model or because of selection)
names(o.betas) <- NULL
o.name <- names(robust2sls_object$model$m0$coefficients)[coef.num]
o.betas <- matrix(o.betas, ncol = 1)
colnames(o.betas) <- o.name
o.intermediate <- data.frame(o.betas)
} else {
o.intermediate <- data.frame(t(o.betas))
}
o.record_iter <- paste("m", o.mg, sep = "")
o.intermediate$m <- o.record_iter
o.gauges <- sapply(X = o.mg, FUN = outliers_prop,
robust2sls_object = robust2sls_object)
o.intermediate$gauge <- o.gauges
o.intermediate$r <- 0
orig <- o.intermediate
if (parallel == FALSE) {
output <- data.frame()
for (r in 1:R) {
resample <- nonparametric_resampling(df = orig_data,
resample = resamples[[r]])
new_model <- outlier_detection(data = resample, formula = formula,
ref_dist = ref_dist, sign_level = sign_level,
initial_est = initial_est, user_model = user_model,
iterations = iterations,
convergence_criterion = convergence_criterion, shuffle = shuffle,
shuffle_seed = shuffle_seed, split = split, verbose = verbose)
if (identical(m, "convergence")) {
# converged at which iteration?
iter_conv <- new_model$cons$convergence$iter
mb <- iter_conv + 1
mg <- iter_conv
}
if (identical(initial_est, "saturated")) {
NA_coef <- new_model$model$m0$split1$coefficients
NA_names <- names(NA_coef)
NA_coef <- rep(NA, length(NA_coef))
names(NA_coef) <- NA_names
new_model$model$m0$split1 <- NULL
new_model$model$m0$split2 <- NULL
new_model$model$m0$coefficients <- NA_coef
}
if (is.null(coef)) { # extract all
betas <- sapply(X = mb, FUN = function(robust2sls_object, iteration)
robust2sls_object$model[[iteration]][["coefficients"]],
robust2sls_object = new_model)
} else { # same code works for indices and names
betas <- sapply(X = mb, FUN = function(robust2sls_object, iteration, c)
robust2sls_object$model[[iteration]][["coefficients"]][c],
robust2sls_object = new_model, c = coef)
}
if (is.vector(betas)) { # if only one coefficient (either in whole model or because of selection)
names(betas) <- NULL
name <- names(new_model$model$m0$coefficients)[coef.num]
betas <- matrix(betas, ncol = 1)
colnames(betas) <- name
intermediate <- data.frame(betas)
} else {
intermediate <- data.frame(t(betas))
}
record_iter <- paste("m", mg, sep = "")
intermediate$m <- record_iter
gauges <- sapply(X = mg, FUN = outliers_prop,
robust2sls_object = new_model)
intermediate$gauge <- gauges
intermediate$r <- r
output <- rbind(output, intermediate)
} # end bootstrap loop
} else {
# parallel loop
output <- foreach::foreach(r = 1:R, .combine = "rbind") %dopar% {
resample <- nonparametric_resampling(df = orig_data,
resample = resamples[[r]])
new_model <- outlier_detection(data = resample, formula = formula,
ref_dist = ref_dist, sign_level = sign_level,
initial_est = initial_est, user_model = user_model,
iterations = iterations,
convergence_criterion = convergence_criterion, shuffle = shuffle,
shuffle_seed = shuffle_seed, split = split, verbose = verbose)
mb2 <- mb
mg2 <- mg
if (identical(m, "convergence")) {
# converged at which iteration?
iter_conv <- new_model$cons$convergence$iter
mb2 <- iter_conv + 1
mg2 <- iter_conv
}
if (identical(initial_est, "saturated")) {
NA_coef <- new_model$model$m0$split1$coefficients
NA_names <- names(NA_coef)
NA_coef <- rep(NA, length(NA_coef))
names(NA_coef) <- NA_names
new_model$model$m0$split1 <- NULL
new_model$model$m0$split2 <- NULL
new_model$model$m0$coefficients <- NA_coef
}
if (is.null(coef)) { # extract all
betas <- sapply(X = mb2, FUN = function(robust2sls_object, iteration)
robust2sls_object$model[[iteration]][["coefficients"]],
robust2sls_object = new_model)
} else { # same code works for indices and names
betas <- sapply(X = mb2, FUN = function(robust2sls_object, iteration, c)
robust2sls_object$model[[iteration]][["coefficients"]][c],
robust2sls_object = new_model, c = coef)
}
if (is.vector(betas)) { # if only one coefficient (either in whole model or because of selection)
names(betas) <- NULL
name <- names(new_model$model$m0$coefficients)[coef.num]
betas <- matrix(betas, ncol = 1)
colnames(betas) <- name
intermediate <- data.frame(betas)
} else {
intermediate <- data.frame(t(betas))
}
record_iter <- paste("m", mg2, sep = "")
intermediate$m <- record_iter
gauges <- sapply(X = mg2, FUN = outliers_prop,
robust2sls_object = new_model)
intermediate$gauge <- gauges
intermediate$r <- r
intermediate
} # end parallel bootstrap loop
} # end if parallel TRUE/FALSE
# append the original results
output <- rbind(orig, output)
# if (identical(m, "convergence")) {
# # rename the m-column
# names(output)[names(output) == "m"] <- "conv"
# #output$conv <- output$m
# #output$m <- NULL
# }
out <- list(boot = output, resamples = resamples,
original = robust2sls_object)
class(out) <- "r2sls_boot"
return(out)
}
#' Extracts bootstrap results for a specific iteration
#'
#' @param r2sls_boot An object of \link{class} \code{"r2sls_boot"}, as returned
#' by \link{case_resampling}.
#' @param iteration An integer >= 0 specifying which bootstrap results to
#' extract.
#'
#' @return \code{extract_boot} returns a matrix with the bootstrap results for
#' a specific iteration.#'
#'
#' @export
extract_boot <- function(r2sls_boot, iteration) {
if (identical(iteration, "convergence")) {
extraction <- r2sls_boot$boot
} else {
iterstring <- paste("m", iteration, sep = "")
extraction <- r2sls_boot$boot[r2sls_boot$boot$m == iterstring, ]
}
return(extraction)
}
#' Evaluate bootstrap results
#'
#' @param r2sls_boot An object of \link{class} \code{"r2sls_boot"}, as returned
#' by \link{case_resampling}.
#' @param iterations An integer or numeric vector with values >= 0 specifying
#' which bootstrap results to evaluate.
#'
#' @return \code{evaluate_boot} returns a data frame with the bootstrap and the
#' theoretical standard errors. Each row corresponds to a different iteration
#' step while each column refers to the parameters whose standard errors are
#' produced.
#'
#' @export
evaluate_boot <- function(r2sls_boot, iterations) {
# extract all the function settings
ref_dist <- r2sls_boot$original$cons$reference
sign_level <- r2sls_boot$original$cons$sign_level
initial_est <- r2sls_boot$original$cons$initial$estimator
if (identical(initial_est, "saturated")) {
split <- r2sls_boot$original$cons$initial$split
} else { # then it doesn't matter, any value between 0 and 1 is fine
split <- 0.5
}
n <- sum(nonmissing(r2sls_boot$original$cons$data,
r2sls_boot$original$cons$formula))
# will store the results
bootstrap_se <- data.frame()
for (i in iterations) {
extraction <- extract_boot(r2sls_boot = r2sls_boot, iteration = i)
# no need to calculate statistics on m or r
extraction$r <- NULL
extraction$m <- NULL
sd <- data.frame(t(apply(X = extraction, MARGIN = 2, FUN = sd)))
# estimate parameters for theoretical avar of gauge; then estimate avar
p_est <- estimate_param_null(r2sls_boot$original)
avar_est <- gauge_avar(ref_dist = ref_dist, sign_level = sign_level,
initial_est = initial_est, iteration = i,
parameters = p_est, split = split)
sd$theory_gauge <- sqrt(avar_est / n)
sd$m <- i
# append results
bootstrap_se <- rbind(bootstrap_se, sd)
}
return(bootstrap_se)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.