surveybootstrap: Bootstrap with Survey Data

Documented in rescaled.bootstrap.sample rescaled.bootstrap.sample.pureR

#####################################################
##' rescaled.bootstrap.sample
##'
##' Given a survey dataset and a description of the survey
##' design (ie, which combination of variables determines primary sampling
##' units, and which combination of variables determines strata), take
##' a bunch of bootstrap samples for the rescaled bootstrap estimator
##' (see Details).
##'
##' @details
##'
##' `survey.design` is a formula of the form
##'
##'    `weight ~ psu_vars + strata(strata_vars)`
##'
##' where:
##'   * `weight` is the variable with the survey weights
##'   * `psu_vars` has the form `psu_v1 + psu_v2 + ...`, where primary
##'     sampling units (PSUs) are determined by `psu_v1`, etc
##'   * `strata_vars` has the form `strata_v1 + strata_v2 + ...`, which
##'     determine strata
##'
##' Note that we assume that the formula uniquely specifies PSUs.
##' This will always be true if the PSUs were selected without replacement.
##' If they were selected with replacement, then it will be necessary
##' to make each realization of a given PSU in the sample a unique id.
##' The code below assumes that all observations within
##' each PSU (as identified by the design formula) are from the same draw
##' of the PSU.
##'
##' The rescaled bootstrap technique works by adjusting the
##' estimation weights based on the number of times each
##' row is included in the resamples. If a row is never selected,
##' it is still included in the returned results, but its weight
##' will be set to 0. It is therefore important to use estimators
##' that make use of the estimation weights on the resampled
##' datasets.
##'
##' We always take m_i = n_i - 1, according to the advice presented
##' in Rao and Wu (1988) and Rust and Rao (1996).
##'
##' (This is a C++ version; a previous version, written in pure R,
##' is called [rescaled.bootstrap.sample.pureR()] )
##'
##' References:
##' * Rust, Keith F., and J. N. K. Rao. "Variance estimation for complex surveys
##'   using replication techniques." *Statistical methods in medical research*
##'   5.3 (1996): 283-310.
##'  * Rao, Jon NK, and C. F. J. Wu. "Resampling inference with complex survey
##'    data." *Journal of the American Statistical Association*
##'    83.401 (1988): 231-241.
##'
##'
##' @param survey.data The dataset to use
##' @param survey.design A formula describing the design of the survey (see Details)
##' @param num.reps The number of bootstrap replication samples to draw
##' @param parallel If `TRUE`, use parallelization (via `plyr`)
##' @param paropts An optional list of arguments passed on to `plyr` to control
##'        details of parallelization
##' @return A list with `num.reps` entries. Each entry is a dataset which
##' has at least the variables `index` (the row index of the original
##' dataset that was resampled) and `weight.scale`
##' (the factor by which to multiply the sampling weights
##' in the original dataset).
##' @export
##' @examples
##'
##' survey <- MU284.complex.surveys[[1]]
##' boot_surveys <- rescaled.bootstrap.sample(survey.data = survey,
##'                                           survey.design = ~ CL,
##'                                           num.reps = 2)
##'
rescaled.bootstrap.sample <- function(survey.data,
                                      survey.design,
                                      parallel=FALSE,
                                      paropts=NULL,
                                      num.reps=1)
{

  survey.data$.internal_id <- 1:nrow(survey.data)

  design <- parse_design(survey.design)

  ## drop the "~" at the start of the formula
  psu.vars <- design$psu.formula[c(-1)][[1]]

  ## in the special case where there are no PSU vars, treat each row as
  ## its own PSU
  if (length(psu.vars)==1 & psu.vars=="1") {
    psu.vars <- as.name(".internal_id")
  }

  ## create a single variable with an id number for each PSU
  ## (we need this to use the C++ code, below)
  #survey.data$.cluster_id <- group_indices_(survey.data, .dots=all.vars(psu.vars))
  survey.data <- survey.data %>%
    group_by(!!!psu.vars) %>%
    mutate(.cluster_id = cur_group_id()) %>%
    ungroup()

  ## if no strata are specified, enclose the entire survey all in
  ## one stratum
  if (is.null(design$strata.formula)) {
    strata <- list(survey.data)
  } else {
    strata <- plyr::dlply(survey.data, design$strata.formula, identity)
  }

  ## get num.reps bootstrap resamples within each stratum,
  ## according to the rescaled bootstrap scheme
  ## (see, eg, Rust and Rao 1996)

  ## this llply call returns a list, with one entry for each stratum
  ## each stratum's entry contains a list with the bootstrap resamples
  ## (see the note for the inner llply call below)
  bs <- plyr::llply(strata,
              function(stratum.data) {

                ## TODO - need to handle the case where a stratum has
                ## only one PSU - right now this produces NaNs for the weights,
                ## which it should not...

                ## (this part is written in c++)
                res <- resample_stratum(stratum.data$.cluster_id,
                                        num.reps)

                colnames(res) <- paste0("rep.", 1:ncol(res))
                res <- cbind("index"=stratum.data$.internal_id,
                             res)

                return(res)
              })

  ## bs: list, one entry for each stratum
  ## each stratum's entry is a matrix.
  ## first column of the matrix is called 'index',
  ## which is the row number for the observation in the
  ## original dataset; there is one remaining column for each
  ## bootstrap resample. the entries of each column are the factors
  ## by which the original weights should be scaled

  bs.all <- do.call("rbind", bs)

  res <- plyr::alply(bs.all[,-1],
               2,
               function(this_col) {
                   return(data.frame(index=bs.all[,1],
                                     weight.scale=this_col))
               })

  return(res)

}

#####################################################
##' rescaled.bootstrap.sample.pureR
##'
##' (this is the pure R version; it has been supplanted by
##'  `rescaled.bootstrap.sample`, which is partially written in C++)
##'
##' given a survey dataset and a description of the survey
##' design (ie, which combination of vars determines primary sampling
##' units, and which combination of vars determines strata), take
##' a bunch of bootstrap samples for the rescaled bootstrap estimator
##' (see, eg, Rust and Rao 1996).
##'
##' Note that we assume that the formula uniquely specifies PSUs.
##' This will always be true if the PSUs were selected without replacement.
##' If they were selected with replacement, then it will be necessary
##' to make each realization of a given PSU in the sample a unique id.
##' Bottom line: the code below assumes that all observations within
##' each PSU (as identified by the design formula) are from the same draw
##' of the PSU.
##'
##' The rescaled bootstrap technique works by adjusting the
##' estimation weights based on the number of times each
##' row is included in the resamples. If a row is never selected,
##' it is still included in the returned results, but its weight
##' will be set to 0. It is therefore important to use estimators
##' that make use of the estimation weights on the resampled
##' datasets.
##'
##' We always take m_i = n_i - 1, according to the advice presented
##' in Rao and Wu (1988) and Rust and Rao (1996).
##'
##' @param survey.data the dataset to use
##' @param survey.design a formula describing the design of the survey (see below - TODO)
##' @param num.reps the number of bootstrap replication samples to draw
##' @param parallel if TRUE, use parallelization (via `plyr`)
##' @param paropts an optional list of arguments passed on to `plyr` to control
##'        details of parallelization
##' @return a list with `num.reps` entries. each entry is a dataset which
##' has at least the variables `index` (the row index of the original
##' dataset that was resampled) and `weight.scale`
##' (the factor by which to multiply the sampling weights
##' in the original dataset).
##' @details `survey.design` is a formula of the form\cr
##'    weight ~ psu_vars + strata(strata_vars),
##' where weight is the variable with the survey weights and psu
##' is the variable denoting the primary sampling unit
rescaled.bootstrap.sample.pureR <- function(survey.data,
                                      survey.design,
                                      parallel=FALSE,
                                      paropts=NULL,
                                      num.reps=1)
{

  survey.data$.internal_id <- 1:nrow(survey.data)

  design <- parse_design(survey.design)

  ## drop the "~" at the start of the formula
  psu.vars <- design$psu.formula[c(-1)][[1]]

  ## in the special case where there are no PSU vars, treat each row as
  ## its own PSU
  if (length(psu.vars)==1 & psu.vars=="1") {
    psu.vars <- as.name(".internal_id")
  }

  ## if no strata are specified, enclose the entire survey all in
  ## one stratum
  if (is.null(design$strata.formula)) {
    strata <- list(survey.data)
  } else {
    strata <- plyr::dlply(survey.data, design$strata.formula, identity)
  }

  ## get num.reps bootstrap resamples within each stratum,
  ## according to the rescaled bootstrap scheme
  ## (see, eg, Rust and Rao 1996)

  ## this llply call returns a list, with one entry for each stratum
  ## each stratum's entry contains a list with the bootstrap resamples
  ## (see the note for the inner llply call below)
  bs <- plyr::llply(strata,
              function(stratum.data) {

                ## figure out how many PSUs we have in our sample
                psu.count <- count(stratum.data,
                                   psu.vars)

                n.h <- nrow(psu.count)

                ## take m_h = n_h - 1, which is what the literature
                ## most commonly recommends
                m.h <- n.h - 1

                ## this llply call returns a list, with one entry for each bootstrap rep.
                ## each list entry has a data frame with the same number of rows as
                ## stratum.data,
                ## and with colums for
                ## the survey design variables, the .internal_id,
                ## r.hi, and weight.scale
                resamples <- plyr::llply(1:num.reps,
                                   ## for each bootstrap rep, this function returns
                                   function(rep) {

                                     ## sample m.h PSUs with replacement
                                     these.psu.samples <- sample(1:nrow(psu.count),
                                                                 m.h,
                                                                 replace=TRUE)

                                     ## r.hi is the number of times PSU i in stratum
                                     ## h was chosen in our resample
                                     r.hi <- count(data.frame(psu.row=these.psu.samples))

                                     r.hi$weight.scale <- r.hi$freq * (n.h / m.h)

                                     psu.count$freq <- NULL

                                     psu.count$r.hi <- 0
                                     psu.count$weight.scale <- 0

                                     psu.count[ r.hi$psu.row, "r.hi" ] <- r.hi$freq

                                     ## this is the factor by which we
                                     ## need to multiply
                                     ## the sampling weights
                                     ## (again, see, eg, Rust and Rao 1996, pg 292)
                                     psu.count[ r.hi$psu.row,
                                                "weight.scale" ] <- r.hi$weight.scale

                                     this.resample <- merge(stratum.data[,c(all.vars(psu.vars),
                                                                            ".internal_id")],
                                                            psu.count,
                                                            by=all.vars(psu.vars))
                                     this.resample$.internal_id.1 <- NULL

                                     return(this.resample)
                                   },
                                   .parallel=parallel,
                                   .paropts=paropts)

                return(resamples)
              })

  # now reassemble each stratum...
  res <- plyr::llply(1:num.reps,
               function(rep.idx) {
                 this.rep <- plyr::ldply(bs,
                                   function(this.stratum.samples) {
                                     return(this.stratum.samples[[rep.idx]])
                                   })

                 this.rep <- plyr::rename(this.rep,
                                    c(".internal_id"="index"))

                 return(this.rep)
               })

  return(res)
}