R/rescaled.bootstrap.R

Defines functions rescaled.bootstrap

Documented in rescaled.bootstrap

#' @title Draw bootstrap replicates
#'
#' @description Draw bootstrap replicates from survey data using either the rescaled
#'   bootstrap for stratified multistage sampling, presented by J. Preston
#'   (2009) or the Rao-Wu boostrap by J. N. K. Rao and C. F. J. Wu (1988) 
#'
#' @param dat either data frame or data table containing the survey sample
#' @param method for bootstrap replicates, either `"Preston"` or `"Rao-Wu"` 
#' @param REP integer indicating the number of bootstraps to be drawn
#' @param strata string specifying the column name in `dat` that is used for
#'   stratification. For multistage sampling multiple column names can be
#'   specified by `strata=c("strata1","strata2","strata3")` or 
#'   `strata=c("strata1>strata2>strata3")`. See Details for more
#'   information.
#' @param cluster string specifying the column name in `dat` that is used for
#'   clustering. For instance given a household sample the column containing
#'   the household ID should be supplied.
#'   For multistage sampling multiple column names can be specified
#'   by `cluster=c("cluster1","cluster2","cluster3")` or 
#'   `cluster=c("cluster1>cluster2>cluster3")`.
#'   See Details for more information.
#' @param fpc string specifying the column name in `dat` that contains the
#'   number of PSUs at the first stage. For multistage sampling the number of
#'   PSUs at each stage must be specified by `strata=c("fpc1","fpc2","fpc3")` 
#'   or `strata=c("fpc1>fpc2>fpc3")`.
#' @param single.PSU either "merge" or "mean" defining how single PSUs need to
#'   be dealt with. For `single.PSU="merge"` single PSUs at each stage are
#'   merged with the strata or cluster with the next least number of PSUs. If
#'   multiple of those exist one will be select via random draw. For
#'   `single.PSU="mean"` single PSUs will get the mean over all bootstrap
#'   replicates at the stage which did not contain single PSUs.
#' @param return.value either "data", "replicates" and/or "selection" 
#'   specifying the return value of the function. For "data" the survey data is 
#'   returned as class `data.table`, for "replicates" only the bootstrap replicates 
#'   are returned as `data.table`. For "selection" list of data.tables with 
#'   length of `length(strata)` is returned containing 1:`REP` 0-1 columns 
#'   indicating if a PSU was selected for each sampling stage.
#' @param run.input.checks logical, if TRUE the input will be checked before applying
#'   the bootstrap procedure
#' @param already.selected list of data.tables or `NULL` where each data.table contains 
#'   columns in `cluster`, `strata` and additionally 1:`REP` columns containing
#'   0-1 values which indicate if a PSU was selected for each bootstrap replicate.
#'   Each of the data.tables corresponds to one of the sampling stages. First entry
#'   in the list corresponds to the first sampling stage and so on.
#' @param seed integer specifying the seed for the random number generator.
#' 
#' @details For specifying multistage sampling designs the column names in
#' `strata`,`cluster` and `fpc` need to be seperated by ">".\cr
#' For multistage sampling the strings are read from left to right meaning that
#' the first vector entry or column name before the first ">" is taken as the column for
#' stratification/clustering/number of PSUs at the first and the last vector entry
#' or column after
#' the last ">" is taken as the column for stratification/clustering/number of
#' PSUs at the last stage.
#' If for some stages the sample was not stratified or clustered one must
#' specify this by "1" or "I", e.g. `strata=c("strata1","I","strata3")` or 
#' `strata=c("strata1>I>strata3")` if there was
#' no stratification at the second stage or 
#' `cluster=c("cluster1","cluster2","I")` respectively 
#' `cluster=c("cluster1>cluster2>I")`
#' if there were no clusters at the last stage.\cr
#' The number of PSUs at each stage is not calculated internally and must be
#' specified for any sampling design.
#' For single stage sampling using stratification this can usually be done by
#' adding over all sample weights of each PSU by each strata-code.\cr
#' Spaces in each of the strings will be removed, so if column names contain
#' spaces they should be renamed before calling this procedure!\cr 
#' If `already.selected` is supplied the sampling of bootstrap replicates 
#' considers if speficif PSUs have already been selected by a previous survey wave.
#' For a specific `strata` and `cluster` this could lead to more than `floor(n/2)` 
#' records selected. In that case records will be de-selected such that `floor(n/2)` records,
#' with `n` as the total number of records, are selected for each 
#' `strata` and `cluster`. This parameter ist mostly used by [draw.bootstrap] in
#' order to consider the rotation of the sampling units over time.
#'
#' @return returns the complete data set including the bootstrap replicates or
#'   just the bootstrap replicates, depending on `return.value="data"` or
#'   `return.value="replicates"` respectively.
#' @export rescaled.bootstrap
#'
#' @references 
#' Preston, J. (2009). Rescaled bootstrap for stratified multistage
#'   sampling. Survey Methodology. 35. 227-234.
#' Rao, J. N. K., and C. F. J. Wu. (1988). Resampling Inference with Complex Survey Data.
#'  Journal of the American Statistical Association 83 (401): 231–41.
#' @author Johannes Gussenbauer, Eileen Vattheuer, Statistics Austria
#'
#' @examples
#' 
#' library(surveysd)
#' library(data.table)
#' setDTthreads(1)
#' set.seed(1234)
#' eusilc <- demo.eusilc(n = 1,prettyNames = TRUE)
#' 
#' eusilc[,N.households:=uniqueN(hid),by=region]
#' eusilc.bootstrap <- rescaled.bootstrap(eusilc,REP=10,strata="region",
#'                                        cluster="hid",fpc="N.households")
#' 
#' eusilc[,new_strata:=paste(region,hsize,sep="_")]
#' eusilc[,N.housholds:=uniqueN(hid),by=new_strata]
#' eusilc.bootstrap <- rescaled.bootstrap(eusilc,REP=10,strata=c("new_strata"),
#'                                        cluster="hid",fpc="N.households")
#'
#'


rescaled.bootstrap <- function(
    dat, 
    method = c("Preston", "Rao-Wu"), 
    REP = 1000, 
    strata = "DB050>1",   
    cluster = "DB060>DB030",   
    fpc = "N.cluster>N.households",
    single.PSU = c("merge", "mean"), 
    return.value = c("data", "replicates"),
    run.input.checks = TRUE, 
    already.selected = NULL, 
    seed = NULL) {

  
  InitialOrder <- N <- SINGLE_BOOT_FLAG <- SINGLE_BOOT_FLAG_FINAL <- f <-
    n_prev <- n_draw_prev <- sum_prev <- n_draw <- strata_i <- 
    V1 <- fpc_i <- . <- new.var_col <- new_var <- NULL  
  
  dat <- copy(dat)
  method <- method[1]
  # check run.input.checks
  if (!is.logical(run.input.checks)) {
    stop("run.input.checks can only be logical")
  }
  if (run.input.checks) {
    # check input data
    if (!is.data.table(dat) & !is.data.frame(dat)) {
      stop("dat needs to be a data frame or data table")
    } else {
      dat <- data.table(dat)
    }
    # check seed
    if(!is.null(seed)){
      check.input(seed, input.name = "seed", input.length=1, input.type="numeric")
    }
  }
  set.seed(seed) 
  
  # prepare input
  removeCols <- c()  
  if (is.null(cluster)) {
    cluster <- generateRandomName(20, colnames(dat))
    removeCols <- c(removeCols, cluster)
    dat[, c(cluster) := 1:.N]
  }
  
  if (is.null(strata)) {
    strata <- generateRandomName(20, colnames(dat))
    removeCols <- c(removeCols, strata)
    dat[, c(strata) := 1]
  }
  
  
  # check inputs strata, cluster, fpc
  check.input(strata, input.name = "strata", input.type="character")
  check.input(fpc, input.name = "fpc", input.type="character")
  check.input(cluster, input.name = "cluster", input.type="character")
  
  if(length(strata)==1 && strata %like% ">"){
    strata <- unlist(strsplit(strata, ">"))
    strata <- gsub("\\s", "", strata)
  } 
  
  if(length(cluster)==1 && cluster %like% ">"){
    cluster <- unlist(strsplit(cluster, ">"))
    cluster <- gsub("\\s", "", cluster)
  }
  if(length(fpc)==1 && fpc %like% ">"){
    fpc <- unlist(strsplit(fpc, ">"))
    fpc <- gsub("\\s", "", fpc)
  }
  
  single.PSU <- single.PSU[1]  
  
  # continue input checks
  if (run.input.checks) {
    
    # check REP
    check.input(REP, input.name = "REP", input.length=1, input.type="numeric",
                decimal.possible = FALSE)
    
    # check method
    method <- match.arg(method)
    
    # Warning for Rao-Wu method
    if (method == "Rao-Wu") {
      warning("The 'Rao-Wu' method should only be used when the first stage sampling is conducted without replacement. Ensure that your sampling design follows this requirement.")
    }
    
    # check design variables
    # check length of strata, cluster, fpc
    if(length(strata) != length(fpc) | length(strata) != length(cluster)){
      stop("strata, cluster, and fpc need to have the same number of arguments",
           " either separated with '>' or be character vectors of the same length")
    }
    
    # cluster can only be "1" or "I" in the final stage
    if(length(cluster)>1 && any(cluster[1:(length(cluster)-1)] %in% c("I","1"))){
      stop("Specifying 'I' or '1' for parameter 'cluster' 
           can only be set in the final sampling stage!")
    }
    
    check.values <- c(strata, cluster, fpc)
    check.values <- check.values[!check.values %in% c("1", "I")]
    check.values <- check.values[!check.values %in% colnames(dat)]
    if (length(check.values) > 0) {
      stop("dat does not contain the column(s): ", check.values)
    }
    
    # check if there are missings in fpc
    if (any(is.na(dat[, .SD, .SDcols = c(fpc)]))) {
      stop("missing values not allowed in fpc")
    }
    
    # check return.value
    if (any(!return.value %in% c("data", "replicates", "selection"))) {
      stop("return.value can only take the values 'data', 'replicates' or 'selection'")
    }
    
    # check single.PSU
    if (is.null(single.PSU) || !single.PSU %in% c("merge", "mean")) {
      warning("single.PSU was not set to either 'merge' or 'mean'!\n Bootstrap",
              " replicates for single PSUs cases will be missing!")
      single.PSU <- FALSE
    }
    
    # check for each stage that PSUs are not in mutiple strata
    for(i in seq_along(strata)){
      if(!strata[i]%in%c("1","I") & !cluster[i]%in%c("1","I")){
        countMultiple <- dat[,uniqueN(strata_i), by=c(cluster[i]),
                             env = list(strata_i = strata[i])]
        countMultiple <- countMultiple[V1>1]
        if(nrow(countMultiple)>0){
          stop("Some sampling units in ",cluster[i]," occur in multiple strata of ",strata[i])
        }
      }
    }
    
  }
  
  # check if variable f, N, n are in data.table
  overwrite.names <- c("f", "N", "n", "n_prev", "n_draw", "n_draw_prev") 
  overwrite.names <- overwrite.names[overwrite.names %in% colnames(dat)]
  if (length(overwrite.names) > 0) {
    overwrite.names.new <- paste0("ORIGINAL_", overwrite.names)
    setnames(dat, overwrite.names, overwrite.names.new)
    
    strata[strata %in% overwrite.names] <-
      overwrite.names.new[strata %in% overwrite.names]
    cluster[cluster %in% overwrite.names] <-
      overwrite.names.new[cluster %in% overwrite.names]
    fpc[fpc %in% overwrite.names] <-
      overwrite.names.new[fpc %in% overwrite.names]
  }
  
  # set index for data to return dat in correct order
  dat[, InitialOrder := .I] 
  
  # calculate bootstrap replicates
  stages <- length(strata) 
  
  n <- nrow(dat)
  
  # define values for final calculation
  n.calc <- matrix(0, nrow = n, ncol = stages) # n = sample size
  N.calc <- matrix(0, nrow = n, ncol = stages) #  N = total population
  n_draw.calc <- matrix(0, nrow = n, ncol = stages)  # n_draw = draws in different satges
  delta.calc <- array(0, dim = c(n, stages, REP)) 
  
  delta_selection <- list()
  
  for (i in 1:stages) {  
    
    # define by.val
    by.val <- strata[i]  # by.val determines the grouping for the current level
    if (i > 1) {
      by.val <- c(strata[1:(i - 1)], cluster[1:(i - 1)], strata[i]) 
    }
    by.val <- by.val[!by.val %in% c("1", "I")] # filter out placeholders
    
    # define cluster value
    clust.val <- cluster[i]
    if (clust.val %in% c("1", "I")) {
      clust.val <- paste0("ID_help_", i)  # creates helper ids if no real cluster is defined, sampling takes place at individual level (because a cluster definition is always required)
      dat[, c(clust.val) := .I, by = c(by.val)]
    }
    
    # check of fpc[i] is well defined
    check.fpc <- dat[,uniqueN(fpc_i), by=c(by.val), 
                     env = list(fpc_i = fpc[i])]  
    check.fpc <- check.fpc[V1>1]
    if (nrow(check.fpc) > 0) {
      stop("values in ", fpc[i], " do vary in some strata-cluster ",
           "combinations at sampling stage ", i)
    }
    
    singles <- dat[fpc_i>1, sum(!duplicated(clust.val)), by=c(by.val),   # finds groups in which only a single psu is present -> this is problematic for bootstrap (cannot be resampled with only one element)
                   env = list(fpc_i = fpc[i],
                              clust.val = clust.val)]
    singles <- singles[V1==1]
    if (nrow(singles) > 0) {
      higher.stages <- c(strata[1:(i - 1)], cluster[1:(i - 1)])
      by.val.tail <- tail(by.val,1)
      firstStage <- length(higher.stages) == 0
      if (firstStage) {
        higher.stages <- by.val
      }
      singles <- unique(subset(singles, select = higher.stages))
      
################ SINGLE BEGIN ################ ##############################################################################################
      
      if (method == "Preston") {
        if (single.PSU == "merge") { # strata with only one PSU are merged with the nearest smaller stratum to avoid issues in the bootstrap procedure.
          
          if (return.value == "data") {
            by.val.orig <- paste0(by.val.tail, "_ORIGINALSINGLES")
            fpc_orig <- paste0(fpc[i], "_ORIGINALSINGLES")
            
            set(dat, i = by.val.orig, value = dat[[by.val.tail]])
            set(dat, i = fpc_orig, value = dat[[fpc[i]]])
          }
          
          setkeyv(dat, higher.stages)
          next.PSU <- dat[singles, .(N = sum(!duplicated(clust.val))), by = c(by.val),
                          on = c(higher.stages),
                          env = list(clust.val = clust.val)]
          
          if(nrow(next.PSU)==1){
            stop("Only 1 single PSU in first sampling stage! Cannot combine single PSU.\nPlease manually combine sampling STRATA to eliminate single PSU!")
          }
          
          new.var <- paste0(tail(by.val, 1), "_NEWVAR")
          
          next.PSU[,c(new.var) := next_minimum(N, by.val.tail), by = c(higher.stages),
                   env = list(by.val.tail = by.val.tail)]
          if(any(is.na(next.PSU[[new.var]]))){
            if(firstStage){
              next.PSU[is.na(new.var_col), c(new.var) := head(higher.stages, 1),      
                       env = list(new.var_col = new_var,
                                  higher.stages = higher.stages)]
            }else{
              next.PSU[is.na(new.var_col), c(new.var) := head(by.val.tail,1),      
                       env = list(new.var_col = new_var,
                                  by.val.tail = by.val.tail)]
            }
          }
          
          # select singel PSUs and PSUs to join with
          next.PSU <- next.PSU[N == 1 | new.var == by.val.tail,
                               env = list( new.var = new.var,
                                           by.val.tail = by.val.tail)]
          dat <- merge(dat, next.PSU[, mget(c(by.val, new.var))],
                       by = c(by.val), all.x = TRUE)
          
          # sum over margins
          fpc.i_ADD <- paste0(fpc[i], "_ADD")
          dat[!is.na(new.var), c(fpc.i_ADD) := 
                sum(fpc_i[!duplicated(by.val.tail)]), by = c(new.var),
              env = list(new.var = new.var,
                         fpc_i = fpc[i],
                         by.val.tail = by.val.tail)]
          
          # assign to new group
          dat[!is.na(new.var), c(by.val.tail) := new.var, 
              env = list(new.var = new.var)]
          dat[ c(fpc[i]) := fpc_i[is.na(new.var)][1], by=c(by.val),
               env = list(new.var = new.var)]
          dat[!is.na(new.var), c(fpc[i]) := fpc.i_ADD,
              env = list(new.var = new.var,
                         fpc.i_ADD = fpc.i_ADD)]
          
          dat[, c(new.var, paste0(fpc[i], "_ADD")) := NULL]
        } 
        
        else if (single.PSU == "mean") {  # if single = mean: Marks the affected observations with a SINGLE_BOOT_FLAG in order to simply take the average of other replicates later in the bootstrap process.
          
          singles[, SINGLE_BOOT_FLAG := paste(higher.stages, .GRP, sep = "-"),  
                  by = c(higher.stages)]
          
          dat <- merge(dat, singles, by = c(higher.stages), all.x = TRUE)
          if (!"SINGLE_BOOT_FLAG_FINAL" %in% colnames(dat)) {
            dat[, SINGLE_BOOT_FLAG_FINAL := SINGLE_BOOT_FLAG]
          } else {
            dat[is.na(SINGLE_BOOT_FLAG_FINAL),
                SINGLE_BOOT_FLAG_FINAL := SINGLE_BOOT_FLAG]
          }
          dat[, SINGLE_BOOT_FLAG := NULL]
          
        } 
        
        else {
          message("Single PSUs detected at the following stages:\n")
          dat.print <- dat[,sum(!duplicated(clust.val)), by=c(by.val),
                           env = list(clust.val = clust.val)]
          dat.print <- dat.print[V1==1,.SD,.SDcols=c(by.val)]
          print(dat.print)
        }
        
      } else if (method == "Rao-Wu") {
        if (single.PSU == "merge") {
          
          if (return.value == "data") {
            by.val.orig <- paste0(by.val.tail, "_ORIGINALSINGLES")
            fpc_orig <- paste0(fpc[i], "_ORIGINALSINGLES")
            
            set(dat, i = by.val.orig, value = dat[[by.val.tail]])
            set(dat, i = fpc_orig, value = dat[[fpc[i]]])
          }
          
          setkeyv(dat, higher.stages)
          next.PSU <- dat[singles, .(N = sum(!duplicated(clust.val))), by = c(by.val),
                          on = c(higher.stages),
                          env = list(clust.val = clust.val)]
          
          if(nrow(next.PSU)==1){
            stop("Only 1 single PSU in first sampling stage! Cannot combine single PSU.\nPlease manually combine sampling STRATA to eliminate single PSU!")
          }
          
          new.var <- paste0(tail(by.val, 1), "_NEWVAR")
          
          next.PSU[,c(new.var) := next_minimum(N, by.val.tail), by = c(higher.stages),
                   env = list(by.val.tail = by.val.tail)]
          if(any(is.na(next.PSU[[new.var]]))){
            if(firstStage){
              next.PSU[is.na(new.var_col), c(new.var) := head(higher.stages, 1),
                       env = list(new.var_col = new_var,
                                  higher.stages = higher.stages)]
            }else{
              next.PSU[is.na(new.var_col), c(new.var) := head(by.val.tail,1),
                       env = list(new.var_col = new_var,
                                  by.val.tail = by.val.tail)]
            }
          }
          
          # select singel PSUs and PSUs to join with
          next.PSU <- next.PSU[N == 1 | new.var == by.val.tail,
                               env = list( new.var = new.var,
                                           by.val.tail = by.val.tail)]
          dat <- merge(dat, next.PSU[, mget(c(by.val, new.var))],
                       by = c(by.val), all.x = TRUE)
          
          # sum over margins
          fpc.i_ADD <- paste0(fpc[i], "_ADD")  
          dat[!is.na(new.var), c(fpc.i_ADD) := 
                sum(fpc_i[!duplicated(by.val.tail)]), by = c(new.var),
              env = list(new.var = new.var,
                         fpc_i = fpc[i],
                         by.val.tail = by.val.tail)]
          
          # assign to new group
          dat[!is.na(new.var), c(by.val.tail) := new.var, 
              env = list(new.var = new.var)]
          dat[ c(fpc[i]) := fpc_i[is.na(new.var)][1], by=c(by.val),
               env = list(new.var = new.var)]
          dat[!is.na(new.var), c(fpc[i]) := fpc.i_ADD,
              env = list(new.var = new.var,
                         fpc.i_ADD = fpc.i_ADD)]
          
          dat[, c(new.var, paste0(fpc[i], "_ADD")) := NULL]
        } 
        
        else if (single.PSU == "mean") {
          
          dat[!is.na(SINGLE_BOOT_FLAG_FINAL), c(bootRep) := lapply( 
            .SD,
            function(z) {  # Ensure that the single PSU is pulled at least once
              if (all(is.na(z))) {
                avg <- mean(z, na.rm = TRUE)
                
                if (is.na(avg)) avg <- 0  # If all values of a replica column are NA, a default value (e.g. 0) is used -> Rao-Wu method is designed to deal with uncertain data
                return(rep(avg, length(z)))  
              } else {
                return(z)
              }
            }
          ), by = SINGLE_BOOT_FLAG_FINAL, .SDcols = c(bootRep)]
        }
        else {
          message("Single PSUs detected at the following stages:\n")
          dat.print <- dat[,sum(!duplicated(clust.val)), by=c(by.val),
                           env = list(clust.val = clust.val)]
          dat.print <- dat.print[V1==1,.SD,.SDcols=c(by.val)]
          print(dat.print)
          
        }
        
      }
    }
################ SINGLE END ################ ##############################################################################################
    
    # get Stage
    if (i == 1) { 
      dati <- dat[,.(N=fpc_i[1], 
                     clust.val = unique(clust.val),
                     f = 1, 
                     n_prev = 1, n_draw_prev = 1, 
                     sum_prev = 1),  
                  by = c(by.val),
                  env = list(fpc_i = fpc[i],
                             clust.val = clust.val)]
      
    } else { # In level 2 or higher: The information from the previous draw is now transferred
      dati <- dat[,.(N=fpc_i[1], 
                     clust.val = unique(clust.val),
                     f = f[1], 
                     n_prev = n_prev[1], 
                     n_draw_prev = n_draw_prev, 
                     sum_prev = sum_prev[1]),
                  by = c(by.val),
                  env = list(fpc_i = fpc[i],
                             clust.val = clust.val)]
    }
    
    deltai <- paste0("delta_", i, "_", 1:REP) 
    dati[, n := .N, by = c(by.val)]  
    
    if (method == "Preston") {
      dati[,n_draw:=floor(n/2),by = c(by.val)]   
    } else {
      dati[, n_draw := n - 1, by = c(by.val)] 
    }
    
    
    if (nrow(dati[n_draw == 0]) > 0) {  
      stop("Resampling 0 PSUs should not be possible! Please report bug in ",
           "https://github.com/statistikat/surveysd")
    }
    
    # do bootstrap for i-th stage
    if (!is.null(already.selected)) { 
      # if already.selected was supplied, consider already selected units
      dati_selected <- already.selected[[i]]
      on_cols <- c(by.val, clust.val)[c(by.val, clust.val) %in% colnames(dati_selected)]
      dati[dati_selected, c(deltai) := mget(deltai), on = c(on_cols)]
      
      dati[, c(deltai) := lapply(.SD, function(delta, n, n_draw) {
        
        # Handle method-specific sampling
        if (method == "Rao-Wu") {  
          (draw.with.replacement(n[1], n_draw[1], delta = delta))
        } else if (method == "Preston") {
          (draw.without.replacement(n[1], n_draw[1], delta = delta))
        } else {
          stop("Invalid method specified: ", method)
        }
      }, n = n, n_draw = n_draw), by = c(by.val), .SDcols = c(deltai)]
      
      
      dati_check <- dati[, lapply(.SD, function(z, n_draw) {
        sum(z) == n_draw[1]
        
        if (length(z) == 0 || is.null(z)) {
          print("z ist leer oder NULL!")
          return(FALSE)  # Gebe FALSE zurück, um den Fehler zu vermeiden
        }
        
        # Prüfe die Bedingung
        result <- sum(z, na.rm = TRUE) == n_draw[1]
        # print(paste("Result:", result))
        
        return(as.logical(result))
        
        
      }, n_draw = n_draw), by = c(by.val), .SDcols = c(deltai)]
      
      if (any(!dati_check[, .SD, .SDcols = c(deltai)])) {
        stop("Wrong number of units selected! Please report bug in ",
             "https://github.com/statistikat/surveysd")
      }
      
    } else {
      # Do sampling based on the specified method
      if (method == "Preston") {
        
        # Sampling without replacement
        dati[, c(deltai) := as.data.table(
          replicate(REP, draw.without.replacement(n[1], n_draw[1]), simplify = FALSE)
        ), by = c(by.val)]
      } else if (method == "Rao-Wu") {
        print("Debugging draw.without.replacement:")
        print(paste("n:", n))
        print(paste("n_draw:", n_draw))
        
        # Sampling with replacement
        dati[, c(deltai) := as.data.table(
          replicate(REP, draw.with.replacement(n[1], n_draw[1]), simplify = FALSE)
        ), by = c(by.val)]
      } else {
        stop("Invalid method specified. Please use 'Preston' or 'Rao-Wu'.")
      }
    }
    
    # merge with data
    if(i>1){ 
      dat[,c("f","n_prev","n_draw_prev","sum_prev"):=NULL]
    }
    
    dat <- merge(dat,dati,by=c(by.val,clust.val)) 
    setorder(dat,InitialOrder) 
    
    # prepare output for return.value %in% "selection"
    keep_cols <- c(cluster, strata, deltai)
    keep_cols <- keep_cols[keep_cols %in% colnames(dat)]
    delta_selection <- c(delta_selection,
                         list(subset(dat, select = keep_cols)))
    
    # only matrices and arrays needed for final calculation
    n.calc[, i] <- dat[, n]
    N.calc[, i] <- dat[, N]
    n_draw.calc[, i] <- dat[, n_draw]
    delta.calc[, i, ] <- as.matrix(dat[, mget(deltai)])
    
    dat[, f := n / N * f] 
    dat[, n_prev := n * n_prev]
    dat[, n_draw_prev := n_draw * n_draw_prev]
    
    dat[, c("n", "N", deltai, "n_draw") := NULL] 
    
    rm(dati);gc() 
  }
  
  dat[,c("f","n_prev","n_draw_prev","sum_prev"):=NULL]
  
  names(delta_selection) <- paste0("SamplingStage",1:stages)
  
  bootRep <- paste0("bootRep", 1:REP) 
  
  dat[, c(bootRep) := as.data.table(calc.replicate( # call function calc.replicate -> calculate bootstrap replicate for each repetition
    method = method,
    n = n.calc, 
    N = N.calc, 
    n_draw = n_draw.calc, 
    delta = delta.calc))] 
  
  if (single.PSU == "mean") {
    dat[!is.na(SINGLE_BOOT_FLAG_FINAL), c(bootRep) := lapply(  
      .SD,
      function(z) {
        mean(z, na.rm = TRUE)
      }
    ), by = SINGLE_BOOT_FLAG_FINAL, .SDcols = c(bootRep)]
  }
  
  setkey(dat, InitialOrder)  
  if (length(removeCols) > 0) {
    dat[, c(removeCols) := NULL]
  }
  
  if ("data" %in% return.value) {
    # get original values for PSUs and fpc - if singles PSUs have been detected and merged
    if (single.PSU == "merge") {  
      c.names <- colnames(dat)
      c.names <- c.names[grepl("_ORIGINALSINGLES", c.names)]
      if (length(c.names) > 0) {
        drop.names <- gsub("_ORIGINALSINGLES", "", c.names)
        dat[, c(drop.names) := NULL]
        setnames(dat, c(c.names), drop.names)
      }
    }
    dat[, c("InitialOrder") := NULL]  

    if (length(overwrite.names) > 0) {
      setnames(dat, overwrite.names.new, overwrite.names)
    }
    
    output_bootstrap <- list(data = dat)
  } else if ("replicates" %in% return.value) {
    output_bootstrap <- list(replicates = dat[, mget(bootRep)]) 
  }
  
  if("selection" %in% return.value){
    if(length(return.value)>1){
      output_bootstrap <- c(output_bootstrap, list(selection = delta_selection))
    }else{
      output_bootstrap <- list(selection = delta_selection)
    }
  }
  
  if(length(output_bootstrap) == 1){
    output_bootstrap <- output_bootstrap[[1]]
  }
  return(output_bootstrap)
}



# Helper Functions 

select.nstar <- function(n, N, f, n_prev, n_draw_prev, lambda_prev,
                         sum_prev = NULL, new.method) {
  if (n == 1) {
    return(1L)
  }
  
  if (!is.null(sum_prev)) {
    n_draw <- (sum_prev) ^ 2 / ((1 - (n / N)) * n_prev * f +
                                  (sum_prev) ^ 2) * n
    n_draw <- floor(n_draw)
  } else {
    if (new.method) {
      n_draw <- (n * n_draw_prev) / (n_prev * f * (1 - n / N) + n_draw_prev)
      n_draw <- floor(n_draw)
    } else {
      n_draw <- floor(n / 2)
    }
  }
  
  return(n_draw) 
}

# Sampling WITHOUT replacement (Preston method)
draw.without.replacement <- function(n, n_draw, delta = NULL) {   
  
  # if no units have been selected prior
  if(is.null(delta)){
    delta <- rep(c(1.0, 0.0), c(n_draw, n - n_draw)) 
    if (length(delta) > 1) {
      delta <- sample(delta)
    }
  }else{
    # if units have already been selected
    n_selected <- sum(delta,na.rm=TRUE)  
    delta_new_unit <- is.na(delta) 
    
    # check if only 1 unit needs to be selected
    if(sum(delta_new_unit)==1){
      if(n_selected<n_draw){
        delta <- set.random2NA(delta,value2NA = 0)  
      }else{
        delta <- set.random2NA(delta,value2NA = 1) 
      }
      
      n_selected <- sum(delta,na.rm=TRUE)
      delta_new_unit <- is.na(delta)
    }
    
    if(n_draw<=n_selected){
      add1 <- as.numeric(sum(delta_new_unit)>1) 
      nChanges <- n_selected - n_draw + add1
      
      if(nChanges == sum(delta_new_unit==FALSE)){ 
        delta <- set.random2NA(delta, value2NA=1,
                               nChanges = n_selected - n_draw + add1) 
      }else{
        delta <- change.random.value(delta, changeVal=1,
                                     nChanges = n_selected - n_draw + add1)
      }
      delta_new_unit <- is.na(delta)
      n_selected <- sum(delta,na.rm=TRUE)
    }
    
    if((n_draw-n_selected) >= sum(delta_new_unit)){
      add1 <- as.numeric(sum(delta_new_unit)>1)
      nChanges <- n_draw-n_selected-sum(delta_new_unit) + add1
      delta <- change.random.value(delta, changeVal=0,
                                   nChanges = nChanges)
      n_selected <- sum(delta,na.rm=TRUE)
      delta_new_unit <- is.na(delta)
    }
    
    n_draw <- n_draw - n_selected
    delta_rest <- rep(c(1.0, 0.0), c(n_draw, sum(delta_new_unit)-n_draw))
    if (length(delta_rest) > 1) {
      delta_rest <- sample(delta_rest)
    }
    delta[delta_new_unit==TRUE] <- delta_rest
    
  }
  return(delta)
}


# Sampling WITH replacement (Rao-Wu method)
draw.with.replacement <- function(n, n_draw, delta = NULL) { 
  
  if (is.null(delta)) {
    delta <- rep(0, n)  # Set all units to 0 (not yet drawn)
    delta[is.na(delta)] <- 0
  }
  
  if (is.na(n_draw) || n_draw > n) {
    stop("Error: Invalid n_draw value.")
  }
  
  n_selected <- sum(delta, na.rm = TRUE)
  
  # 1. draw additional units if less than n_draw are selected
  if (n_selected < n_draw) {
    n_needed <- n_draw - n_selected
    additional_draws <- sample(1:n, size = n_needed, replace = TRUE)
    
    for (i in additional_draws) {
      if (is.na(delta[i])) {
        delta[i] <- 1  
      } else {
        delta[i] <- delta[i] + 1  
      }
    }
  }
  
  # 2 Reduce excess units if more than n_draw are selected
  if (n_selected > n_draw) {
    over_selected <- n_selected - n_draw

    if (over_selected > 0) {
      indices <- which(delta > 0)
      if (length(indices) >= over_selected) {
        delta[indices[1:over_selected]] <- delta[indices[1:over_selected]] - 1
      } else {
        stop("Error: Too many units to remove from delta.")
      }
    }
  }
  
  # 3. handle NA values in delta, if available
  if (any(is.na(delta))) {
    delta[is.na(delta)] <- 0   
  }
  
  return(delta)
}


change.random.value <- function(delta,changeVal=0,nChanges=1){
  
  changeVal2 <- fifelse(changeVal==0,1,0) 
  set2NA <- which(delta==changeVal & !is.na(delta))
  if(length(set2NA)>1){
    set2NA <- sample(set2NA,min(nChanges,length(set2NA)))
  }
  delta[set2NA] <- changeVal2
  return(delta)
}



set.random2NA <- function(delta,value2NA=0,nChanges=1){
  
  set2NA <- which(delta==value2NA & !is.na(delta))
  if(length(set2NA)>1){
    set2NA <- sample(set2NA,min(nChanges,length(set2NA)))
  }
  delta[set2NA] <- NA
  return(delta)
}


calc.replicate <- function(n, N, n_draw, delta , method = "Preston") {
  
  p <- ncol(n)  
  dimdelta <- dim(delta)
  
  for (i in 1:p) {
    if (method == "Preston") {
      if (i == 1) { # 
        lambda <- sqrt(n_draw[, 1] *
                         (1 - n[, 1] / N[, 1]) / (n[, 1] - n_draw[, 1]))
        rep_out <- 1 - lambda + lambda * n[, i] / n_draw[, i] * delta[, i, ] 
      } else if (i == 2) {
        lambda <- (1 - n[, i] / N[, i]) / (n[, i] - n_draw[, i])
        lambda <- sqrt((n[, i - 1] / N[, i - 1]) * n_draw[, i] * lambda)
        rep_out <- rep_out + lambda *                                       
          (sqrt(n[, i - 1] / n_draw[, i - 1]) * delta[, i - 1, ]) *
          (n[, i] / n_draw[, i] * delta[, i, ] - 1)
      } else {
        lambda <- (1 - n[, i] / N[, i]) / (n[, i] - n_draw[, i])
        lambda <- sqrt(rowProds(n[, 1:(i - 1)] / N[, 1:(i - 1)]) *
                         n_draw[, i] * lambda)
        prod_val <- matrix(0, ncol = dimdelta[3], nrow = dimdelta[1])
        for (r in 1:dimdelta[3]) {
          prod_val[, r] <- rowProds(sqrt(n[, 1:(i - 1)] / n_draw[, 1:(i - 1)]) *
                                      delta[, 1:(i - 1), r])
          
        }
        
        rep_out <- rep_out + lambda * prod_val * (n[, i] / n_draw[, i] *      
                                                    delta[, i, ] - 1)
      }
      
    } else if (method == "Rao-Wu") {
      
      n_h <- n[, i]           
      m_h <- n_h - 1        
      f_h <- n_h / N[, i]     
      
      if (any(f_h > 0.1)) {
        warning("Sampling Fraction is too big for method = 'Rao-Wu', choose method = 'Preston' instead")
      }
      
      w_hi <- N[, i] / n_h 
      
      # Scaling factor λ determines the variance correction
      lambda <- sqrt(m_h * (1 - f_h) / (n_h - 1))
      
      # r_hi_star: Create replicas by drawing (n_h - 1) units without putting them back
      r_hi_star <- delta[, i, ]
      w_hi_star <- (1 - lambda + lambda * (n_h / m_h) * r_hi_star) #* w_hi
      
      if (i == 1) {
        rep_out <- w_hi_star
      } else {
        rep_out <- rep_out * w_hi_star  
      }
      
    }
  }
  return(rep_out)
}


next_minimum <- function(N, by) {
  N_notOne <- N != 1
  by <- by[N_notOne][which.min(N[N_notOne])]
  if (length(by) > 1) {
    by <- sample(by, 1)
  }
  return(by)
}

Try the surveysd package in your browser

Any scripts or data that you put into this service are public.

surveysd documentation built on Aug. 26, 2025, 5:08 p.m.