R/direct_estimation.R

Defines functions direct

Documented in direct

#' Direct estimation of disaggregated indicators
#'
#' Function \code{direct} estimates indicators only based on sample information. 
#' The variance is estimated via a naive or calibrated bootstrap. The estimation
#' is adapted from the estimation of direct indicators in package 
#' \pkg{laeken}. 
#'
#' @param y a character string indicating the variable that is used for 
#' estimating the indicators. The variable must be contained in the sample data.
#' @param smp_data survey data containing variable y as well as sampling
#' domains, and weights if selected.
#' @param smp_domains a character containing the name of a variable
#' that indicates domains in the sample data. The variable must be numeric or a
#' factor.
#' @param weights a character string containing the name of a variable for 
#' the sampling weights in the sample data. This argument is optional and defaults
#' to \code{NULL}. 
#' @param design a character string containing the name of a variable for different 
#' strata for stratified sampling designs. This argument is optional and defaults
#' to \code{NULL}. 
#' @param threshold a number defining a threshold. Alternatively, a threshold may 
#' be defined as a \code{function} of \code{y} and \code{weights} returning a 
#' numeric value. Such a function will be evaluated once for the point 
#' estimation and in each iteration of the parametric bootstrap. See Example 2 
#' for using a function as threshold.  A threshold is needed for calculation e.g. 
#' of head count ratios and poverty gaps. The argument defaults to \code{NULL}. 
#' In this case the threshold is set to 60\% of the median of the variable that 
#' is selected as \code{y} similary to the at-risk-of-poverty rate used in the 
#' EU (see also \cite{Social Protection Committee 2001}). However, any desired 
#' threshold can be chosen.
#' @param var if \code{TRUE}, estimates for the variance are calculated using a 
#' naive or calibrated bootstrap. Defaults to \code{FALSE}.
#' @param boot_type a character string containing the name of the bootstrap 
#' specification. Either a \code{"naive"} or a \code{"calibrate"} bootstrap can 
#' be used. See also \code{\link[laeken]{bootVar}}. Defaults to \code{naive}.
#' @param B a number determining the number of bootstrap populations for the 
#' bootstrap variance. Defaults to \code{50}.
#' @param seed an integer to set the seed for the random number generator. Random 
#' number generation is used in the bootstrap approach. If seed is set to 
#' \code{NULL}, seed is chosen randomly. Defaults to \code{123}.
#' @param X_calib a numeric matrix including calibration variables if the calibrated 
#' bootstrap is chosen. Defaults to NULL.
#' @param totals a numeric vector providing the population totals if the calibrated 
#' bootstrap is chosen. If a vector is chosen, the length of the vector needs to 
#' equal the number of columns in X_calib. Defaults to \code{NULL}. In this case, 
#' the sampling weights are used to calculate the totals.
#' @param custom_indicator a list of functions containing the indicators to be
#' calculated additionally. Such functions must and must only depend on the
#' target variable \code{y}, the \code{weights} and the 
#' \code{threshold} (numeric value) (see Example 3) even though some arguments 
#' might not be used in the additional function. Defaults to \code{NULL}.
#' @param na.rm if \code{TRUE}, observations with \code{NA} values are deleted 
#' from the sample data. Defaults to \code{FALSE}. 
#' @return An object of class "emdi" that provides direct estimators for regional
#' disaggregated indicators and optionally corresponding variance estimates. Generic
#' functions such as \code{\link{estimators}}, \code{\link{print}} 
#' and \code{\link{summary}} have methods that can be used
#' to obtain further information. See \code{\link{emdiObject}} for descriptions
#' of components of objects of class "emdi".
#' @details The set of predefined indicators includes the mean, median, four 
#' further quantiles (10\%, 25\%, 75\% and 90\%), head count ratio, poverty gap, 
#' Gini coefficient and the quintile share ratio. 
#' @references
#' Alfons, A. and Templ, M. (2013). Estimation of Social Exclusion Indicators
#' from Complex Surveys: The \R Package \pkg{laeken}. Journal of 
#' Statistical Software, 54(15), 1-25.  \cr \cr
#' Social Protection Committee (2001). Report on Indicators in the Field of
#' Poverty and Social Exclusions, Technical Report, European Union.
#' @seealso \code{\link{emdiObject}}, \code{\link[nlme]{lme}},
#' \code{\link{estimators.emdi}}, \code{\link{print.emdi}}, 
#' \code{\link{summary.emdi}}
#' @examples
#' \dontrun{
#' # Loading sample data
#' data("eusilcA_smp")
#'
#' # Example 1: Without weights and naive bootstrap
#' emdi_direct <- direct(y = "eqIncome", smp_data = eusilcA_smp, 
#' smp_domains = "district", weights = "weight", threshold = 11064.82, var = TRUE, 
#' boot_type = "naive", B = 50, seed = 123, X_calib = NULL, totals = NULL, 
#' na.rm = TRUE)
#' 
#' # Example 2: With function as threshold
#' emdi_direct <- direct(y = "eqIncome", smp_data = eusilcA_smp, 
#' smp_domains = "district", weights = "weight", threshold = 
#' function(y, weights){0.6 * laeken::weightedMedian(y, weights)}, na.rm = TRUE)
#' 
#' # Example 3: With custom indicators
#' emdi_direct <- direct(y = "eqIncome", smp_data = eusilcA_smp, 
#' smp_domains = "district", weights = "weight", threshold = 10859.24, 
#' var = TRUE, boot_type = "naive", B = 50, seed = 123, X_calib = NULL, 
#' totals = NULL, custom_indicator = list(my_max = function(y, weights, 
#' threshold){max(y)}, my_min = function(y, weights, threshold){min(y)}), 
#' na.rm = TRUE)
#' }
#' @export
#' @importFrom boot boot
#' @importFrom MASS ginv
#' @importFrom stats aggregate runif weighted.mean


direct <- function(y, 
                   smp_data, 
                   smp_domains, 
                   weights = NULL, 
                   design = NULL,
                   threshold = NULL,
                   var = FALSE, 
                   boot_type = "naive", 
                   B = 50,
                   seed = 123,
                   X_calib = NULL, 
                   totals = NULL,
                   custom_indicator = NULL,
                   na.rm = FALSE){
  
  
  
  direct_check(y = y, smp_data = smp_data, smp_domains = smp_domains, 
                weights = weights, design = design, threshold = threshold, 
                var = var, boot_type = boot_type, 
                B = B, seed = seed, X_calib = X_calib, totals = totals, 
                na.rm = na.rm, custom_indicator = custom_indicator)
  
  # Save call ------------------------------------------------------------------
  call <- match.call()
  
  # Data manipulation and notational framework ---------------------------------
  
  # The function framework_dir can be found in script framework_direct.R
  framework <- framework_dir(y = y,
                             smp_data = smp_data, 
                             smp_domains = smp_domains, 
                             weights = weights, 
                             threshold = threshold, 
                             custom_indicator = custom_indicator,
                             na.rm = na.rm)
  
  result_point <- mapply( FUN = point_estim_direct, 
                          direct_estimator =  framework$indicator_list,
                          indicator_name =  framework$indicator_names,
                          MoreArgs = list(
                                framework = framework,  
                                boot_type = boot_type, 
                                B = B,
                                seed = seed,
                                X_calib = X_calib, 
                                totals = totals
                          ),
                  SIMPLIFY = F
  )

  warnlist <- character()
  if (var == TRUE) {
    envir <- environment()
    res <- mapply(FUN = direct_variance,
                  direct_estimator =  framework$indicator_list,
                  indicator_name =  framework$indicator_names,
                  indicator = result_point,
                  MoreArgs = list(
                           y = framework$y, 
                           weights = framework$weights, 
                           smp_data = framework$smp_data,  
                           smp_domains = framework$smp_domains_vec,
                           design = design,
                           bootType = boot_type, 
                           B = B,
                           seed = seed,
                           X_calib = X_calib, 
                           totals = totals,
                           threshold = framework$threshold,
                           envir = envir
                           ),
                  SIMPLIFY = F
    )
    if (length(warnlist) > 0) {
      warning(paste0("For the following domains at least one bootstrap failed ",
                     ", this may be due to a very small sample size. For these domains ",
                     "the variance estimation is based on a reduced number of ",
                     "bootstrap iteration. To see the number of successful" ,
                     " bootstrap iterations for each domain and indicator, have a look at",
                     " the value successful_bootstraps in the returned object."))
    }
  } else {
    res <- result_point
  }
  ####### Putting together the emdi object
  

  ind <- cbind(res[[1]]$valueByDomain$Domain, 
              as.data.frame(lapply(res, function(erg){erg$valueByDomain[,2]}))
  )
  if (!var) {
    MSE <- NULL 
    colnames(ind) <- c("Domain", framework$indicator_names)
    } else {
      MSE <- cbind(res[[1]]$varByDomain$Domain, 
              as.data.frame(lapply(res, function(erg){erg$varByDomain[,2]}))
              )
      colnames(MSE) <- colnames(ind) <- c("Domain", framework$indicator_names)
    }
  
  if (length(warnlist) > 0) {
    failed_BS <- do.call(rbind, lapply(strsplit(warnlist, split = ":_:"), t))
    colnames(failed_BS) <- c("Domain", "Indicator")
    failed_BS <- as.data.frame(failed_BS)
    failed_BS$Domain <- factor(failed_BS$Domain, 
                               levels = levels(framework$smp_domains_vec))
    failed_BS$Indicator <- factor(failed_BS$Indicator, 
                                  levels = framework$indicator_names)
    sucInd <- B - table(failed_BS$Domain, failed_BS$Indicator)
  }
  else {
    failed_BS <- matrix(data = 0, 
                        nrow = length(levels(framework$smp_domains_vec)),
                        ncol = length(framework$indicator_names),
                        dimnames = list(levels(framework$smp_domains_vec), 
                                        framework$indicator_names)
                        )
    sucInd <- B - failed_BS
    
  }
  
   direct_out <- list(
    ind = ind, 
    MSE = MSE,
    framework = framework[c("N_dom_smp",
                         "N_smp",
                         "smp_domains",
                         "smp_domains_vec")],
    call = call,
    successful_bootstraps = sucInd
    )
  
  class(direct_out) <- c("emdi", "direct")
  return(direct_out)
}
SoerenPannier/emdi documentation built on Oct. 11, 2018, 5:30 p.m.