R/linarpr.R
In vardpoor: Variance Estimation for Sample Surveys by the Ultimate Cluster Method

Documented in linarpr

#' Linearization of at-risk-of-poverty rate
#' 
#' @description Estimates the at-risk-of-poverty rate (defined as the proportion of persons with equalized disposable income below at-risk-of-poverty threshold) and computes linearized variable for variance estimation.
#' 
#' @param Y Study variable (for example equalized disposable income). One dimensional object convertible to one-column \code{data.table} or variable name as character, column number).
#' @param id Optional variable for unit ID codes. One dimensional object convertible to one-column \code{data.table} or variable name as character, column number or logical vector).
#' @param weight Optional weight variable. One dimensional object convertible to one-column \code{data.table} or variable name as character, column number or logical vector).
#' @param Y_thres Variable (for example equalized disposable income) used for computation and linearization of poverty threshold. One dimensional object convertible to one-column \code{data.table} or variable name as character, column number. Variable specified for \code{inc} is used as \code{income_thres} if \code{income_thres} is not defined.
#' @param wght_thres Weight variable used for computation and linearization of poverty threshold. One dimensional object convertible to one-column \code{data.table} or variable name as character, column number or logical vector. Variable specified for \code{weight} is used as \code{wght_thres} if \code{wght_thres} is not defined.
#' @param sort Optional variable to be used as tie-breaker for sorting. One dimensional object convertible to one-column \code{data.table} or variable name as character, column number.
#' @param Dom Optional variables used to define population domains. If supplied, linearization of at-risk-of-poverty threshold is done for each domain. An object convertible to \code{data.table} or variable names as character vector, column numbers as numeric vector.
#' @param period Optional variable for survey period. If supplied, linearization of at-risk-of-poverty threshold is done for each survey period. Object convertible to \code{data.table} or variable names as character, column numbers as numeric vector.
#' @param dataset Optional survey data object convertible to \code{data.table}.
#' @param percentage A numeric value in range \eqn{\left[ 0,100 \right]}{[0,100]} for \eqn{p} in the formula for at-risk-of-poverty threshold computation:
#'            \deqn{\frac{p}{100} \cdot Z_{\frac{\alpha}{100}}.}{p/100 * Z(\alpha/100).}
#'        For example, to compute at-risk-of-poverty threshold equal to 60\% of some income quantile, \eqn{p} #'should be set equal to 60.
#' @param order_quant A numeric value in range \eqn{\left[ 0,100 \right]}{[0,100]} for \eqn{\alpha} in the formula #'for at-risk-of-poverty threshold computation:
#'            \deqn{\frac{p}{100} \cdot Z_{\frac{\alpha}{100}}.}{p/100 * Z(\alpha/100).}
#'       For example, to compute at-risk-of-poverty threshold equal to some percentage of median income, \eqn{\alpha} should be set equal to 50.
#' @param var_name A character specifying the name of the linearized variable.
#' @param checking Optional variable if this variable is TRUE, then function checks data preparation errors, otherwise not checked. This variable by default is TRUE.
#'
#' @details The implementation strictly follows the Eurostat definition.
#'
#' @return A list with four objects are returned:
#'        \itemize{
#'        \item \code{quantile} - a \code{data.table} containing the estimated value of the quantile used for at-risk-of-poverty threshold estimation.
#'        \item \code{threshold} - a \code{data.table} containing the estimated at-risk-of-poverty threshold.
#'        \item \code{value} - a \code{data.table} containing the estimated at-risk-of-poverty rate (in percentage).
#'       \item \code{lin} - a \code{data.table} containing the linearized variables of the at-risk-of-poverty rate (in percentage).
#'    }
#'
#' @references
#' Working group on Statistics on Income and Living Conditions (2004) Common cross-sectional EU indicators based on EU-SILC; the gender pay gap. \emph{EU-SILC 131-rev/04}, Eurostat. \cr
#' Guillaume Osier (2009). Variance estimation for complex indicators of poverty and inequality. \emph{Journal of the European Survey Research Association}, Vol.3, No.3, pp. 167-195, ISSN 1864-3361, URL \url{https://ojs.ub.uni-konstanz.de/srm/article/view/369}.  \cr
#' Jean-Claude Deville (1999). Variance estimation for complex statistics and estimators: linearization and residual techniques. Survey Methodology, 25, 193-203, URL \url{https://www150.statcan.gc.ca/n1/pub/12-001-x/1999002/article/4882-eng.pdf}.  \cr
#' 
#'
#' @seealso \code{\link{linarpt}},
#'          \code{\link{varpoord}},
#'          \code{\link{vardcrospoor}},
#'          \code{\link{vardchangespoor}}
#'
#' @keywords Linearization
#'
#' @examples
#' library("data.table")
#' library("laeken")
#' data("eusilc")
#' dataset1 <- data.table(IDd = paste0("V", 1 : nrow(eusilc)), eusilc)
#'     
#' # Full population
#' d <- linarpr(Y = "eqIncome", id = "IDd",
#'              weight = "rb050", Dom = NULL,
#'              dataset = dataset1, percentage = 60,
#'              order_quant = 50L)
#' d$value
#'     
#' \dontrun{
#' # By domains
#' dd <- linarpr(Y = "eqIncome", id = "IDd",
#'               weight = "rb050", Dom = "db040",
#'               dataset = dataset1, percentage = 60,
#'               order_quant = 50L)
#' dd}
#' 
#' @import data.table
#' @import laeken
#' 
#' @export linarpr
        

linarpr <- function(Y, id = NULL, weight = NULL, Y_thres = NULL,
                    wght_thres = NULL, sort = NULL, Dom = NULL,
                    period = NULL, dataset = NULL, percentage = 60,
                    order_quant = 50, var_name = "lin_arpr",
                    checking = TRUE) {

   ## initializations
   if (min(dim(data.table(var_name)) == 1) != 1) {
       stop("'var_name' must have defined one name of the linearized variable")}

   if (checking) {
        percentage <- check_var(vars = percentage, varn = "percentage",
                                varntype = "numeric0100")

        order_quant <- check_var(vars = order_quant, varn = "order_quant",
                                 varntype = "numeric0100")

        Y <- check_var(vars = Y, varn = "Y", dataset = dataset,
                       ncols = 1, isnumeric = TRUE,
                       isvector = TRUE, grepls = "__")
        Ynrow <- length(Y)

        Y_thres <- check_var(vars = Y_thres, varn = "Y_thres",
                             dataset = dataset, ncols = 1,
                             Ynrow = Ynrow, mustbedefined = FALSE,
                             isnumeric = TRUE, isvector = TRUE)

        weight <- check_var(vars = weight, varn = "weight",
                            dataset = dataset, ncols = 1,
                            Ynrow = Ynrow, isnumeric = TRUE,
                            isvector = TRUE)

        wght_thres <- check_var(vars = wght_thres, varn = "wght_thres",
                                dataset = dataset, ncols = 1,
                                Ynrow = Ynrow, mustbedefined = FALSE,
                                isnumeric = TRUE, isvector = TRUE)

        sort <- check_var(vars = sort, varn = "sort",
                          dataset = dataset, ncols = 1,
                          Ynrow = Ynrow, mustbedefined = FALSE,
                          isnumeric = TRUE, isvector = TRUE)

        period <- check_var(vars = period, varn = "period",
                            dataset = dataset, Ynrow = Ynrow,
                            ischaracter = TRUE, mustbedefined = FALSE,
                            duplicatednames = TRUE)

        Dom <- check_var(vars = Dom, varn = "Dom", dataset = dataset,
                         Ynrow = Ynrow, ischaracter = TRUE,
                         mustbedefined = FALSE, duplicatednames = TRUE,
                         grepls = "__")

        id <- check_var(vars = id, varn = "id", dataset = dataset,
                        ncols = 1, Ynrow = Ynrow, ischaracter = TRUE,
                        periods = period)
    }
   dataset <- NULL

   if (is.null(Y_thres)) Y_thres <- Y
   if (is.null(wght_thres)) wght_thres <- weight


   ## computations
   ind0 <- rep.int(1, length(Y))
   period_agg <- period1 <- NULL
   if (!is.null(period)) { period1 <- copy(period)
                           period_agg <- data.table(unique(period))
                       } else period1 <- data.table(ind = ind0)
   period1_agg <- data.table(unique(period1))

   # ARPR by domain (if requested)
   quantile <- incPercentile(Y = Y_thres,
                             weights = wght_thres,
                             sort = sort, Dom = NULL,
                             period = period,
                             k = order_quant,
                             dataset = NULL,
                             checking = FALSE)

    setnames(quantile, names(quantile)[ncol(quantile)], "quantile")
    if (ncol(quantile) > 1) setkeyv(quantile, head(names(quantile), -1))
    threshold <- copy(quantile)
    threshold[, threshold := percentage / 100 * quantile]
    threshold[, quantile := NULL]

    arpr_id <- id
    if (!is.null(period)) arpr_id <- data.table(arpr_id, period)

    if (!is.null(Dom)) {
        Dom_agg <- data.table(unique(Dom))
        setkeyv(Dom_agg, names(Dom_agg))

        arpr_v <- c()
        arpr_m <- copy(arpr_id)
        for(i in 1 : nrow(Dom_agg)) {
              g <- c(var_name, paste(names(Dom), as.matrix(Dom_agg[i,]), sep = "."))
              var_nams <- do.call(paste, as.list(c(g, sep = "__")))
              ind <- as.integer(rowSums(Dom == Dom_agg[i,][ind0,]) == ncol(Dom))

              arprl <- lapply(1 : nrow(period1_agg), function(j) {
                               if (!is.null(period)) {
                                       rown <- cbind(period_agg[j], Dom_agg[i])
                                       setkeyv(rown, names(rown))
                                       rown2 <- copy(rown)
                                       rown <- merge(rown, quantile, all.x = TRUE)
                                     } else {rown <- quantile
                                             rown2 <- Dom_agg[i] }

                               indj <- (rowSums(period1 == period1_agg[j,][ind0,]) == ncol(period1))

                               arpr_l <- arprlinCalc(Y1 = Y[indj],
                                                     ids = arpr_id[indj],
                                                     wght1 = weight[indj],
                                                     indicator = ind[indj],
                                                     Y_thresh = Y_thres[indj],
                                                     wght_thresh = wght_thres[indj],
                                                     percent = percentage,
                                                     order_quants = order_quant,
                                                     quant_val = rown[["quantile"]])
                      list(arpr = data.table(rown2, arpr = arpr_l$rate_val_pr), lin = arpr_l$lin)
                      })
                 arprs <- rbindlist(lapply(arprl, function(x) x[[1]]))
                 arprlin <- rbindlist(lapply(arprl, function(x) x[[2]]))

                 setnames(arprlin, names(arprlin), c(names(arpr_id), var_nams))
                 arpr_m <- merge(arpr_m, arprlin, all.x = TRUE, by = names(arpr_id))
                 arpr_v <- rbind(arpr_v, arprs)
           }
     } else { arprl <- lapply(1:nrow(period1_agg), function(j) {
                           if (!is.null(period)) {
                                         rown <- period_agg[j]
                                         rown <- merge(rown, quantile, all.x = TRUE,
                                                       by = names(rown))
                                       } else rown <- quantile
                           ind2 <- (rowSums(period1 == period1_agg[j,][ind0,]) == ncol(period1))

                           arpr_l <- arprlinCalc(Y1 = Y[ind2],
                                                 ids = arpr_id[ind2],
                                                 wght1 = weight[ind2],
                                                 indicator = ind0[ind2],
                                                 Y_thresh = Y_thres[ind2],
                                                 wght_thresh = wght_thres[ind2],
                                                 percent = percentage,
                                                 order_quants = order_quant,
                                                 quant_val = rown[["quantile"]])
                          if (!is.null(period)) {
                                   arprs <- data.table(period_agg[j], arpr = arpr_l$rate_val_pr)
                             } else arprs <- data.table(arpr = arpr_l$rate_val_pr)
                          list(arpr = arprs, lin = arpr_l$lin)
                       })
               arpr_v <- rbindlist(lapply(arprl, function(x) x[[1]]))
               arpr_m <- rbindlist(lapply(arprl, function(x) x[[2]]))
               setnames(arpr_m, names(arpr_m), c(names(arpr_id), var_name))
            }
     arpr_m[is.na(arpr_m)] <- 0
     setkeyv(arpr_m, names(arpr_id))
     return(list(quantile = quantile, threshold = threshold, value = arpr_v, lin = arpr_m))
}



## workhorse
arprlinCalc <- function(Y1, ids, wght1, indicator,
                        Y_thresh, wght_thresh, percent,
                        order_quants = NULL, quant_val) {

    N <- dat <- eqIncome1 <- NULL

    #---- 1. Linearization of the poverty threshold ----
    arpt_calcs <- arptlinCalc(inco = Y_thresh, ids = ids,
                              wght =  wght_thresh,
                              indicator = rep(1, length(ids)),
                              order_quants = order_quants,
                              quant_val = quant_val,
                              percentag = percent)
    lin_thres <- arpt_calcs[[names(arpt_calcs)[2]]]

    thres_val <- percent / 100 * quant_val
    wt <- indicator * wght1
    N <- sum(wt)       # Estimated (sub)population size

    poor <- (Y1 <= thres_val)
    rate_val <- sum(wt * poor) / N  # Estimated poverty rate */
    rate_val_pr <- 100 * rate_val


    #---- 2. Linearization of the poverty rate -----

    h <- bandwith_plug(y = Y1, w = wt)

    f_quant2 <- gaussian_kern(inco = Y1, wt = wt,
                              quant_val = quant_val, hh = h)
    
 #****************************************************************************************
 #                       LINEARIZED VARIABLE OF THE POVERTY RATE (IN %)                  *
 #****************************************************************************************
    lin <- 100 * ((1 / N) * indicator * ((Y1 <= thres_val) - rate_val) + f_quant2 * lin_thres)

    lin_id <- data.table(ids, lin)
    return(list(rate_val = rate_val, rate_val_pr = rate_val_pr, lin = lin_id))
}