R/print.R

Defines functions print_ratio print_atkinson print_gini determine_file_level determine_weight print_indicator print_all_lissy_files print_percentiles

Documented in determine_file_level determine_weight print_all_lissy_files print_atkinson print_gini print_indicator print_percentiles print_ratio

# print.R

#' Print percentiles.
#'
#' \lifecycle{experimental}
#' Computes and displays the percentiles and cumulative percentiles of a
#'   variable.
#'
#' @param lissy_files A list of LIS or LWS files.
#' @param variable A character vector of length one.
#' @param breaks A numeric vector with specifying the percentiles that should be computed. Defaults to deciles.
#' @param weight A string with the name of the variable in 'file' that should be used as sample weights.
#' @param na.rm A boolean indicating if missing values should be ignored. Defaults to FALSE.
#' @return A tibble with percentile absolute and cummulative values.
#' @examples
#' \dontrun{
#' lissy_files <- read_lissy_files(c("fr84h", "fr94h", "fr10h"))
#' print_percentiles(lissy_files = lissy_files, variable = "dhi")
#' }
print_percentiles <- function(lissy_files, variable, breaks = seq(0, 1, 0.1), weight = NULL, na.rm = FALSE){

  purrr::imap(lissy_files, .f = function(file, file_name){

    out_ <- compute_percentiles(file = file, file_name = file_name, variable = variable, weight = weight, breaks = breaks, na.rm = na.rm)
    out_[[paste0("cum_", file_name)]] <- cumsum(out_[["value"]])/sum(out_[["value"]])

    index_col_value <- which(names(out_) == "value")
    names(out_)[2] <- paste0("value_", file_name)

    return(out_)

  }) %>%
    purrr::reduce(dplyr::left_join, by = "percentile")
}




#' Print all available files
#'
#' \lifecycle{experimental}
#' Display all available LISSY files.
#'
#' @examples
#' \dontrun{
#' print_all_lissy_files()
#' }
print_all_lissy_files <- function(database){

  if(is_lissy_machine){

    stringr::str_subset(fs::dir_ls(LIS_DIR), pattern = "\\w{2}\\d{2}\\w[phr][.]dta") %>%
      stringr::str_match("(\\w{2}\\d{2}\\w)[phr][.]dta") %>%
      .[,2] %>%
      unique()
  }
}


#' Print an indicator
#'
#' \lifecycle{experimental}
#' Computes and displays an indicator chosen with the 'indicator' argument.
#'
#' @param lissy_files A list of LIS or LWS files.
#' @param variable A character string indicating the aggregate for which the indicator needs to be computed.
#' @param indicator A character string indicating the type of indicator statistic to be computed.
#'   Currently the function supports only 'mean', 'median', 'ratio', 'gini' and 'atkinson'.
#'   If 'ratio' is selected, the user must pass a value to the 'ratio' parameter.
#'   If 'atkinson', the user mus pass a value to 'epsilon'.
#' @param weight A string with the name of the variable in 'file' that should be
#'   used as sample weights. If NULL (default), the function tries to guess the
#'   needed weight to compute the indicator. This guess is made on the information
#'   from files_level and variable_level.
#' @param ratio A vector of two numeric values between 0 and 1.Only used in the computation of 'ratio' indicator.
#'   Defines the percentiles in the numerator and denominator respectively.
#'   E.g. (0.9, 0.1) computes the 90/10 ratio.
#' @param epsilon A numeric vector of length one. Only used in the computation of 'atkinson' indicator'.
#'   The inequality adversion parameter. Needs to be epsilon > 0.
#' @param na.rm A boolean. Indicates if NAs should be ignored. Defaults to FALSE.
#' @param files_level  A string indicating the level of the file. Valid inputs are:
#'   'household', 'h', 'person' or 'p'. If NULL (default), the file level will
#'   be retrived from the 'lissy_files' attributes.
#' @param variable_level Level of the variable. Should be either 'household', 'h', 'person' or 'p'.
#'   If NULL (default), the function will try to guess the level of the variable.
#'   This is done by comparing the value in 'variable' with pre-set lists of variables.
#'
#' @return A numeric vector.
print_indicator <- function(lissy_files, variable, indicator, weight = NULL, ratio = NULL, epsilon = NULL, na.rm = FALSE, files_level = NULL, variable_level = NULL){

  assertthat::assert_that(indicator %in% c("mean", "median", "ratio", "gini", "atkinson"),
                          msg = "Currently supported indicators are 'mean', 'median', 'ratio', 'atkinson' and 'gini'.")


  if(missing(weight)){

    weight_var <- determine_weight(lissy_files, variable, files_level, variable_level)
    message(glue::glue("{weight_var} will be used as weighting variable."))

  } else {
    weight_var <- weight
  }

  level_ <- determine_file_level(lissy_files, files_level)


  # for hh-level files, multiply the variable by 'nhhmem'
  if(!is.null(level_) && level_ %in% c("household", "h")){

    assertthat::assert_that(purrr::every(lissy_files,
                                         ~"nhhmem" %in% names(.x) ),
                            msg = "All files in 'lissy_data' should contain the 'nhhmem' variable if 'print_indicator()' is used on household-level files.")

    lissy_files <- lissy_files %>%
      purrr::map(.f = function(file){
        file[[weight_var]] <- file[[weight_var]] * file[["nhhmem"]]
        return(file)
      })

  }


  if(indicator == "mean"){

    purrr::imap_dbl(lissy_files, .f = ~compute_mean(file = ..1, # .x
                                                    file_name = ..2, # .y
                                                    variable = ..3,
                                                    weight = ..4,
                                                    na.rm = ..5),
                    variable, # ..3
                    weight_var, # ..4
                    na.rm # ..5
                    )

  }else if(indicator == "median"){

    purrr::imap_dbl(lissy_files, .f = ~compute_median(file = ..1, # .x
                                                     file_name = ..2, # .y
                                                     variable = ..3,
                                                     weight = ..4,
                                                     na.rm = ..5),
                   variable, # ..3
                   weight_var, # ..4
                   na.rm # ..5
    )

  }else if(indicator == "ratio"){

    if(is.null(ratio)){
      ratio <- c(0.9, 0.1)
      warning("No input was passed to argument 'ratio' so 'ratio = c(0.9, 0.1)' was used as default.")
    }

    purrr::imap_dbl(lissy_files, .f = ~compute_ratio(file = ..1, # .x
                                                    file_name = ..2, # .y
                                                    variable = ..3,
                                                    weight = ..4,
                                                    ratio = ..5,
                                                    na.rm = ..6),
                   variable, # ..3
                   weight_var, # ..4
                   ratio, # ..5
                   na.rm # ..6
                   )

  }else if(indicator == "gini"){

    purrr::imap_dbl(lissy_files,
                    .f = ~compute_gini(file = ..1, # .x
                                       file_name = ..2, # .y,
                                       variable = ..3,
                                       weight = ..4,
                                       na.rm= ..5),
                    variable, # ..3
                    weight_var, # ..4
                    na.rm # ..5
    )

  } else if(indicator == "atkinson"){

    purrr::imap_dbl(lissy_files,
                    .f = ~compute_atkinson(file = ..1, # .x
                                           file_name = ..2, # .y
                                           variable = ..3,
                                           epsilon = ..4,
                                           weight = ..5,
                                           na.rm = ..6),
                    variable, # ..3
                    epsilon, # ..4
                    weight_var, # ..5
                    na.rm # ..6
    )
  }
}


#' Determine the name of the weight variable
#'
#' Computes the name of the weight variable based on the information from the 'lissy_files'
#'   object, 'files_level' and 'variable_level'
#'
#' @param lissy_files A list of LIS or LWS files.
#' @param variable A character string indicating the aggregate for which the indicator needs to be computed.
#' @param files_level  A string indicating the level of the file. Valid inputs are:
#'   'household', 'h', 'person' or 'p'. If NULL (default), the file level will
#'   be retrived from the 'lissy_files' attributes.
#' @param variable_level Level of the variable. Should be either 'household', 'h', 'person' or 'p'.
#'   If NULL (default), the function will try to guess the level of the variable.
#'   This is done by comparing the value in 'variable' with pre-set lists of variables.
#' @return A character vector with the name of the weight variable.
determine_weight <- function(lissy_files, variable, files_level, variable_level){

  level_ <- determine_file_level(lissy_files, files_level)

  if(!is.null(level_) && level_ %in% c("household", "h")){
    return("hwgt")
  }

  if(is.null(variable_level)){

    variable_level <- check_variable_level(variable)

  }else{

    assertthat::assert_that(variable_level %in% c("person", "household", "p", "h"),
                            msg = "Argument 'variable_level' can only take 'person', 'p', 'household' or 'h' as values.")
  }

  if(variable_level %in% c("person", "p")){

    return("pwgt")

  }else if(variable_level %in% c("household", "h")){

    return("hwgt")

  }

}


#' Determine the level of the file
#'
#' @param lissy_files A list of LIS or LWS files.
#' @param files_level  A string indicating the level of the file. Valid inputs are:
#'   'household', 'h', 'person' or 'p'. If NULL (default), the file level will
#'   be retrived from the 'lissy_files' attributes.
#'
#' @return A character vector with the level of the file
determine_file_level <- function(lissy_files, files_level){

  if(missing(files_level)){

    level_ <- get_lissy_attributes(lissy_files)[["level"]]

  }else{

    level_ <- files_level

  }

  return(level_)
}



#' Print the Gini Coefficient
#'
#' \lifecycle{experimental}
#' Computes and displays the Gini coefficient for a given variable across multiple files.
#'
#' @param lissy_files A list of LIS or LWS files.
#' @param variable A character string indicating the variable for which the Gini coefficient needs to be computed.
#' @param weight A string with the name of the variable in 'file' that should be used as sample weights.
#'   If NULL (default), the function tries to guess the needed weight to compute the Gini coefficient.
#'   This guess is made based on the information from files_level and variable_level.
#' @param na.rm A boolean. Indicates if NAs should be ignored. Defaults to FALSE.
#' @param files_level A string indicating the level of the file. Valid inputs are:
#'   'household', 'h', 'person' or 'p'. If NULL (default), the file level will be retrieved from the 'lissy_files' attributes.
#' @param variable_level Level of the variable. Should be either 'household', 'h', 'person' or 'p'.
#'   If NULL (default), the function will try to guess the level of the variable.
#'   This is done by comparing the value in 'variable' with pre-set lists of variables.
#'
#' @return A numeric vector containing the Gini coefficient for each file.
#'
#' @examples
#' \dontrun{
#' lissy_files <- read_lissy_files(c("file1", "file2"))
#' print_gini(lissy_files = lissy_files, variable = "income")
#' }
print_gini <- function(lissy_files, variable, weight = NULL, na.rm = FALSE, files_level = NULL, variable_level = NULL) {

  if(missing(weight)){
    weight_var <- determine_weight(lissy_files, variable, files_level, variable_level)
    message(glue::glue("{weight_var} will be used as weighting variable."))
  }

  level_ <- determine_file_level(lissy_files, files_level)

  # for hh-level files, multiply the variable by 'nhhmem'
  if(!is.null(level_) && level_ %in% c("household", "h")){
    assertthat::assert_that(purrr::every(lissy_files,
                                         ~"nhhmem" %in% names(.x) ),
                            msg = "All files in 'lissy_data' should contain the 'nhhmem' variable if 'print_gini()' is used on household-level files.")

    lissy_files <- lissy_files %>%
      purrr::map(.f = function(file){
        file[[weight_var]] <- file[[weight_var]] * file[["nhhmem"]]
        return(file)
      })
  }

  # Compute Gini
  purrr::imap_dbl(lissy_files,
                  .f = ~compute_gini(file = ..1, # .x
                                     file_name = ..2, # .y,
                                     variable = ..3,
                                     weight = ..4,
                                     na.rm= ..5),
                  variable, # ..3
                  weight_var, # ..4
                  na.rm # ..5
  )
}



#' Print the Atkinson Index
#'
#' \lifecycle{experimental}
#' Computes and displays the Atkinson index for a given variable across multiple files.
#'
#' @param lissy_files A list of LIS or LWS files.
#' @param variable A character string indicating the variable for which the Atkinson index needs to be computed.
#' @param epsilon A numeric vector of length one. The inequality aversion parameter. Needs to be epsilon > 0.
#' @param weight A string with the name of the variable in 'file' that should be used as sample weights.
#'   If NULL (default), the function tries to guess the needed weight to compute the Atkinson index.
#'   This guess is made based on the information from files_level and variable_level.
#' @param na.rm A boolean. Indicates if NAs should be ignored. Defaults to FALSE.
#' @param files_level A string indicating the level of the file. Valid inputs are:
#'   'household', 'h', 'person' or 'p'. If NULL (default), the file level will be retrieved from the 'lissy_files' attributes.
#' @param variable_level Level of the variable. Should be either 'household', 'h', 'person' or 'p'.
#'   If NULL (default), the function will try to guess the level of the variable.
#'   This is done by comparing the value in 'variable' with pre-set lists of variables.
#'
#' @return A numeric vector containing the Atkinson index for each file.
#'
#' @examples
#' \dontrun{
#' lissy_files <- read_lissy_files(c("file1", "file2"))
#' print_atkinson(lissy_files = lissy_files, variable = "income", epsilon = 0.5)
#' }
print_atkinson <- function(lissy_files, variable, epsilon, weight = NULL, na.rm = FALSE, files_level = NULL, variable_level = NULL) {

  if(missing(weight)){
    weight_var <- determine_weight(lissy_files, variable, files_level, variable_level)
    message(glue::glue("{weight_var} will be used as weighting variable."))
  }

  level_ <- determine_file_level(lissy_files, files_level)

  # for hh-level files, multiply the variable by 'nhhmem'
  if(!is.null(level_) && level_ %in% c("household", "h")){
    assertthat::assert_that(purrr::every(lissy_files,
                                         ~"nhhmem" %in% names(.x) ),
                            msg = "All files in 'lissy_data' should contain the 'nhhmem' variable if 'print_atkinson()' is used on household-level files.")

    lissy_files <- lissy_files %>%
      purrr::map(.f = function(file){
        file[[weight_var]] <- file[[weight_var]] * file[["nhhmem"]]
        return(file)
      })
  }

  # Compute Atkinson
  purrr::imap_dbl(lissy_files,
                  .f = ~compute_atkinson(file = ..1, # .x
                                         file_name = ..2, # .y,
                                         variable = ..3,
                                         weight = ..4,
                                         epsilon = ..5,
                                         na.rm= ..6),
                  variable, # ..3
                  weight_var, # ..4
                  epsilon, # ..5
                  na.rm # ..6
  )
}

#' Print the Ratio Index
#'
#' \lifecycle{experimental}
#' Computes and displays the ratio index for a given variable across multiple files.
#'
#' @param lissy_files A list of LIS or LWS files.
#' @param variable A character string indicating the variable for which the ratio index needs to be computed.
#' @param ratio A vector of two numeric values between 0 and 1.
#'   Defines the percentiles in the numerator and denominator respectively.
#'   E.g. (0.9, 0.1) computes the 90/10 ratio.
#' @param weight A string with the name of the variable in 'file' that should be used as sample weights.
#'   If NULL (default), the function tries to guess the needed weight to compute the ratio index.
#'   This guess is made based on the information from files_level and variable_level.
#' @param na.rm A boolean. Indicates if NAs should be ignored. Defaults to FALSE.
#' @param files_level A string indicating the level of the file. Valid inputs are:
#'   'household', 'h', 'person' or 'p'. If NULL (default), the file level will be retrieved from the 'lissy_files' attributes.
#' @param variable_level Level of the variable. Should be either 'household', 'h', 'person' or 'p'.
#'   If NULL (default), the function will try to guess the level of the variable.
#'   This is done by comparing the value in 'variable' with pre-set lists of variables.
#'
#' @return A numeric vector containing the ratio index for each file.
#'
#' @examples
#' \dontrun{
#' lissy_files <- read_lissy_files(c("file1", "file2"))
#' print_ratio(lissy_files = lissy_files, variable = "income", ratio = c(0.9, 0.1))
#' }
print_ratio <- function(lissy_files, variable, ratio, weight = NULL, na.rm = FALSE, files_level = NULL, variable_level = NULL) {

  if(missing(weight)){
    weight_var <- determine_weight(lissy_files, variable, files_level, variable_level)
    message(glue::glue("{weight_var} will be used as weighting variable."))
  }

  level_ <- determine_file_level(lissy_files, files_level)

  # for hh-level files, multiply the variable by 'nhhmem'
  if(!is.null(level_) && level_ %in% c("household", "h")){
    assertthat::assert_that(purrr::every(lissy_files,
                                         ~"nhhmem" %in% names(.x) ),
                            msg = "All files in 'lissy_data' should contain the 'nhhmem' variable if 'print_ratio()' is used on household-level files.")

    lissy_files <- lissy_files %>%
      purrr::map(.f = function(file){
        file[[weight_var]] <- file[[weight_var]] * file[["nhhmem"]]
        return(file)
      })
  }

  # Compute Ratio
  purrr::imap_dbl(lissy_files,
                  .f = ~compute_ratio(file = ..1, # .x
                                      file_name = ..2, # .y,
                                      variable = ..3,
                                      weight = ..4,
                                      ratio = ..5,
                                      na.rm= ..6),
                  variable, # ..3
                  weight_var, # ..4
                  ratio, # ..5
                  na.rm # ..6
  )
}
JosepER/lissyrtools documentation built on Jan. 26, 2025, 10:01 p.m.