R/utils.R

Defines functions is_lissy_machine get_index_hh_heads get_database database_character_to_name four_digit_year zero_range is.all.same.value return_desired_format extract_information_from_file_name is_change_possible read_file_name_format change_file_name_format define_option is.household.dataset extend_breaks_min_and_max all.values.equal add_leading_0s intervals_with_single_value n_numbers_smaller split_consecutive_values is.all.na.or.zero

Documented in add_leading_0s all.values.equal change_file_name_format database_character_to_name define_option extend_breaks_min_and_max extract_information_from_file_name four_digit_year get_database get_index_hh_heads intervals_with_single_value is.all.na.or.zero is.all.same.value is_change_possible is.household.dataset is_lissy_machine n_numbers_smaller read_file_name_format return_desired_format split_consecutive_values

# utils


#' Test if all objects are NAs or 0
#'
#' @keywords internal
is.all.na.or.zero <- function(x){

 return(length(x[!is.na(x) & x != 0]) == 0)

}

#' @importFrom magrittr %>%
#' @export
`%>%` <- magrittr::`%>%`

#' @importFrom magrittr %<>%
#' @export
`%<>%` <- magrittr::`%<>%`



#' Split vector into consecutive values
#'
#' Split a vector into groups of consecutive values.
#'
#' @param x a vector with values that should be split into groups.
#'
#' @keywords internal
split_consecutive_values <- function(x){

  split(x, cumsum(diff(c(-Inf, x)) != 1))

}



#' Number of elements smaller than input
#'
#' Computes the number of elements in 'numbers' smaller than each of the
#' elements in input 'x'.
#'
#' @param x A numeric vector with input elements
#' @param numbers A numeric vector with numbers which
#'
#' @return A vector of integer values of same length as x
#'
#' @keywords internal
n_numbers_smaller <- function(x, numbers){

  purrr::map_int(x, ~ sum(numbers < .x))

}




#' Interval has single value
#'
#' Check if interval beginning and end are equal and thus it's composed
#' of a single value.
#'
#' To be used inside 'compute_lorenz_curve()'
#'
#' @param A string with the interval.
#'
#' @return A boolean indicating if the first and second element of the interval are
#' the same value.
#'
#' @keywords internal
intervals_with_single_value <- function(x){

  x <- stringr::str_remove_all(x, pattern = "[\\(\\)\\]\\[]")

  split_x <- stringr::str_split(string = x,
                     pattern = ",",
                     simplify = TRUE)

  assertthat::assert_that(length(split_x) == 2)

  return( isTRUE(all.equal(split_x[,1], split_x[,2])) )

}



#' Add leading 0s
#'
#'
#' To be used inside 'compute_lorenz_curve()'
#'
#' @keywords internal
add_leading_0s <- function(dataset, n_0s){

  output_dataset <- tibble::tibble(percentile = rep("0", times = n_0s),
                                  wght_sum_agg_in_percentile = rep(0, times = n_0s)) %>%
    dplyr::bind_rows(dataset)

  return(output_dataset)

}



#' All values are equal
#' \lifecycle{experimental}
#'
#' Like base::all.equal(), but ignoring attributes by default.
#'
#' @keywords internal
all.values.equal <- function(target, current){

  all.equal(target = target, current = current,
            check.attributes = FALSE, check.names = FALSE)

}


#' Extend the minimum and maximum of a vector used for breaks
#'
#' The 'cut' function can sometimes fail to include the minimum and maximum
#' values into intervals due to 'the floating point trap'.
#' This function modifies the breaks by extending the
#' lower and uppermost values so it always includes the minimum and maximum.
#'
#' @param x Numeric vector with all values that intervals should include.
#' @param breaks Numeric vector with the breaks that might need to be extended.
#' @param extension A real number by which the maximum and minimum values should
#'     be extended if the originals don't include all values in 'x'.
#'
#' @keywords internal
extend_breaks_min_and_max <- function(x, breaks, extension = 0.0001){

  index_max_breaks <- length(breaks)

  if(isTRUE(all.values.equal(min(x), breaks[1])) & (min(x) < breaks[1]) ){
    breaks[1] <- breaks[1]-extension
  }

  if(isTRUE(all.values.equal(max(x), breaks[index_max_breaks] )) & (max(x) > breaks[index_max_breaks]) ){
    breaks[index_max_breaks] <- breaks[index_max_breaks]+extension
  }

  if(!all(dplyr::between(x, breaks[1], breaks[index_max_breaks]))){
    warning("Not all values of 'x' are included within the breaks after extending these.")
  }

  return(breaks)

}


#' Checks if the dataset is at household level
#'
#' This function checks if there is an 'h' in the name of a LIS or LWS dataset.
#'
#'@param dataset_name The name of a LIS or LWS dataset.
#'
#'@return A boolean.
#'
#'@keywords internal
is.household.dataset <- function(dataset_name){

  stringr::str_detect(dataset_name,
                      pattern = "[a-z]{2}\\d{2,4}[iw]{0,1}h")

}





#' Define an option
#'
#' This function defines an option by using the name of the new option and its
#' value. Can be used to define options programmatically.
#'
#'@param option_name The name of the option.
#'@param option_name The value of the option.
#'
#'
#'@keywords internal
define_option <- function(option_name, value){

  opt_ <- options(option_name)
  opt_[[option_name]] <- value
  options(opt_)

}


#' Change the format of a file name
#'
#' \lifecycle{experimental}
#'
#' Converts a character vector with a file names to a different format.
#'
#' Is used within 'implement_adjust_by_lisppp()', .
#'
#'
#' @param file_names A string with the name of the file.
#' @param to_format A string with the desired format. This can be one of the following:
#' \itemize{
#'   \item 'ccyy' Two digit country, two digit year. E.g. "fr84"
#'   \item 'ccyyyy' Two digit country, four digit year. E.g. "fr1984"
#'   \item 'ccyyl' Two digit country, two digit year, one digit level ('h' or 'p' for household/person). E.g. "fr84h"
#'   \item 'ccyyyyl' Two digit country, four digit year, one digit level. E.g. "fr1984h"
#'   \item 'ccyyd' Two digit country, two digit year, one digit database (LIS, LWS or ERFLIS - i/w/e). E.g. "fr84i"
#'   \item 'ccyyyyd' Two digit country, four digit year, one digit database (LIS, LWS or ERFLIS - i/w/e). E.g. "fr1984i"
#'   \item 'ccyydl' Two digit country, two digit year, one digit indicating LIS, LWS or ERFLIS (i/w/e/r), one digit household/person. E.g. "fr84ih"
#'   \item 'ccyyyydl' Two digit country, two digit year,one digit indicating LIS, LWS or ERFLIS (i/w/e/r), one digit household/person. E.g. "fr1984ih"
#' }
#'
#'@keywords internal
change_file_name_format <- function(file_names, to_format){

  assertthat::assert_that(to_format %in% c('ccyyyydl', 'ccyydl', 'ccyyyyd',
                                           'ccyyd', 'ccyyyyl', 'ccyyl',
                                           'ccyyyy', 'ccyy'),
                          msg = "The format passed in argument 'to_format' needs to be one of the following: /n 'ccyyyydl', 'ccyydl', 'ccyyyyd', 'ccyyd', 'ccyyyyl', 'ccyyl', 'ccyyyy', 'ccyy'")

  current_formats <- purrr::map_chr(file_names,
                                    ~read_file_name_format(.x))

  if(all(current_formats == to_format)){

    return(file_names)
  }

  assertthat::assert_that(purrr::every(current_formats,
                                         ~is_change_possible(.x, to_format)),
                          msg = "The change in format of the file name is not possible.")


  output_ <- purrr::map2_chr(file_names, current_formats,
                         .f = function(file_name, current_format, to_format){

                           temp_list_information_from_file <- extract_information_from_file_name(file_name = file_name,
                                                                                                 current_format)

                           return(return_desired_format(country = temp_list_information_from_file["country_"],
                                                        year = temp_list_information_from_file["year_"],
                                                        database = temp_list_information_from_file["database_"],
                                                        level = temp_list_information_from_file["level_"],
                                                        to_format = to_format))
                         },
                         to_format)

  return(output_)

}


#' Read the format of a file name
#'
#' @param file_name A string with the name of the file.
#'
#' @return A string indicating the format of the file.
#'
#' @keywords internal
read_file_name_format <- function(file_name){

  if(stringr::str_detect(file_name, pattern = "^\\w{2}\\d{4}[iwe][hp]$") ){
    return("ccyyyydl")
  }else if( stringr::str_detect(file_name, pattern = "^\\w{2}\\d{2}[iwe][hp]$") ){
    return("ccyydl")
  }else if( stringr::str_detect(file_name, pattern = "^\\w{2}\\d{4}[hp]$") ){
    return("ccyyyyl")
  }else if( stringr::str_detect(file_name, pattern = "^\\w{2}\\d{2}[hp]$") ){
    return("ccyyl")
  }else if( stringr::str_detect(file_name, pattern = "^\\w{2}\\d{4}[iwe]$") ){
    return("ccyyyyd")
  }else if( stringr::str_detect(file_name, pattern = "^\\w{2}\\d{2}[iwe]$") ){
    return("ccyyd")
  }else if( stringr::str_detect(file_name, pattern = "^\\w{2}\\d{4}$") ){
    return("ccyyyy")
  }else if( stringr::str_detect(file_name, pattern = "^\\w{2}\\d{2}$") ){
    return("ccyy")
  }else{
    stop(glue::glue("'{file_name}' is not a valid file name format."))
  }

}



#' Is possible to produce the desired format change
#'
#' To be used inside 'change_file_name_format()'
#'
#' @param current_format A string with the currant format.
#' @param to_format A string with the desired format.
#'
#' @return A boolean.
#'
#' @keywords internal
is_change_possible <- function(current_format, to_format){

  possible_list <- list(
    'ccyyyydl' = c('ccyydl', 'ccyyyyd', 'ccyyd', 'ccyyyyl', 'ccyyl', 'ccyyyy', 'ccyy'),
    'ccyydl' = c('ccyyyydl', 'ccyyyyd', 'ccyyd', 'ccyyyyl', 'ccyyl', 'ccyyyy', 'ccyy'),
    'ccyyyyd' = c('ccyyd', 'ccyyyy', 'ccyy'),
    'ccyyd' = c('ccyyyyd', 'ccyyyy', 'ccyy'),
    'ccyyyyl' = c('ccyyl', 'ccyyyy', 'ccyy'),
    'ccyyl' = c('ccyyyyl', 'ccyyyy', 'ccyy'),
    'ccyyyy' = c('ccyy'),
    'ccyy' = c('ccyyyy')
  )

  return((to_format == current_format) | (to_format %in% possible_list[[current_format]]))

}




#' Extract information from file name
#'
#' Extracts the information regarding country, year, database and level. The
#'   latter two only if available.
#'
#' To be used inside 'change_file_name_format()'.
#'
#' @param file_name A string with the name of the file.
#' @param current_format A string with the currant format.
#'
#' @return A list with the following elements:
#'   \itemize{
#'    \item 'country_' ('cc')
#'    \item 'year_' ('yyyy')
#'    \item 'database_' ('d')
#'    \item 'level_' ('l')
#'    }
#'
#' @keywords internal
extract_information_from_file_name <- function(file_name, current_format){

  assertthat::assert_that(current_format %in% c('ccyyyydl', 'ccyydl', 'ccyyyyd', 'ccyyd', 'ccyyyyl', 'ccyyl', 'ccyyyy', 'ccyy'),
                          msg = "Argument 'current_format' needs to be one of the following: 'ccyyyydl', 'ccyydl', 'ccyyyyd', 'ccyyd', 'ccyyyyl', 'ccyyl', 'ccyyyy', 'ccyy'")

  # extract information from current format
  country_ <- stringr::str_extract(file_name, "^[A-z]{2}")

  ## how many 'y's does 'current_format' have?
  if(stringr::str_count(current_format, "y") == 2){

    year_ <- stringr::str_match(file_name, "^[A-z]{2}([0-9]{2})")[,2]

    year_ <- four_digit_year(year_)

  }else if(stringr::str_count(current_format, "y") == 4){

    year_ <- stringr::str_match(file_name, "^[A-z]{2}([0-9]{4})")[,2]

  }

  if(stringr::str_detect(current_format, "d")){

    database_ <- stringr::str_match(file_name, "^[A-z]{2}[0-9]{2,4}([iwe]{1})")[,2]

  }else{

    database_ <- ""

  }

  if(stringr::str_detect(current_format, "l")){

    level_ <- stringr::str_extract(file_name, "[hpr]$")

  }else{

    level_ <- ""

  }

  # assert that no element is NA

  assertthat::assert_that(!is.na(country_) && !is.na(year_) && !is.na(database_) && !is.na(level_),
                          msg = glue::glue("Some extracted elements are NA. Please make sure that '{file_name}' has format '{current_format}'."))


  return(
    list(
      country_ = country_,
      year_ = year_,
      database_ = database_,
      level_ = level_
    )
  )

}




#' Return a file name with the requested format
#'
#' Combines the information passed and returns it in the format stipulated in
#'   argument 'to_format'.
#'
#' To be used inside 'change_file_name_format()'.
#'
#' @param country A string with the country as 'cc'.
#' @param year A string with four digit year as 'yyyy'.
#' @param database A string with database as 'd'.
#' @param level A string with level as 'l'.
#' @param to_format A string with the desired format.
#'
#' @return A string with the file name in the stipulated format.
#'
#' @keywords internal
return_desired_format <- function(country, year, database, level, to_format){

  if(stringr::str_count(to_format, "y") == 2){

    year_out <- stringr::str_sub(year,
                                 start = -2, end = -1)

  }else if(stringr::str_count(to_format, "y") == 4){

    year_out <- stringr::str_sub(year,
                                 start = -4, end = -1)
  }


  if(stringr::str_detect(to_format, "d")){

    database_out <- database

  }else{

    database_out <- ""

  }


  if(stringr::str_detect(to_format, "l")){

    level_out <- level

  }else{

    level_out <- ""

  }

  return(stringr::str_c(country, year_out, database_out, level_out))

}




#' Test if all elements of a vector have the same value
#'
#' @keywords internal
is.all.same.value <- function(x, na.rm = FALSE){

  assertthat::assert_that(rlang::is_vector(x) & !rlang::is_list(x),
                          msg = "The element passed to argument 'x' must be a vector and can not be a list.")

  if(!na.rm){
    assertthat::assert_that(all(!is.na(x)),
                            msg = "'x' can not have NAs if 'na.rm = FALSE'")
  }else if (na.rm & any(is.na(x))){

    x <- x[!is.na(x)]

  }

  assertthat::assert_that(!is.null(x),
                          msg = "'x' can not be null.")


  assertthat::assert_that(length(x) > 0,
                          msg = "'x' can not have length 0.")

  if(is.numeric(x)){

    return(zero_range(x)) # avoid problem with floating point

  }else{

    return(length(unique(x)) == 1)

  }

}

# copy pasted from:
# https://stackoverflow.com/questions/4752275/test-for-equality-among-all-elements-of-a-single-numeric-vector
zero_range <- function(x, tol = .Machine$double.eps ^ 0.5) {
  if (length(x) == 1) return(TRUE)
  x <- range(x) / mean(x)
  isTRUE(all.equal(x[1], x[2], tolerance = tol))
}



#' Compute four digit year from two digit one.
#'
#' \lifecycle{experimental}
#' Computes the four digit year from a string with two digits year.
#'
#' To be used within 'extract_information_from_file_name()'.
#'
#' @param two_digit_year An integer or string value with two digits (from 00 to 99).
#' @return A strign with four digit year.
#'
#' @keywords internal
four_digit_year <- function(two_digit_year){

  assertthat::assert_that( stringr::str_detect(two_digit_year, pattern = "^\\d{2}$"),
                           msg = "Argument 'two_digit_year' should be two digits (00 to 99).")

  dplyr::if_else(dplyr::between(as.numeric(stringr::str_sub(two_digit_year, 1, 2)), 0, 50),
                 true = stringr::str_c("20", stringr::str_sub(two_digit_year, 1, 2) ),
                 false = stringr::str_c("19", stringr::str_sub(two_digit_year, 1, 2)))

}



#' Converts 'i', 'w', 'e' into 'LIS', 'LWS' 'ERFLIS'
#'
#' @keywords internal
database_character_to_name <- function(x){

  x <- tolower(x)
  assertthat::assert_that(stringr::str_detect(x, pattern = "[iwe]"),
                          msg = glue::glue("'{x}' is not a valid database character. Only 'i', 'w' and 'e' are."))


  return(
    switch(EXPR = x,
           'i' = "LIS",
           'w' = "LWS",
           'e' = "ERFLIS"
    )
  )

}



#' Get the database from a list of files
#'
#' Retrieves the 'database' attribute from a list of files.
#'
#' @param lissy_files A list of lissy h-level, p-level or merged files.
#'
#' @keywords internal
get_database <- function(lissy_files){

  assertthat::assert_that(!is.null(attr(lissy_files, "database")),
                          msg = "Attribute 'database' is NULL.")

  database_ <- attr(lissy_files, "database")

  assertthat::assert_that(length(database_) == 1 && !is.null(database_),
                          msg = "Attribute 'database' must have length 1.")

  assertthat::assert_that(database_ %in% c("lis", "lws", "erflis", "i", "w", "e"),
                          msg = glue::glue("Only 'lis', 'lws', 'erflis', 'i', 'w' and 'e' are valid values for databases. Got '{database_}'."))

  return(database_)

}


#' Get the index of hh-heads
#'
#' @description
#' Returns a logical vector indicating which rows correspond to household
#'   heads.
#'
#' @param file A tibble or data.frame with a LIS or LWS file.
#'
#' @return A logical vector with TRUE for hh-heads and FALSE otherwise.
#' @keywords internal
get_index_hh_heads <- function(file){

  relation_ <- file[["relation"]]

  if(is.factor(relation_)){

    return(!is.na(relation_) & readr::parse_number(as.character(relation_)) == 1000)

  }else{

    return(!is.na(relation_) & relation_ == 1000)

  }

}


#' Is a LISSY machine
#'
#' @description
#' Returns a boolean indicating if the code is running on a LISSY machine
#'
#' @return a boolean.
#' @keywords internal
is_lissy_machine <- function(){

  Sys.info()[["effective_user"]] == "lissy"

}
JosepER/lissyrtools documentation built on Jan. 26, 2025, 10:01 p.m.