lissyrtools: Tools for LISSY Jobs

Documented in add_leading_0s all.values.equal ccyy_to_cname ccyy_to_yyyy change_file_name_format check_input_in_weight_argument check_missing_share convert_list_from_ccyy_to_cc_names_yyyy database_character_to_name define_option extend_breaks_min_and_max extract_information_from_file_name four_digit_year get_database get_index_hh_heads intervals_with_single_value is.all.na.or.zero is.all.same.value is_change_possible is.household.dataset is_lissy_machine n_numbers_smaller read_file_name_format remove_dname_with_missings_in_weights return_desired_format split_consecutive_values

# utils

#' Test if all objects are NAs or 0
#'
#' @keywords internal
is.all.na.or.zero <- function(x) {
  return(length(x[!is.na(x) & x != 0]) == 0)
}

#' @importFrom magrittr %>%
#' @export
`%>%` <- magrittr::`%>%`

#' @importFrom magrittr %<>%
#' @export
`%<>%` <- magrittr::`%<>%`


#' Split vector into consecutive values
#'
#' Split a vector into groups of consecutive values.
#'
#' @param x a vector with values that should be split into groups.
#'
#' @keywords internal
split_consecutive_values <- function(x) {
  split(x, cumsum(diff(c(-Inf, x)) != 1))
}


#' Number of elements smaller than input
#'
#' Computes the number of elements in 'numbers' smaller than each of the
#' elements in input 'x'.
#'
#' @param x A numeric vector with input elements
#' @param numbers A numeric vector with numbers which
#'
#' @return A vector of integer values of same length as x
#'
#' @keywords internal
n_numbers_smaller <- function(x, numbers) {
  purrr::map_int(x, ~ sum(numbers < .x))
}


#' Interval has single value
#'
#' Check if interval beginning and end are equal and thus it's composed
#' of a single value.
#'
#' To be used inside 'compute_lorenz_curve()'
#'
#' @param A string with the interval.
#'
#' @return A boolean indicating if the first and second element of the interval are
#' the same value.
#'
#' @keywords internal
intervals_with_single_value <- function(x) {
  x <- stringr::str_remove_all(x, pattern = "[\\(\\)\\]\\[]")

  split_x <- stringr::str_split(string = x, pattern = ",", simplify = TRUE)

  assertthat::assert_that(length(split_x) == 2)

  return(isTRUE(all.equal(split_x[, 1], split_x[, 2])))
}


#' Add leading 0s
#'
#'
#' To be used inside 'compute_lorenz_curve()'
#'
#' @keywords internal
add_leading_0s <- function(dataset, n_0s) {
  output_dataset <- tibble::tibble(
    percentile = rep("0", times = n_0s),
    wght_sum_agg_in_percentile = rep(0, times = n_0s)
  ) %>%
    dplyr::bind_rows(dataset)

  return(output_dataset)
}


#' All values are equal
#' \lifecycle{experimental}
#'
#' Like base::all.equal(), but ignoring attributes by default.
#'
#' @keywords internal
all.values.equal <- function(target, current) {
  all.equal(
    target = target,
    current = current,
    check.attributes = FALSE,
    check.names = FALSE
  )
}


#' Extend the minimum and maximum of a vector used for breaks
#'
#' The 'cut' function can sometimes fail to include the minimum and maximum
#' values into intervals due to 'the floating point trap'.
#' This function modifies the breaks by extending the
#' lower and uppermost values so it always includes the minimum and maximum.
#'
#' @param x Numeric vector with all values that intervals should include.
#' @param breaks Numeric vector with the breaks that might need to be extended.
#' @param extension A real number by which the maximum and minimum values should
#'     be extended if the originals don't include all values in 'x'.
#'
#' @keywords internal
extend_breaks_min_and_max <- function(x, breaks, extension = 0.0001) {
  index_max_breaks <- length(breaks)

  if (isTRUE(all.values.equal(min(x), breaks[1])) & (min(x) < breaks[1])) {
    breaks[1] <- breaks[1] - extension
  }

  if (
    isTRUE(all.values.equal(max(x), breaks[index_max_breaks])) &
      (max(x) > breaks[index_max_breaks])
  ) {
    breaks[index_max_breaks] <- breaks[index_max_breaks] + extension
  }

  if (!all(dplyr::between(x, breaks[1], breaks[index_max_breaks]))) {
    warning(
      "Not all values of 'x' are included within the breaks after extending these."
    )
  }

  return(breaks)
}


#' Checks if the dataset is at household level
#'
#' This function checks if there is an 'h' in the name of a LIS or LWS dataset.
#'
#'@param dataset_name The name of a LIS or LWS dataset.
#'
#'@return A boolean.
#'
#'@keywords internal
is.household.dataset <- function(dataset_name) {
  stringr::str_detect(dataset_name, pattern = "[a-z]{2}\\d{2,4}[iw]{0,1}h")
}


#' Define an option
#'
#' This function defines an option by using the name of the new option and its
#' value. Can be used to define options programmatically.
#'
#'@param option_name The name of the option.
#'@param option_name The value of the option.
#'
#'
#'@keywords internal
define_option <- function(option_name, value) {
  opt_ <- options(option_name)
  opt_[[option_name]] <- value
  options(opt_)
}


#' Change the format of a file name
#'
#' \lifecycle{experimental}
#'
#' Converts a character vector with a file names to a different format.
#'
#' Is used within 'implement_adjust_by_lisppp()', .
#'
#'
#' @param file_names A string with the name of the file.
#' @param to_format A string with the desired format. This can be one of the following:
#' \itemize{
#'   \item 'ccyy' Two digit country, two digit year. E.g. "fr84"
#'   \item 'ccyyyy' Two digit country, four digit year. E.g. "fr1984"
#'   \item 'ccyyl' Two digit country, two digit year, one digit level ('h' or 'p' for household/person). E.g. "fr84h"
#'   \item 'ccyyyyl' Two digit country, four digit year, one digit level. E.g. "fr1984h"
#'   \item 'ccyyd' Two digit country, two digit year, one digit database (LIS, LWS or ERFLIS - i/w/e). E.g. "fr84i"
#'   \item 'ccyyyyd' Two digit country, four digit year, one digit database (LIS, LWS or ERFLIS - i/w/e). E.g. "fr1984i"
#'   \item 'ccyydl' Two digit country, two digit year, one digit indicating LIS, LWS or ERFLIS (i/w/e/r), one digit household/person. E.g. "fr84ih"
#'   \item 'ccyyyydl' Two digit country, two digit year,one digit indicating LIS, LWS or ERFLIS (i/w/e/r), one digit household/person. E.g. "fr1984ih"
#' }
#'
#'@keywords internal
change_file_name_format <- function(file_names, to_format) {
  assertthat::assert_that(
    to_format %in%
      c(
        'ccyyyydl',
        'ccyydl',
        'ccyyyyd',
        'ccyyd',
        'ccyyyyl',
        'ccyyl',
        'ccyyyy',
        'ccyy'
      ),
    msg = "The format passed in argument 'to_format' needs to be one of the following: /n 'ccyyyydl', 'ccyydl', 'ccyyyyd', 'ccyyd', 'ccyyyyl', 'ccyyl', 'ccyyyy', 'ccyy'"
  )

  current_formats <- purrr::map_chr(file_names, ~ read_file_name_format(.x))

  if (all(current_formats == to_format)) {
    return(file_names)
  }

  assertthat::assert_that(
    purrr::every(current_formats, ~ is_change_possible(.x, to_format)),
    msg = "The change in format of the file name is not possible."
  )

  output_ <- purrr::map2_chr(
    file_names,
    current_formats,
    .f = function(file_name, current_format, to_format) {
      temp_list_information_from_file <- extract_information_from_file_name(
        file_name = file_name,
        current_format
      )

      return(return_desired_format(
        country = temp_list_information_from_file["country_"],
        year = temp_list_information_from_file["year_"],
        database = temp_list_information_from_file["database_"],
        level = temp_list_information_from_file["level_"],
        to_format = to_format
      ))
    },
    to_format
  )

  return(output_)
}


#' Read the format of a file name
#'
#' @param file_name A string with the name of the file.
#'
#' @return A string indicating the format of the file.
#'
#' @keywords internal
read_file_name_format <- function(file_name) {
  if (stringr::str_detect(file_name, pattern = "^\\w{2}\\d{4}[iwe][hp]$")) {
    return("ccyyyydl")
  } else if (
    stringr::str_detect(file_name, pattern = "^\\w{2}\\d{2}[iwe][hp]$")
  ) {
    return("ccyydl")
  } else if (stringr::str_detect(file_name, pattern = "^\\w{2}\\d{4}[hp]$")) {
    return("ccyyyyl")
  } else if (stringr::str_detect(file_name, pattern = "^\\w{2}\\d{2}[hp]$")) {
    return("ccyyl")
  } else if (stringr::str_detect(file_name, pattern = "^\\w{2}\\d{4}[iwe]$")) {
    return("ccyyyyd")
  } else if (stringr::str_detect(file_name, pattern = "^\\w{2}\\d{2}[iwe]$")) {
    return("ccyyd")
  } else if (stringr::str_detect(file_name, pattern = "^\\w{2}\\d{4}$")) {
    return("ccyyyy")
  } else if (stringr::str_detect(file_name, pattern = "^\\w{2}\\d{2}$")) {
    return("ccyy")
  } else {
    stop(glue::glue("'{file_name}' is not a valid file name format."))
  }
}


#' Is possible to produce the desired format change
#'
#' To be used inside 'change_file_name_format()'
#'
#' @param current_format A string with the currant format.
#' @param to_format A string with the desired format.
#'
#' @return A boolean.
#'
#' @keywords internal
is_change_possible <- function(current_format, to_format) {
  possible_list <- list(
    'ccyyyydl' = c(
      'ccyydl',
      'ccyyyyd',
      'ccyyd',
      'ccyyyyl',
      'ccyyl',
      'ccyyyy',
      'ccyy'
    ),
    'ccyydl' = c(
      'ccyyyydl',
      'ccyyyyd',
      'ccyyd',
      'ccyyyyl',
      'ccyyl',
      'ccyyyy',
      'ccyy'
    ),
    'ccyyyyd' = c('ccyyd', 'ccyyyy', 'ccyy'),
    'ccyyd' = c('ccyyyyd', 'ccyyyy', 'ccyy'),
    'ccyyyyl' = c('ccyyl', 'ccyyyy', 'ccyy'),
    'ccyyl' = c('ccyyyyl', 'ccyyyy', 'ccyy'),
    'ccyyyy' = c('ccyy'),
    'ccyy' = c('ccyyyy')
  )

  return(
    (to_format == current_format) |
      (to_format %in% possible_list[[current_format]])
  )
}


#' Extract information from file name
#'
#' Extracts the information regarding country, year, database and level. The
#'   latter two only if available.
#'
#' To be used inside 'change_file_name_format()'.
#'
#' @param file_name A string with the name of the file.
#' @param current_format A string with the currant format.
#'
#' @return A list with the following elements:
#'   \itemize{
#'    \item 'country_' ('cc')
#'    \item 'year_' ('yyyy')
#'    \item 'database_' ('d')
#'    \item 'level_' ('l')
#'    }
#'
#' @keywords internal
extract_information_from_file_name <- function(file_name, current_format) {
  assertthat::assert_that(
    current_format %in%
      c(
        'ccyyyydl',
        'ccyydl',
        'ccyyyyd',
        'ccyyd',
        'ccyyyyl',
        'ccyyl',
        'ccyyyy',
        'ccyy'
      ),
    msg = "Argument 'current_format' needs to be one of the following: 'ccyyyydl', 'ccyydl', 'ccyyyyd', 'ccyyd', 'ccyyyyl', 'ccyyl', 'ccyyyy', 'ccyy'"
  )

  # extract information from current format
  country_ <- stringr::str_extract(file_name, "^[A-z]{2}")

  ## how many 'y's does 'current_format' have?
  if (stringr::str_count(current_format, "y") == 2) {
    year_ <- stringr::str_match(file_name, "^[A-z]{2}([0-9]{2})")[, 2]

    year_ <- four_digit_year(year_)
  } else if (stringr::str_count(current_format, "y") == 4) {
    year_ <- stringr::str_match(file_name, "^[A-z]{2}([0-9]{4})")[, 2]
  }

  if (stringr::str_detect(current_format, "d")) {
    database_ <- stringr::str_match(
      file_name,
      "^[A-z]{2}[0-9]{2,4}([iwe]{1})"
    )[, 2]
  } else {
    database_ <- ""
  }

  if (stringr::str_detect(current_format, "l")) {
    level_ <- stringr::str_extract(file_name, "[hpr]$")
  } else {
    level_ <- ""
  }

  # assert that no element is NA

  assertthat::assert_that(
    !is.na(country_) && !is.na(year_) && !is.na(database_) && !is.na(level_),
    msg = glue::glue(
      "Some extracted elements are NA. Please make sure that '{file_name}' has format '{current_format}'."
    )
  )

  return(
    list(
      country_ = country_,
      year_ = year_,
      database_ = database_,
      level_ = level_
    )
  )
}


#' Return a file name with the requested format
#'
#' Combines the information passed and returns it in the format stipulated in
#'   argument 'to_format'.
#'
#' To be used inside 'change_file_name_format()'.
#'
#' @param country A string with the country as 'cc'.
#' @param year A string with four digit year as 'yyyy'.
#' @param database A string with database as 'd'.
#' @param level A string with level as 'l'.
#' @param to_format A string with the desired format.
#'
#' @return A string with the file name in the stipulated format.
#'
#' @keywords internal
return_desired_format <- function(country, year, database, level, to_format) {
  if (stringr::str_count(to_format, "y") == 2) {
    year_out <- stringr::str_sub(year, start = -2, end = -1)
  } else if (stringr::str_count(to_format, "y") == 4) {
    year_out <- stringr::str_sub(year, start = -4, end = -1)
  }

  if (stringr::str_detect(to_format, "d")) {
    database_out <- database
  } else {
    database_out <- ""
  }

  if (stringr::str_detect(to_format, "l")) {
    level_out <- level
  } else {
    level_out <- ""
  }

  return(stringr::str_c(country, year_out, database_out, level_out))
}


#' Test if all elements of a vector have the same value
#'
#' @keywords internal
is.all.same.value <- function(x, na.rm = FALSE) {
  assertthat::assert_that(
    rlang::is_vector(x) & !rlang::is_list(x),
    msg = "The element passed to argument 'x' must be a vector and can not be a list."
  )

  if (!na.rm) {
    assertthat::assert_that(
      all(!is.na(x)),
      msg = "'x' can not have NAs if 'na.rm = FALSE'"
    )
  } else if (na.rm & any(is.na(x))) {
    x <- x[!is.na(x)]
  }

  assertthat::assert_that(!is.null(x), msg = "'x' can not be null.")

  assertthat::assert_that(length(x) > 0, msg = "'x' can not have length 0.")

  if (is.numeric(x)) {
    return(zero_range(x)) # avoid problem with floating point
  } else {
    return(length(unique(x)) == 1)
  }
}

# copy pasted from:
# https://stackoverflow.com/questions/4752275/test-for-equality-among-all-elements-of-a-single-numeric-vector
zero_range <- function(x, tol = .Machine$double.eps^0.5) {
  if (length(x) == 1) return(TRUE)
  x <- range(x) / mean(x)
  isTRUE(all.equal(x[1], x[2], tolerance = tol))
}


#' Compute four digit year from two digit one.
#'
#' \lifecycle{experimental}
#' Computes the four digit year from a string with two digits year.
#'
#' To be used within 'extract_information_from_file_name()'.
#'
#' @param two_digit_year An integer or string value with two digits (from 00 to 99).
#' @return A strign with four digit year.
#'
#' @keywords internal
four_digit_year <- function(two_digit_year) {
  assertthat::assert_that(
    stringr::str_detect(two_digit_year, pattern = "^\\d{2}$"),
    msg = "Argument 'two_digit_year' should be two digits (00 to 99)."
  )

  dplyr::if_else(
    dplyr::between(as.numeric(stringr::str_sub(two_digit_year, 1, 2)), 0, 50),
    true = stringr::str_c("20", stringr::str_sub(two_digit_year, 1, 2)),
    false = stringr::str_c("19", stringr::str_sub(two_digit_year, 1, 2))
  )
}


#' Converts 'i', 'w', 'e' into 'LIS', 'LWS' 'ERFLIS'
#'
#' @keywords internal
database_character_to_name <- function(x) {
  x <- tolower(x)
  assertthat::assert_that(
    stringr::str_detect(x, pattern = "[iwe]"),
    msg = glue::glue(
      "'{x}' is not a valid database character. Only 'i', 'w' and 'e' are."
    )
  )

  return(
    switch(EXPR = x, 'i' = "LIS", 'w' = "LWS", 'e' = "ERFLIS")
  )
}


#' Get the database from a list of files
#'
#' Retrieves the 'database' attribute from a list of files.
#'
#' @param lissy_files A list of lissy h-level, p-level or merged files.
#'
#' @keywords internal
get_database <- function(lissy_files) {
  assertthat::assert_that(
    !is.null(attr(lissy_files, "database")),
    msg = "Attribute 'database' is NULL."
  )

  database_ <- attr(lissy_files, "database")

  assertthat::assert_that(
    length(database_) == 1 && !is.null(database_),
    msg = "Attribute 'database' must have length 1."
  )

  assertthat::assert_that(
    database_ %in% c("lis", "lws", "erflis", "i", "w", "e"),
    msg = glue::glue(
      "Only 'lis', 'lws', 'erflis', 'i', 'w' and 'e' are valid values for databases. Got '{database_}'."
    )
  )

  return(database_)
}


#' Get the index of hh-heads
#'
#' @description
#' Returns a logical vector indicating which rows correspond to household
#'   heads.
#'
#' @param file A tibble or data.frame with a LIS or LWS file.
#'
#' @return A logical vector with TRUE for hh-heads and FALSE otherwise.
#' @keywords internal
get_index_hh_heads <- function(file) {
  relation_ <- file[["relation"]]

  if (is.factor(relation_)) {
    return(
      !is.na(relation_) & readr::parse_number(as.character(relation_)) == 1000
    )
  } else {
    return(!is.na(relation_) & relation_ == 1000)
  }
}


#' Is a LISSY machine
#'
#' @description
#' Returns a boolean indicating if the code is running on a LISSY machine
#'
#' @return a boolean.
#' @keywords internal
is_lissy_machine <- function() {
  Sys.info()[["effective_user"]] == "lissy"
}


#' Is Canada LWS on the loaded data ?
#'
#' @description
#' Auxilliary function to remove any p-level Canadian datasets in LWS, from the list of loaded datasets, as it has no weights
#'
#' @return a list
#' @keywords internal
remove_dname_with_missings_in_weights <- function(list, wgt_name) {
  # p-level data from Canada/LWS have missing weights on those whose relation is not 1000 reference person
  # be aware of other datasets that might entail these missings in weights.

  if ( (all(c("relation", "inum") %in% names(list[[1]])) & !is.null(wgt_name)) &&  (sum(stringr::str_detect(names(list), "ca")) > 0)) {
    clean_list <- purrr::discard(list, stringr::str_detect(names(list), "ca"))
    message(
      "Note: Canadian LWS datasets contain missing weights for individuals not identified as the reference person. These datasets were excluded to ensure valid weighted calculations."
    )
    return(clean_list)
  } else {
    clean_list <- list
    return(clean_list)
  }
}


#' Validate the Weight Variable Input
#'
#' @description
#' This function checks the validity of the weight variable provided by the user, issuing a helpful message if the variable name
#' does not end with "wgt", as expected by LIS conventions.
#'
#' @param weight_var A character string of length 1 indicating the name of the weight variable used.
#'
#' @return No return value. The function is used for validation and emits a warning if the weight variable name seems unusual.
#' @keywords internal
check_input_in_weight_argument <- function(wgt_name) {
  if (!is.null(wgt_name) && !stringr::str_detect(wgt_name, "wgt")) {
    warning(
      "LIS advice: Please check whether you have used one of the following variables in the `wgt_name` argument:\n",
      "  - \"hwgt\", \"hpopwgt\", \"hwgta\", \"pwgt\", \"ppopwgt\", or \"pwgta\".\n\n",
      "If your data was loaded at the household level instead of the person level, you may want to generate a multiple of one of these variables, such as `nhhmem * hwgt`."
    )
  }
}


#' Get the year from the `ccyy``
#'
#' @param ccyy A character vector of ccyy's
#'
#' @return A numeric vector with the corresponding years.
#' @keywords internal
ccyy_to_yyyy <- function(ccyy) {
  yy_year <- lissyrtools::datasets %>%
    mutate(yy = stringr::str_sub(dname, 3, 4)) %>%
    select(yy, year) %>%
    unique()

  converter_vector <- yy_year %>% select(year) %>% pull()
  names(converter_vector) <- yy_year %>% select(yy) %>% pull()

  result_yyyy <- unname(converter_vector[stringr::str_sub(ccyy, 3, 4)])

  return(result_yyyy)
}


#' Get the country name from the `ccyy``
#'
#' @param ccyy A character vector of ccyy's
#'
#' @return A character vector with the corresponding country names.
#' @keywords internal
ccyy_to_cname <- function(ccyy) {
  cc_cname <- lissyrtools::datasets %>%
    mutate(cc = stringr::str_sub(dname, 1, 2)) %>%
    select(cc, cname) %>%
    unique()

  converter_vector <- cc_cname %>% select(cname) %>% pull()
  names(converter_vector) <- cc_cname %>% select(cc) %>% pull()

  result_cname <- unname(converter_vector[stringr::str_sub(ccyy, 1, 2)])

  return(result_cname)
}

#' Rearranges a list whose elements are ccyy's into country names with numeric vectors names after its years.
#'
#' @param data_list A list
#'
#' @return A list
#' @keywords internal
convert_list_from_ccyy_to_cc_names_yyyy <- function(data_list) {
  data_list  %>%
    purrr::imap_dfr(~{
      tibble(
        country = ccyy_to_cname(.y),
        year = ccyy_to_yyyy(.y),
        value = .x
      )
    }) %>%
    split(.$country) %>%
    purrr::map(~ setNames(.x$value, .x$year))
}


#' Checks the share of missing values for each column acroos a list of data frames.
#'
#' @param data_list A list
#' @param include_zeros Logical. If `TRUE`, values equal to zero are also accounted in the percentage of missings.
#'
#' @return A list
#' @keywords internal
check_missing_share <- function(data_list, include_zeros = FALSE) {
  purrr::map(data_list, function(df) {
    summarise(df, across(everything(), ~ {
      # Define missing: NA or (optional) zero
      missing_vals <- if (include_zeros) is.na(.) | . == 0 else is.na(.)
      mean(missing_vals) * 100
    }))
  })
}