R/utils.R

Defines functions get_EUmap get_population_data missing_intra check_missing_ids summarise_data

utils::globalVariables(c(".", "NUTS_CODES", "NUTS_ID", "CNTR_CODE", "CNTR_ID",
                         "NAME_ENGL"))
#' Data preparation
#'
#' @param data data.frame
#' @param select_period A vector specifying the time periods to be selected (from time_period column).
#' By default, it is set to NULL, meaning all periods in the data frames will be considered.
#' @param reporter Logical. If TRUE (default) values are grouped by reporter.
#' @param partner Logical. If TRUE values are grouped by partner (default is FALSE).
#'
#' @return A data frame with the values grouped by ID for each selected time period.
#' ID corresponds to reporter if TRUE or to partner if TRUE.
#' If reporter and partner are TRUE the data frame contains both columns.
#' 
#' @noRd
#' @keywords internal
summarise_data <- function(data, select_period = NULL, reporter = TRUE, partner = FALSE) {
  time_period <- value <- NULL
  data <- data %>%
    filter(if (!is.null(select_period)) time_period %in% select_period else TRUE)
  if(any(is.na(data$value))){
    message("Note: The input data contains missing values, these will be considered as zeros.")
  }
  if(!reporter & !partner){
    stop("At least one of the arguments 'reporter' or 'partner' must be TRUE.")
  }else if(reporter & !partner){
    data <- data %>%
      group_by(reporter, time_period) %>%
      summarise(value = sum(value, na.rm = TRUE), .groups = "keep") %>%
      rename(IDs = reporter)
  }else if(!reporter & partner){
    data <- data %>%
      group_by(partner, time_period) %>%
      summarise(value = sum(value, na.rm = TRUE), .groups = "keep") %>%
      rename(IDs = partner)
  }else{
    data <- data %>%
      group_by(reporter, partner, time_period) %>%
      summarise(value = sum(value, na.rm = TRUE), .groups = "keep")
  }
  return(data)
}

#' Identify and IDs without available data
#'
#' This internal function checks for time periods within the specified 
#' data frame where the `total_available` values are zero. It prints a warning 
#' message for each ID that has zero values in the selected time periods, 
#' advising the user to select other time periods if they want to include 
#' these IDs in the analysis. The function then returns a vector of IDs that 
#' have zero values in the specified time periods.
#'
#' @param df1 A data frame containing columns `reporter`and `time_period`
#' 
#' @param df2 A data frame containing columns `reporter`and `time_period`
#' 
#' @param df3 A data frame containing columns `reporter`and `time_period`
#' 
#' @param IDs A vector specifying the time periods to be selected. 
#' 
#' @param time_period A vector specifying the time periods to be selected. 
#'
#' @return A vector of IDs that have zero values
#' 
#' @noRd
#' @keywords internal
check_missing_ids <- function(df1, df2, df3, IDs, time_period) {
  ID <- tps <- NULL
  missing_info <- data.frame(ID = character(), 
                             tp = integer(), 
                             stringsAsFactors = FALSE)
  for (tp in time_period) {
    for (id in IDs) {
      id_in_df1 <- any(df1$reporter == id & df1$time_period == tp)
      id_in_df2 <- any(df2$reporter == id & df2$time_period == tp)
      id_in_df3 <- any(df3$reporter == id & df3$time_period == tp & df3$value != 0)
      
      if (!id_in_df1 & !id_in_df2 & !id_in_df3) {
        missing_info <- rbind(missing_info, data.frame(ID = id, tp = tp, stringsAsFactors = FALSE))
      }
    }
  }
  
  if (nrow(missing_info) > 0) {
    warning_messages <- missing_info %>%
      group_by(ID) %>%
      summarise(tps = paste(unique(tp), collapse = ", ")) %>%
      mutate(message = paste(ID, "in time periods", tps)) %>%
      pull(message)
    
    missing_IDs <- unique(missing_info$ID)
    warning_message <- 
      paste("Warning: No available data for:", paste(warning_messages, collapse = "; "),
            ". Therefore,", paste(missing_IDs, collapse = ", "), 
            paste(strwrap("will be excluded from the analysis.\nPlease select other 
                          time periods if you want to include"), collapse=" "),
            paste(missing_IDs, collapse = ", "), "in the analysis.")

  }else{
    missing_IDs <- c()
    warning_message <- c()
  }
  
  return(list(missing_IDs = missing_IDs,
              warning_message = warning_message))
}

#' Identify and fill missing intra-partner combinations in a data frame
#'
#' This internal function identifies missing intra-partner combinations in a given
#' data frame and fills them with zero values. It is designed to be used within other functions.
#'
#' @param data A data frame containing partner and reporter columns.
#' @param IDs A vector specifying the IDs used for generating combinations.
#'
#' @return A data frame with missing intra-partner combinations filled with zero values.
#' 
#' @noRd
#' @keywords internal
missing_intra <- function(data, IDs) {
  IDi <- IDj <- NULL
  df <- data.frame(IDi = IDs) %>%
    expand(IDi=IDi, IDj=IDi) %>%
    filter(IDj!=IDi)
  if (!all(paste(df$IDi, df$IDj) %in% paste(data$partner, data$reporter))) {
    missing_combinations <- anti_join(df, data[,c("partner", "reporter")],
                                      by = c("IDi"="partner",
                                             "IDj"="reporter"))
    missing_combinations$value <- 0
    data <- data %>%
      bind_rows(missing_combinations %>% rename(partner = IDi, reporter=IDj))
  }
  return(data)
}


#' Human population data
#' 
#' This internal function download Eurostat human population data
#'
#' @param nuts_level Numeric, NUTS level to obtain human population data
#' @param nuts_filter Character, NUTS codes to filter
#'
#' @return A data frame with human population data for each NUTS level
#' 
#' @noRd
#' @keywords internal
get_population_data <- function(nuts_level, nuts_filter=NULL) {
  sex <- age <- NULL
  length_nuts <- nuts_level + 2 # NUTS characters
  df <- tryCatch({
    eurostat::get_eurostat("demo_r_pjangrp3", time_format = "num") %>%
      filter(
        sex == "T" &
          unit == "NR" &
          age == "TOTAL" &
          nchar(geo) == length_nuts
      )
  }, error = function(e) {
    e$message <- paste(
      "Failed to download human population data from Eurostat.\n",
      "Reason: ", e$message, "\n",
      "This may be due to the data source being temporarily unavailable.\n",
      "Sorry for the inconvenience. You can check the available datasets at:\n",
      "https://ec.europa.eu/eurostat/web/main/data/database"
    ) 
    stop(e)
  })

  if(!is.null(nuts_filter)){
    df <- df %>% filter(geo %in% nuts_filter)
  }

  return(df)
}

#' Cached function to memoize the retrieval of Eurostat data.
#'
#' This function uses memoization to cache the results of `get_population_data`,
#' avoiding repeated downloads of the data.
#'
#' @details
#' The function `cached_get_eurostat_data` memoizes the `get_population_data`
#' function using `memoise::memoise`, ensuring that the data is retrieved
#' from Eurostat only once and subsequent calls retrieve the cached results.
#'
#' @return A memoized version of `get_population_data`.
#' 
#' @noRd
#' @keywords internal
cached_get_eurostat_data <- memoise::memoise(get_population_data)

# EU map (from giscoR pkg)
get_EUmap <- function(year, nuts) {
  suppressWarnings(
    suppressMessages(
      giscoR::gisco_get_nuts(year = year, nuts_level = nuts) %>%
        select(NUTS_ID, CNTR_CODE) %>% 
        left_join(select(
          giscoR::gisco_get_countries() %>% st_drop_geometry(), 
          CNTR_ID, NAME_ENGL),
          by = join_by(CNTR_CODE == CNTR_ID)) %>% 
        rename(CNTR_NAME = NAME_ENGL)
    ))
}

cached_get_EUmap <- memoise::memoise(get_EUmap)

Try the qPRAentry package in your browser

Any scripts or data that you put into this service are public.

qPRAentry documentation built on April 12, 2025, 1:12 a.m.