R/prepare_exposure.R
In healthiar: Quantifying and Monetizing Health Impacts Attributable to Exposure

Documented in prepare_exposure

#' Prepare exposure data

# DESCRIPTION ##################################################################
#' @description
#' This function prepares tabular population exposure data compatible with the \code{attribute()} and \code{compare()} functions,
#' based on gridded pollution concentration data and polygon data representing geographic units.
#' If population data is provided, the function calculates an average concentration value in each geographic unit
#' that is weighted with the population number at each location.
#' If no population data is provided, the function calculates the simple spatial average concentration in each geographic unit.

# ARGUMENTS ####################################################################
#' @param poll_grid \code{SpatRaster} of the pollution concentration data.
#' @param geo_units \code{sf} of the geographic units or sub-units.
#' @param population \code{Integer vector} of the total population number in each geographic sub-unit.
#' @param pop_grid \code{SpatRaster} of the gridded population data.
#' @param geo_id_micro \code{Numeric or string vector} of the IDs of the geographic units. Required if \code{pop_grid} is given or if no population data is provided.
#' @param geo_id_macro \code{Numeric or string vector} of the higher-level IDs of the geographic units the sub-unit belong to and will be aggregated at. Required if \code{population} is provided.
#' @param bin_width \code{Numeric} specifying the width of the population exposure bins.

# VALUE ########################################################################
#' @return
#' This function returns a \code{list} containing:
#' @returns
#' 1) \code{main} (\code{list}) containing the main results as vectors;
#' \itemize{
#'  \item \code{geo_id_micro} of \code{geo_id_macro} (\code{string} column) containing the (higher-level) geographic IDs of the assessment
#'  \item \code{exposure_mean} (\code{numeric} column) containing the (population-weighted) mean exposure
#'  \item \code{population_total} (\code{integer} column) containing the total population in each geographic unit, if population data was provided
#' }
#' @returns
#' 2) \code{detailed} (\code{list}) containing detailed (and interim) results.

# EXAMPLES #####################################################################
#' @examples
#' # Goal: determine population-weighted mean PM2.5 exposure for several
#' # neighborhoods of Brussels (Belgium)
#'
#' path <- system.file("extdata", "exdat_pwm_1.tif", package = "healthiar")
#' exdat_pwm_1 <- terra::rast(path)
#'
#' pwm <- prepare_exposure(
#'   poll_grid = exdat_pwm_1, # Formal class SpatRaster
#'   geo_units = exdat_pwm_2, # sf of the geographic sub-units
#'   population = sf::st_drop_geometry(exdat_pwm_2$population), # population per geographic sub-unit
#'   geo_id_macro = sf::st_drop_geometry(exdat_pwm_2$region) # higher-level IDs to aggregate at
#' )
#'
#' pwm$exposure_main # population-weighted mean exposures for the (higher-level) geographic units

#' @export

#' @author Arno Pauwels & Liliana Vazquez Fernandez

prepare_exposure <-
  function(
    poll_grid,
    geo_units,
    population = NULL,
    pop_grid = NULL,
    geo_id_micro = NULL,
    geo_id_macro = NULL,
    bin_width = 0.1
  ) {
    ## check required packages
    if (!requireNamespace("terra", quietly = TRUE)) {
      stop("The 'terra' package is required for this function. Please install it if you want to use this function.", call. = FALSE)}
    if (!requireNamespace("sf", quietly = TRUE)) {
      stop("The 'sf' package is required for this function. Please install it if you want to use this function.", call. = FALSE)}
    if (!requireNamespace("exactextractr", quietly = TRUE)) {
      stop("The 'exactextractr' package is required for this function. Please install it if you want to use this function.", call. = FALSE)}

    ## calculate exposure as a simple average concentration
    if (base::is.null(population) & base::is.null(pop_grid)) {

      ## check for matching CRS
      if (sf::st_crs(geo_units) != sf::st_crs(poll_grid)) {
        geo_units <- sf::st_transform(geo_units, sf::st_crs(poll_grid))
        warning("'geo_units' was reprojected to match the CRS of 'poll_grid'.")}

      ## crop & mask pollution grid
      poll_grid <- terra::mask(terra::crop(poll_grid, terra::vect(geo_units)), terra::vect(geo_units))

      ## rename pollution grid
      base::names(poll_grid) <- "poll"

      ## calculate mean concentration value and other stats by geographical unit
      exp_mean <- base::data.frame(
        geo_id_micro = geo_id_micro,
        mean = exactextractr::exact_extract(
          poll_grid,
          geo_units,
          fun = "mean",
          progress = FALSE
        ),
        median = exactextractr::exact_extract(
          poll_grid,
          geo_units,
          fun = "median",
          progress = FALSE
        ),
        min = exactextractr::exact_extract(
          poll_grid,
          geo_units,
          fun = "min",
          progress = FALSE
        ),
        lower = exactextractr::exact_extract(
          poll_grid,
          geo_units,
          fun = "quantile",
          quantiles = 0.025,
          progress = FALSE
        ),
        upper = exactextractr::exact_extract(
          poll_grid,
          geo_units,
          fun = "quantile",
          quantiles = 0.975,
          progress = FALSE
        ),
        max = exactextractr::exact_extract(
          poll_grid,
          geo_units,
          fun = "max",
          progress = FALSE
        ),
        stdev = exactextractr::exact_extract(
          poll_grid,
          geo_units,
          fun = "stdev",
          progress = FALSE
        )
      )

      ## build output lists
      exposure_main <- base::list(
        geo_id_micro = exp_mean$geo_id_micro,
        exposure_mean = exp_mean$mean
      )

      exposure_detailed <- base::list(
        geo_id_micro = exp_mean$geo_id_micro,
        exposure_mean = exp_mean$mean,
        exposure_median = exp_mean$median,
        exposure_min = exp_mean$min,
        exposure_lower = exp_mean$lower,
        exposure_upper = exp_mean$upper,
        exposure_max = exp_mean$max,
        exposure_stdev = exp_mean$stdev
      )

      out <- base::list(
        exposure_main = exposure_main,
        exposure_detailed = exposure_detailed
      )

      return(out)
    }

    ## calculate exposure as a population-weighted average concentration based on gridded population
    if (!base::is.null(pop_grid)) {

      ## check for matching CRS
      if (terra::ext(pop_grid) != terra::ext(poll_grid)) {
        poll_grid <- terra::project(poll_grid, pop_grid, method = "near")
        warning("'poll_grid' was reprojected to match the extent and resolution of 'pop_grid'.")}
      if (sf::st_crs(geo_units) != sf::st_crs(poll_grid)) {
        geo_units <- sf::st_transform(geo_units, st_crs(poll_grid))
        warning("'geo_units' was reprojected to match the CRS of 'poll_grid' and 'pop_grid'.")}

      ## crop & mask pollution & population grid
      poll_grid <- terra::mask(terra::crop(poll_grid, terra::vect(geo_units)), terra::vect(geo_units))
      pop_grid <- terra::mask(terra::crop(pop_grid, terra::vect(geo_units)), terra::vect(geo_units))

      ## extract min and max value
      poll_min <- base::min(terra::values(poll_grid), na.rm = TRUE)
      poll_max <- base::max(terra::values(poll_grid), na.rm = TRUE)

      ## define bins
      decimals = base::round(-base::log10(bin_width))
      bin_min <- base::round(poll_min, decimals)
      bin_max <- base::round(poll_max, decimals)
      bins <- base::data.frame(
        bin = base::cut(
          x = base::seq(bin_min, bin_max-bin_width, by = bin_width),
          breaks = base::seq(bin_min, bin_max, by = bin_width),
          right = FALSE
        ),
        mid = base::seq(bin_min, bin_max-bin_width, by = bin_width) + (bin_width/2)
      )

      ## bind pollution and population grids
      grid <- base::c(poll_grid, pop_grid)
      base::names(grid) <- base::c("poll", "pop")

      ## extract grid values by geographical unit
      geo_units$geo_id_micro <- geo_id_micro
      exp_vals <- exactextractr::exact_extract(
        grid,
        geo_units,
        include_cols = "geo_id_micro",
        progress = FALSE
      )

      ## get population by exposure bin
      exp_bins <- purrr::map_dfr(exp_vals, function(df) {
        df |>
          # 1. Calculate weighted population
          dplyr::mutate(pop = coverage_fraction * pop) |>
          # 2. Create bins for pollutant levels
          dplyr::mutate(bin = base::cut(
            poll,
            base::seq(bin_min, bin_max, by = bin_width),
            right = FALSE
          )) |>
          # 3. Aggregate population by bin
          dplyr::group_by(bin) |>
          dplyr::summarise(
            pop = base::sum(pop, na.rm = TRUE),
            .groups = "drop"
          ) |>
          # 4. Join with master 'bins' table to ensure all bins are represented
          dplyr::left_join(bins, by = "bin") |>
          # 5. Add back the geo_id and fill empty bins with 0
          dplyr::mutate(
            geo_id_micro = base::unique(df$geo_id_micro),
            pop = dplyr::coalesce(pop, 0)
          )
      })


      ## get population-weighted average
      exp_mean <- purrr::map_dfr(exp_vals, function(df) {
        df |>
          # 1. Update population by coverage fraction
          dplyr::mutate(pop = coverage_fraction * pop) |>
          # 2. Calculate weighted mean and total population
          dplyr::summarise(
            geo_id_micro = base::unique(geo_id_micro),
            mean = stats::weighted.mean(poll, pop, na.rm = TRUE),
            pop = base::round(base::sum(pop, na.rm = TRUE)),
            .groups = "drop"
          )
      })

      ## build output lists
      exposure_main <- base::list(
        geo_id_micro = exp_mean$geo_id_micro,
        exposure_mean = exp_mean$mean,
        population_total = exp_mean$pop
      )

      exposure_detailed <- base::list(
        geo_id_micro = exp_bins$geo_id_micro,
        exposure_bin = exp_bins$bin,
        exposure_mid = exp_bins$mid,
        population = exp_bins$pop
      )

      out <- base::list(
        exposure_main = exposure_main,
        exposure_detailed = exposure_detailed
      )

      return(out)
    }

    ## calculate exposure as a population-weighted average concentration based on population in sub-units
    if (!base::is.null(population)) {

      ## check for matching CRS
      if (sf::st_crs(geo_units) != sf::st_crs(poll_grid)) {
        geo_units <- sf::st_transform(geo_units, st_crs(poll_grid))
        warning("'geo_units' was reprojected to match the CRS of 'poll_grid' and 'population'.")}

      ## crop & mask pollution grid
      poll_grid <- terra::mask(terra::crop(poll_grid, terra::vect(geo_units)), terra::vect(geo_units))

      ## extract min and max value
      poll_min <- base::min(terra::values(poll_grid), na.rm = TRUE)
      poll_max <- base::max(terra::values(poll_grid), na.rm = TRUE)

      ## define bins
      decimals = base::round(-base::log10(bin_width))
      bin_min <- base::round(poll_min, decimals)
      bin_max <- base::round(poll_max, decimals)
      bins <- base::data.frame(
        bin = base::cut(
          x = base::seq(bin_min, bin_max-bin_width, by = bin_width),
          breaks = base::seq(bin_min, bin_max, by = bin_width),
          right = FALSE
        ),
        mid = base::seq(bin_min, bin_max-bin_width, by = bin_width) + (bin_width/2)
      )

      ## extract pollution mean by geographical sub-unit
      exp_vals <- base::data.frame(
        geo_id_macro = geo_id_macro,
        pop = population,
        poll = exactextractr::exact_extract(
          poll_grid,
          geo_units,
          fun = "mean",
          progress = FALSE
        )
      )

      ## get population by exposure bin
      exp_bins <- exp_vals |>
        # 1. Create bins for the whole dataset at once (Fastest)
        dplyr::mutate(
          bin = base::cut(
            poll,
            base::seq(bin_min, bin_max, by = bin_width),
            right = FALSE
          )
        ) |>
        # 2. Aggregate by ID and Bin
        dplyr::group_by(geo_id_macro, bin) |>
        dplyr::summarise(
          pop = base::sum(pop, na.rm = TRUE),
          .groups = "drop"
        ) |>
        # 3. Join with a grid of ALL IDs and ALL Bins (from your master 'bins' table)
        # This ensures 'mid' and any other bin metadata are included
        dplyr::right_join(
          tidyr::expand_grid(
            geo_id_macro = base::unique(exp_vals$geo_id_macro),
            bin = bins$bin
          ) |>
            dplyr::left_join(bins, by = "bin"), # This brings 'mid' back in
          by = base::c("geo_id_macro", "bin")
        ) |>
        # 4. Cleanup NAs
        dplyr::mutate(pop = dplyr::coalesce(pop, 0))

      ## get population-weighted average
      exp_mean <- exp_vals |>
        dplyr::group_by(geo_id_macro) |>
        dplyr::summarise(
          mean = stats::weighted.mean(poll, pop),
          pop = base::sum(pop)
        )

      ## build output lists
      exposure_main <- base::list(
        geo_id_macro = exp_mean$geo_id_macro,
        exposure_mean = exp_mean$mean,
        population_total = exp_mean$pop
      )

      exposure_detailed <- base::list(
        geo_id_macro = exp_bins$geo_id_macro,
        exposure_bin = exp_bins$bin,
        exposure_mid = exp_bins$mid,
        population = exp_bins$pop
      )

      out <- base::list(
        exposure_main = exposure_main,
        exposure_detailed = exposure_detailed
      )

      return(out)
    }
}