R/gapfilling.R
In whep: Processing Agro-Environmental Data

Documented in linear_fill proxy_fill sum_fill

# Simple functions to fill gaps (NA values) in a time-dependent variable,
# creating complete time series.

#' Fill gaps by linear interpolation, or carrying forward or backward.
#'
#' @description
#' Fills gaps (`NA` values) in a time-dependent variable by
#' linear interpolation between two points, or carrying forward or backwards
#' the last or initial values, respectively. It also creates a new variable
#' indicating the source of the filled values.
#'
#' @param df A tibble data frame containing one observation per row.
#' @param var The variable of df containing gaps to be filled.
#' @param time_index The time index variable (usually year).
#' @param interpolate Logical. If `TRUE` (default),
#'   performs linear interpolation.
#' @param fill_forward Logical. If `TRUE` (default),
#'   carries last value forward.
#' @param fill_backward Logical. If `TRUE` (default),
#'   carries first value backward.
#' @param .by A character vector with the grouping variables (optional).
#'
#' @return A tibble data frame (ungrouped) where gaps in var have been filled,
#'   and a new "source" variable has been created indicating if the value is
#'   original or, in case it has been estimated, the gapfilling method that has
#'   been used.
#'
#' @export
#'
#' @examples
#' sample_tibble <- tibble::tibble(
#'   category = c("a", "a", "a", "a", "a", "a", "b", "b", "b", "b", "b", "b"),
#'   year = c(
#'     "2015", "2016", "2017", "2018", "2019", "2020",
#'     "2015", "2016", "2017", "2018", "2019", "2020"
#'   ),
#'   value = c(NA, 3, NA, NA, 0, NA, 1, NA, NA, NA, 5, NA),
#' )
#' linear_fill(sample_tibble, value, year, .by = c("category"))
#' linear_fill(
#'   sample_tibble,
#'   value,
#'   year,
#'   interpolate = FALSE,
#'   .by = c("category"),
#' )
linear_fill <- function(
  df,
  var,
  time_index,
  interpolate = TRUE,
  fill_forward = TRUE,
  fill_backward = TRUE,
  .by = NULL
) {
  df |>
    dplyr::mutate(
      # relative to first/last non-NA
      place = dplyr::case_when(
        !cummax(!is.na({{ var }})) ~ "left",
        rev(!cummax(rev(!is.na({{ var }})))) ~ "right",
        .default = "middle"
      ),
      fill_value = dplyr::case_when(
        place == "left" & fill_backward ~
          zoo::na.locf0({{ var }}, fromLast = TRUE),
        place == "right" & fill_forward ~
          zoo::na.locf0({{ var }}, fromLast = FALSE),
        place == "middle" & interpolate ~
          zoo::na.approx(
            {{ var }},
            x = {{ time_index }},
            na.rm = FALSE
          ),
        .default = NA_real_
      ),
      fill_value = dplyr::coalesce({{ var }}, fill_value),
      "source_{{var}}" := dplyr::case_when(
        !is.na({{ var }}) ~ "Original",
        place == "left" & !is.na(fill_value) ~ "First value carried backwards",
        place == "right" & !is.na(fill_value) ~ "Last value carried forward",
        place == "middle" & !is.na(fill_value) ~ "Linear interpolation",
        TRUE ~ "Gap not filled"
      ),
      "{{var}}" := fill_value,
      place = NULL,
      fill_value = NULL,
      .by = dplyr::all_of(.by)
    )
}

#' Fill gaps using a proxy variable
#'
#' @description
#' Fills gaps in a variable based on changes in a proxy variable, using ratios
#' between the filled variable and the proxy variable, and labels output
#' accordingly.
#'
#' @param df A tibble data frame containing one observation per row.
#' @param var The variable of df containing gaps to be filled.
#' @param proxy_var The variable to be used as proxy.
#' @param time_index The time index variable (usually year).
#' @param ... Optionally, additional arguments that will be passed to
#'   `linear_fill()` with the ratios. See that function to know the accepted
#'   arguments.
#'
#' @return A tibble dataframe (ungrouped) where gaps in var have been filled,
#'   a new proxy_ratio variable has been created,
#'   and a new "source" variable has been created indicating if the value is
#'   original or, in case it has been estimated, the gapfilling method that has
#'   been used.
#'
#' @export
#'
#' @examples
#' sample_tibble <- tibble::tibble(
#'   category = c("a", "a", "a", "a", "a", "a", "b", "b", "b", "b", "b", "b"),
#'   year = c(
#'     "2015", "2016", "2017", "2018", "2019", "2020",
#'     "2015", "2016", "2017", "2018", "2019", "2020"
#'   ),
#'   value = c(NA, 3, NA, NA, 0, NA, 1, NA, NA, NA, 5, NA),
#'   proxy_variable = c(1, 2, 2, 2, 2, 2, 1, 2, 3, 4, 5, 6)
#' )
#' proxy_fill(sample_tibble, value, proxy_variable, year, .by = c("category"))
proxy_fill <- function(df, var, proxy_var, time_index, ...) {
  df |>
    dplyr::mutate(proxy_ratio = {{ var }} / {{ proxy_var }}) |>
    linear_fill(proxy_ratio, {{ time_index }}, ...) |>
    dplyr::mutate(
      "source_{{var}}" := dplyr::case_when(
        !is.na({{ var }}) ~ "Original",
        source_proxy_ratio == "Linear interpolation" ~ "Proxy interpolated",
        source_proxy_ratio == "Last value carried forward" ~
          "Proxy carried forward",
        source_proxy_ratio == "First value carried backwards" ~
          "Proxy carried backwards",
        .default = NA_character_
      ),
      "{{var}}" := dplyr::coalesce({{ var }}, proxy_ratio * {{ proxy_var }})
    )
}

#' Fill gaps summing the previous value of a variable to the value of
#' another variable.
#'
#' @description
#' Fills gaps in a variable with the sum of its previous value and the value
#' of another variable. When a gap has multiple observations, the values are
#' accumulated along the series. When there is a gap at the start of the
#' series, it can either remain unfilled or assume an invisible 0 value before
#' the first observation and start filling with cumulative sum.
#'
#' @param df A tibble data frame containing one observation per row.
#' @param var The variable of df containing gaps to be filled.
#' @param change_var The variable whose values will be used to fill the gaps.
#' @param start_with_zero Logical. If TRUE, assumes an invisible 0 value before
#'   the first observation and fills with cumulative sum starting from the first
#'   change_var value. If FALSE (default), starting NA values remain unfilled.
#' @param .by A character vector with the grouping variables (optional).
#'
#' @return A tibble dataframe (ungrouped) where gaps in var have been filled,
#' and a new "source" variable has been created indicating if the value is
#' original or, in case it has been estimated, the gapfilling method that has
#' been used.
#'
#' @export
#'
#' @importFrom stats ave
#'
#' @examples
#' sample_tibble <- tibble::tibble(
#'   category = c("a", "a", "a", "a", "a", "a", "b", "b", "b", "b", "b", "b"),
#'   year = c(
#'     "2015", "2016", "2017", "2018", "2019", "2020",
#'     "2015", "2016", "2017", "2018", "2019", "2020"
#'   ),
#'   value = c(NA, 3, NA, NA, 0, NA, 1, NA, NA, NA, 5, NA),
#'   change_variable = c(1, 2, 3, 4, 1, 1, 0, 0, 0, 0, 0, 1)
#' )
#' sum_fill(
#'   sample_tibble,
#'   value,
#'   change_variable,
#'   start_with_zero = FALSE,
#'   .by = c("category")
#' )
#' sum_fill(
#'   sample_tibble,
#'   value,
#'   change_variable,
#'   start_with_zero = TRUE,
#'   .by = c("category")
#' )
sum_fill <- function(
  df,
  var,
  change_var,
  start_with_zero = TRUE,
  .by = NULL
) {
  df |>
    dplyr::mutate(
      groups = cumsum(!is.na({{ var }})),
      prefilled = dplyr::coalesce({{ var }}, {{ change_var }}),
      source_value = ifelse(is.na({{ var }}), "Filled with sum", "Original"),
      "{{ var }}" := ave(prefilled, groups, FUN = cumsum),
      "{{ var }}" := if (start_with_zero) {{ var }} else {
        ifelse(groups == 0, NA, {{ var }})
      },
      source_value = ifelse(is.na({{ var }}), NA_character_, source_value),
      groups = NULL,
      prefilled = NULL,
      .by = dplyr::all_of(.by)
    )
}