R/idmc_transform_daily.R

Defines functions idmc_transform_daily

Documented in idmc_transform_daily

#' Transform displacement event data to daily data
#'
#' `idmc_transform_daily()` transforms event data from the IDMC API (accessed
#' through [idmc_get_data()]). For each event, identified by an `event_id`,
#' potentially duplicated data is filtered out. If there are `Recommended figure`
#' rows based on the `role` column, then only those are kept. If there are no
#' recommended figures, then only the latest update to the `event_id` data is
#' kept for each `locations_coordinates`, using `created_at` to find latest updates.
#'
#' The data for each event is spread out between
#' the minimum start and maximum end date for each `event?id`,
#' with the total displacement uniformly distributed across all days.
#' For each country and displacement type (conflict, disaster,
#' or other), all displacement on a day is summed up to create a total
#' daily displacement figure.
#'
#' By default, data is backfilled for all countries and displacement types to
#' the first reported date in the IDMC dataset. Data is always infilled with 0
#' between start and end dates.
#'
#' @param df Event displacement data frame, generated from [idmc_get_data()].
#' @param min_date Date to backfill displacement data to. By default, `min_date`
#'     is set the first day of 2018. Only a few observations of the IDMC data
#'     are from before 2018, spanning back to 2011.
#'     If `NULL`, no backfilling is done, and the first reported
#'     case in the IDMC database is taken as the earliest.
#' @param max_date Date to extrapolate all data to, filling with `0`. If the
#'     The latest date in the data frame is used if later than `max_date`.
#'     If `NULL`, no extrapolation is done.
#' @param filter_min_date If `TRUE`, the default, filters the data to only
#'     contain data from `min_date` onward. Ensures that the few countries with
#'     observations from 2011 but nothing until 2018 do not skew results.
#'
#'
#' @returns Data frame of daily displacement with the following columns:
#' \describe{
#'   \item{iso3}{Country ISO3 code.}
#'   \item{country}{Country or area name.}
#'   \item{displacement_type}{Type of displacement.}
#'   \item{date}{Date.}
#'   \item{displacement_daily}{Daily level of displacement.}
#' }
#'
#' @examplesIf !is.na(Sys.getenv("IDMC_API", unset = NA))
#' idmc_get_data() %>%
#'   idmc_transform_daily()
#'
#' @export
idmc_transform_daily <- function(
    df,
    min_date = as.Date("2018-01-01"),
    max_date = Sys.Date(),
    filter_min_date = TRUE
  ) {
  # date columns to transform
  s_col_ <- "displacement_start_date"
  e_col_ <- "displacement_end_date"

  # columns to group by
  group_cols <- c("iso3", "country", "displacement_type", "date")

  # raise error if required columns not in input data frame
  req_cols <- c(
    s_col_,
    e_col_,
    "displacement_date",
    "figure",
    "locations_coordinates",
    group_cols[-4]
  )

  assert_df_cols(
    df = df,
    cols = req_cols,
    derived_from = "idmc_get_data()"
  )

  # filter to rows with Recommended figure if they exist
  df_recommended_figures <- df %>% dplyr::group_by(.data$event_id) |>
    dplyr::filter(
      .data$role == "Recommended figure")

  # if no recommended figure, take latest triangulation figure for each event_id and location
  df_triangulation_figures_same_location <- df %>%
    dplyr::filter((.data$role == "Triangulation")& !(.data$event_id %in% df_recommended_figures$event_id )) %>%
    dplyr::mutate(created_at = as.POSIXct(.data$created_at)) %>%
    dplyr::group_by(.data$event_id, .data$locations_coordinates) %>%
    dplyr::slice_max(order_by = .data$created_at, n = 1, with_ties = FALSE)

  # if no recommended figure, and multiple locations for an event_id,
  # sum figures and take:
  # minimum `displacement_start_date` for each `event_id`
  # maximum `displacement_end_date` for each `event_id`
  # latest created_at to have a single entry per event_id
  df_triangulation_figures_different_location <- df_triangulation_figures_same_location %>%
    dplyr::group_by(.data$event_id, .data$iso3)  %>%
    dplyr::mutate(figure = sum(.data$figure, na.rm = TRUE), displacement_start_date=min(.data$displacement_start_date), displacement_end_date=max(.data$displacement_end_date)) %>%  # sum within event_id
    dplyr::slice_max(.data$created_at, n = 1, with_ties = FALSE) %>% # keep latest row
    dplyr::ungroup()

  df_combined <- dplyr::bind_rows(
    df_recommended_figures,
    df_triangulation_figures_different_location)

  df_daily <- df_combined %>% dplyr::rowwise() %>%
    dplyr::mutate(
      date = if (!is.na(.data$displacement_start_date) && !is.na(.data$displacement_end_date)) {
        list(seq(.data$displacement_start_date, .data$displacement_end_date, by = "day"))
      } else {
        list(as.Date(character()))  # empty sequence instead of NA
      },
      displacement_daily = if (!is.na(.data$displacement_start_date) && !is.na(.data$displacement_end_date)) {
        .data$figure / length(date)
      } else {
        NA_real_
      }
    ) %>%
    dplyr::ungroup() %>%
    tidyr::unnest(
      cols = "date"
    ) %>%
    dplyr::group_by(
      dplyr::across(!!group_cols)
    ) %>%
    dplyr::summarize(
      displacement_daily = sum(.data$displacement_daily),
      .groups = "drop"
    )

  n_missing_dates <- df_combined %>%
    dplyr::filter(is.na(.data$displacement_start_date) | is.na(.data$displacement_end_date)) %>%
    nrow()
  if (n_missing_dates > 0) {
    warning(glue::glue("{n_missing_dates} rows dropped because of missing start or end dates."))
  }
  # replace with NA so that no backfilling or extrapolation occurs

  if (is.null(max_date)) {
    max_date <- NA
  }

  if (is.null(min_date)) {
    min_date <- NA
  }

  # completing as necessary
  df_complete <- df_daily %>%
    dplyr::group_by(
      dplyr::across(
        !!group_cols[-4] # don't include date
      )
    ) %>%
    tidyr::complete(
      date = seq(min(.data$date, min_date, na.rm = TRUE), max(.data$date, max_date, na.rm = TRUE), by = "day"),
      fill = list(
        displacement_daily = 0
      )
    )

  # filter to min date if necessary
  if (filter_min_date) {
    df_complete <- df_complete %>%
      dplyr::filter(
        .data$date >= !!min_date
      )
  }

  dplyr::ungroup(df_complete)
}

Try the idmc package in your browser

Any scripts or data that you put into this service are public.

idmc documentation built on Feb. 6, 2026, 1:07 a.m.