R/filters.R

Defines functions filter_memberships handle_date filter_dates

Documented in filter_dates filter_memberships handle_date

### Filter functions

# Filter dates ----------------------------------------------------------------

#' Filter a tibble of data based on the given from and to dates.
#'
#' \code{filter_dates} takes a tibble which contains data on a time bound
#' activity and returns the subset of rows where that activity took place
#' within a given period. The tibble must contain two columns of Date objects,
#' which record the start and end dates of an activity. The from and to dates
#' provided are used to find all rows where some part of the period
#' of activity took place within the period of filtering. The filtering
#' process is inclusive: as long as at least one day of activity falls within
#' the filtering period, the row is returned.
#'
#' @param df A tibble containing data on a time bound activity.
#' @param start_col The name of the column that contains the start date for
#'   the activity.
#' @param end_col The name of the column that contains the end date for the
#'   activity.
#' @param from_date A string or Date representing a date. If a string is used
#'   it should specify the date in ISO 8601 date format e.g. '2000-12-31'. The
#'   default value is NA, which means no records are excluded on the basis of
#'   the from_date.
#' @param to_date A string or Date representing a date. If a string is used
#'   it should specify the date in ISO 8601 date format e.g. 2000-12-31'. The
#'   default value is NA, which means no records are excluded on the basis
#'   of the to_date.
#' @return  A tibble with the same structure as the input df containing
#'   the rows that meet the filtering criteria.
#' @keywords internal

filter_dates <- function(
    df,
    start_col,
    end_col,
    from_date = NA,
    to_date = NA) {

    # Check the start and end columns exist
    if (! start_col %in% colnames(df)) {
        stop(missing_column_error(start_col))
    }

    if (! end_col %in% colnames(df)) {
        stop(missing_column_error(end_col))
    }

    # Check the dataframe has rows
    if (nrow(df) == 0) return(df)

    # Check there are dates to filter
    if (is.na(from_date) && is.na(to_date)) return(df)

    # Handle from and to dates
    from_date <- handle_date(from_date)
    to_date <- handle_date(to_date)

    # Check from date is before to date
    if ((!is.na(from_date)) && (!is.na(to_date)) && (from_date > to_date)) {
        stop("to_date is before from_date")
    }

    # Set default values
    from_after_end <- FALSE
    to_before_start <- FALSE

    # Get matching rows
    if (!is.na(from_date)) {
        from_after_end <- purrr::map_lgl(df[[end_col]], function(d) {
            ifelse(is.na(d), FALSE, from_date > d)
        })
    }

    if (!is.na(to_date)) {
        to_before_start <- purrr::map_lgl(df[[start_col]], function(d) {
            ifelse(is.na(d), FALSE, to_date < d)
        })
    }

    df[!(from_after_end | to_before_start), ]
}

#' Take a date which may be a string or a date and returns a date.
#'
#' \code{handle_date} takes a date which may be a Date or an ISO 8601 date
#' string, checks it is valid, and returns the date as a Date. NA values are
#' returned unmodified. This function raises an error if it is unable to
#' handle the date.
#'
#' @keywords internal

handle_date <- function(d) {
    if (is.na(d)) {
        return(d)
    } else if (class(d) == "Date") {
        return(d)
    } else if(class(d) == "character") {
        return(parse_date(d))
    } else {
        stop(date_format_error(d))
    }
}

# Filter memberships ----------------------------------------------------------

#' Filter a dataframe of memberships to include only the rows whose period
#' of membership intersects with those in another dataframe of memberships
#'
#' \code{filter_memberships} is a function to find all memberships in one
#' dataframe that intersect with those in another data frame for each person,
#' or other entity. This function lets you find things like all committee
#' memberships for Commons Members during the period they have served as an MP,
#' or all government roles held by Members of the House Lords while they have
#' served in the Lords.
#'
#' @param tm A tibble containing the target memberships. These are the
#'   memberships to be filtered.
#' @param fm A tibble containing the filter memberships. These are the
#'   memberships that are used to filter the target memberships.
#' @param tm_id_col The name of the column in the target memberships that
#'   contains the target membership id.
#' @param tm_start_col The name of the column in target memberships that
#'   contains the start date for the membership.
#' @param tm_end_col The name of the column in target memberships that contains
#'   the end date for the membership.
#' @param fm_start_col The name of the column in filter memberships that
#'   contains the start date for the membership.
#' @param fm_end_col The name of the column in filter memberships that contains
#'   the end date for the membership.
#' @param join_col The name of the column in both the target and filter
#'   memberships that contains the id of the entity that is common to both
#'   tables. Where the entity is a person this will be the person id.
#' @return  A tibble with the same structure as the input tm containing the
#'   rows that meet the filtering criteria.
#' @keywords internal

filter_memberships <- function(
    tm,
    fm,
    tm_id_col,
    tm_start_col,
    tm_end_col,
    fm_start_col,
    fm_end_col,
    join_col) {

    # Check the target dataframe has rows
    if (nrow(tm) == 0) return(tm)

    # Check the columns exist in each dataframe
    if (!tm_id_col %in% colnames(tm)) {
        stop(missing_column_error(tm_id_col))
    }

    if (! tm_start_col %in% colnames(tm)) {
        stop(missing_column_error(tm_start_col))
    }

    if (! tm_end_col %in% colnames(tm)) {
        stop(missing_column_error(tm_end_col))
    }

    if (! fm_start_col %in% colnames(fm)) {
        stop(missing_column_error(fm_start_col))
    }

    if (! fm_end_col %in% colnames(fm)) {
        stop(missing_column_error(fm_end_col))
    }

    if (! join_col %in% colnames(fm)) {
        stop(missing_column_error(join_col))
    }

    # Create abstract copies of tm and fm
    tma <- tm %>% dplyr::select(
        !! join_col,
        !! tm_id_col,
        !! tm_start_col,
        !! tm_end_col)
    colnames(tma) <- c(
        "join_col",
        "tm_id_col",
        "tm_start_col",
        "tm_end_col")

    fma <- fm %>% dplyr::select(
        !! join_col,
        !! fm_start_col,
        !! fm_end_col)
    colnames(fma) <- c(
        "join_col",
        "fm_start_col",
        "fm_end_col")

    # Join the target memberships with the filter membership dates on join_col
    tm_fm <- dplyr::left_join(
        tma,
        fma,
        by = "join_col")

    # Function to test if a target membership and filter membership intersect
    in_fm_func <- function(row) {

        # Handle dates
        tm_start_date <- row["tm_start_col"]
        tm_end_date <- row["tm_end_col"]
        fm_start_date <- row["fm_start_col"]
        fm_end_date <- row["fm_end_col"]
        tm_start_after_fm_end <- FALSE
        tm_end_before_fm_start <- FALSE

        # Get the match status of the rows
        if (!is.na(tm_start_date)) {
            tm_start_after_fm_end <- ifelse(
                is.na(fm_end_date), FALSE, tm_start_date > fm_end_date)
        }

        if (!is.na(tm_end_date)) {
            tm_end_before_fm_start <- ifelse(
                is.na(fm_start_date), FALSE, tm_end_date < fm_start_date)
        }

        # Return if the memberships intersect
        !(tm_start_after_fm_end || tm_end_before_fm_start)
    }

    # Apply the function to each combination of target and filter memberships
    tm_fm["in_membership"] <- apply(tm_fm, 1, in_fm_func)

    match_status <- tm_fm %>%
        dplyr::group_by(.data$tm_id_col) %>%
        dplyr::summarise(in_membership = any(.data$in_membership))

    # Restore the actual target membership id column name for joining
    colnames(match_status) <- c(tm_id_col, "in_membership")

    # Join the match status with the original target memberships data
    tm_fm_status <- dplyr::left_join(
        tm,
        match_status,
        by = tm_id_col)

    # Return the target memberships after filtering
    tm_fm_status %>%
        #dplyr::filter(.data$in_membership) %>%
        dplyr::filter(in_membership) %>%
        dplyr::select(-.data$in_membership) %>%
        dplyr::ungroup()
}
houseofcommonslibrary/clmnis documentation built on Aug. 17, 2024, 9:31 p.m.