R/dashboard.R

Defines functions check_threshold est_part est_recruit est_churn scaleup_part scaleup_recruit format_result

Documented in check_threshold est_churn est_part est_recruit format_result scaleup_part scaleup_recruit

# functions for building dashboard summary data

#' Internal Function: Perform an action (e.g., warning, error) to flag records
#' 
#' The action is only triggered if any values in df[[test_variable]] exceed
#' test_threshold. Intended for use in calculating dashboard metrics.
#'
#' @param df data frame: table containing statistic to check
#' @param test_threshold numeric: if exceeded, while produce warning
#' @param test_variable character: Name of variable in df that contains
#' the test statistic
#' @param action function: function call to perform if threshold is exceeded
#' (intended to be \code{\link[base]{warning}} or \code{\link[base]{stop}}).
#' @param msg character: message to be printed in action
#' @family dashboard functions
#' @keywords internal
#' @export
#' @examples
#' library(dplyr)
#' 
#' # produce warnings
#' x <- data.frame(tot = "All", year = 2008:2018, part = rnorm(11, 1000, sd = 100))
#' x <- mutate(x, pct_change = (part - lag(part)) / lag(part) * 100)
#' check_threshold(x, 5)
#' 
#' # this will produce an error
#' # check_threshold(x, 5, action = function(...) stop(..., call. = FALSE))
check_threshold <- function(
    df, test_threshold, test_variable = "pct_change", 
    action = function(...) warning(..., call. = FALSE),
    msg = paste("Threshold of", test_threshold, "for", test_variable, "exceeded:")
) {
    flagged <- filter(df, abs(.data[[test_variable]]) > test_threshold) %>%
        data.frame()
    if (nrow(flagged) > 0) {
        action(msg, "\n", paste(capture.output(print(flagged)), collapse = "\n"), "\n")
    } 
}

#' Estimate participants by year from license history
#' 
#' This function requires a correctly formated history table (see \code{\link{history}}).
#' It produces a simple count of records per year, optionally by segment, 
#' & runs a validation test: pct change per year.
#' 
#' @param history data frame: input license history table
#' @param segment character: defaults to "tot", which indicates no segmentation.
#' Alternatively specifiy other license history variables ("res", "sex", etc.)
#' @param test_threshold numeric: threshold in whole number percentage points 
#' for pct change per year. A warning will be printed if the absolute value
#' of the change for any year exceeds the threshold.
#' @param show_test_stat logical: If TRUE, the output table will include
#' a variable holding the test statistic for each row.
#' @param suppress_warning logical: If TRUE, no test warning will be displayed 
#' (even if threshold is exceeded). Test statistics can still be included by 
#' setting show_test_stat = TRUE.
#' @param outvar character: name of variable that stores metric
#' @return Returns a data frame with 3 variables (segment, "year", outvar), and
#' optionally with 2 extra variables ("change", "pct_change") if show_test_stat = TRUE
#' @family dashboard functions
#' @seealso Salic Function Reference: \code{\link{salic}}
#' @export
#' @examples
#' library(dplyr)
#' data(history)
#' history <- history %>%
#'     label_categories() %>%
#'     recode_agecat() %>%
#'     filter(!agecat %in% c("0-17", "65+"))
#'     
#' # participants
#' est_part(history)
#' 
#' # by segment
#' # produces a warning
#' est_part(history, "agecat") 
#' 
#' # suppress warning
#' x <- est_part(history, "agecat", test_threshold = 25)
#' 
#' # new recruits
#' history_new <- filter(history, !is.na(R3), R3 == "Recruit")
#' est_recruit(history_new)
#' 
#' # apply over multiple segments 
#' segs <- c("tot", "res", "sex", "agecat")
#' part <- sapply(segs, function(x) est_part(history, x), simplify = FALSE)
#' 
#' # specify test thesholds by segment to suppress warnings
#' tests <- c(tot = 20, res = 45, sex = 30, agecat = 40)
#' part <- sapply(segs, function(x) est_part(history, x, tests[x]), simplify = FALSE)
est_part <- function(
    history, segment = "tot", test_threshold = 20, show_test_stat = FALSE,
    suppress_warning = FALSE, outvar = "participants"
) {
    if (segment == "tot") {
        history <- mutate(history, tot = "All") # for group_by()
    } else {
        # need to drop records where segment value is missing
        history <- filter(history, !is.na(!! as.name(segment)))
    }
    out <- history %>%
        group_by_at(c(segment, "year")) %>%
        summarise(!! outvar := n()) %>%
        mutate(
            change = .data[[outvar]] - lag(.data[[outvar]]),
            pct_change = .data$change / lag(.data[[outvar]]) * 100
        ) %>%
        ungroup()
    if (!suppress_warning) check_threshold(out, test_threshold)
    if (!show_test_stat) out <- select(out, -.data$change, -.data$pct_change) 
    out
}

# convenience function for recruit participation
#' @rdname est_part
#' @export
est_recruit <- function(
    history, segment = "tot", test_threshold = 35, show_test_stat = FALSE,
    suppress_warning = FALSE, outvar = "recruits"
) {
    est_part(history, segment, test_threshold, show_test_stat, 
             suppress_warning, outvar)
}

#' Estimate churn by year from license history
#' 
#' This function requires a correctly formated history table (see \code{\link{history}}).
#' It runs a mean of the lapse value (per year), optionally by segment (and also shifts 
#' year forward by 1 so that churn in current year reflects lapse pct from last year). 
#' It also runs a validation test: pct change per year.
#' @inheritParams est_part
#' @family dashboard functions
#' @seealso Salic Function Reference: \code{\link{salic}}
#' @export
#' @examples
#' library(dplyr)
#' data(history)
#' history <- history %>%
#'     label_categories() %>%
#'     recode_agecat() %>%
#'     filter(!agecat %in% c("0-17", "65+"))
#' est_churn(history)
#' 
#' # apply across all segments
#' segs <- c("tot", "res", "sex", "agecat")
#' sapply(segs, function(x) est_churn(history, x), simplify = FALSE)
est_churn <- function(
    history, segment = "tot", test_threshold = 30, show_test_stat = FALSE,
    suppress_warning = FALSE, outvar = "churn"
) {
    if (segment == "tot") {
        history <- mutate(history, tot = "All")
    } else {
        history <- filter(history, !is.na(!! as.name(segment)))
    }
    # churn is simply lapse % per year
    out <- history %>%
        group_by_at(c(segment, "year")) %>%
        summarise(!! outvar := mean(lapse)) %>%
        mutate(
            change = .data[[outvar]] - lag(.data[[outvar]]),
            pct_change = .data$change / lag(.data[[outvar]]) * 100
        ) %>%
        ungroup()
    
    # shifting one year forward so current year always has a value
    # hence churn = % of last years buyers who didn't renew this year
    lastyr <- max(out$year)
    out <- mutate(out, year = year + 1) %>%
        filter(year != lastyr + 1)
    if (!suppress_warning) check_threshold(out, test_threshold)
    if (!show_test_stat) out <- select(out, -.data$change, -.data$pct_change) 
    ungroup(out)
}

#' Scale segmented participation counts to total (if needed)
#' 
#' This scaling accounts for missing values in segments, scaling up all counts
#' to ensure the sum matches the total count. It expects 2 tables as input, both
#' produced by \code{\link{est_part}}. If no scaling is needed (i.e., sum(part_segment$part)
#' == sum(part_total$part)) the function will simply return the input df.
#' 
#' @param part_segment data frame: A segmented  participation table
#' produced by \code{\link{est_part}} (e.g., with segment argument set to "res")
#' @param part_total data frame: An overall participation table produced by
#' \code{\link{est_part}}
#' @param test_threshold numeric: threshold in whole number percentage points 
#' which defines the upper limit of acceptable proportion of missing values for 
#' the segment. The function will stop with an error if this threshold
#' is exceeded. Relaxing the threshold can allow the check to pass, but use this
#' with caution since a high percentage of missing values might suggests that 
#' the breakouts aren't representative (e.g., if not missing at random).
#' @inheritParams est_part
#' @family dashboard functions
#' @seealso Salic Function Reference: \code{\link{salic}}
#' @export
#' @examples
#' library(dplyr)
#' data(history)
#' history <- label_categories(history)
#' 
#' # demonstrate the need for scaling
#' part_total <- est_part(history)
#' part_segment <- est_part(history, "sex", test_threshold = 40)
#' left_join(
#'     select(part_total, year, part_tot = participants),
#'     group_by(part_segment, year) %>% summarise(part_seg = sum(participants)),
#' )
#' 
#' # perform scaling
#' part_segment <- scaleup_part(part_segment, part_total)
#' left_join(
#'     select(part_total, year, part_tot = participants),
#'     group_by(part_segment, year) %>% summarise(part_seg = sum(participants)),
#' )
#' 
#' # new recruits - unscaled
#' history_new <- filter(history, R3 == "Recruit")
#' part_total <- est_recruit(history_new, "tot")
#' part_segment <- est_recruit(history_new, "sex")
#' part_segment
#' 
#' # new recruits - scaled
#' scaleup_recruit(part_segment, part_total)
scaleup_part <- function(
    part_segment, part_total, test_threshold = 10, show_test_stat = FALSE,
    outvar = "participants"
) {
    if (!outvar %in% colnames(part_segment) | !outvar %in% colnames(part_total)) {
        stop("Missing '", outvar, "' from at least one of the input tables", call. = FALSE)
    }
    if (sum(part_segment[[outvar]]) == sum(part_total[[outvar]])) {
        return(part_segment) # scaling not needed
    }
    if (nrow(part_total) > nrow(part_segment)) {
        warning("Argument part_segment has fewer rows than part_total.\n",
                "Maybe you mixed up the arguments?", call. = FALSE)
    }
    # compute scale factor
    part_total2 <- part_total %>%
        semi_join(part_segment, by = "year") %>%
        group_by(year) %>%
        summarise(total = sum(.data[[outvar]]))
    part_segment2 <- part_segment %>%
        group_by(year) %>%
        summarise(segment = sum(.data[[outvar]]))
    compare <- part_segment2 %>%
        left_join(part_total2, by = "year") %>%
        mutate( 
            total_na = .data$total - .data$segment,
            pct_na = .data$total_na / .data$total * 100,  
            scale_factor = .data$total / .data$segment 
        )
    check_threshold(
        compare, test_threshold, "pct_na",
        action = function(...) stop(..., call. = FALSE),
        msg = paste0("Threshold of ", test_threshold, 
                     " for pct_na exceeded for ", names(part_segment)[1], ":")
    )
    # scale to match the total
    compare <- select(compare, .data$year, .data$pct_na, .data$scale_factor)
    out <- part_segment %>%
        left_join(compare, by = "year") %>%
        mutate(!! outvar := as.integer(round(.data[[outvar]] * .data$scale_factor, 0))) %>%
        select(-.data$scale_factor)
    
    # a final check of the scaled total
    # TODO: might not need this if function is tested thoroughly, will leave for now
    # - i don't recall what problem this caught...so not sure what unit test to write
    diff <- abs(sum(out[[outvar]]) - sum(part_total2$total))
    if (diff > 50) { # allows for a small amount of rounding error
        warning("Something might have gone wrong in scaling since the segment sum of ",
                sum(out[[outvar]]), " is different than the total of ", 
                sum(part_total2$total))
    }
    if (!show_test_stat) out <- select(out, -.data$pct_na) 
    out
}

# convenience function for recruit scaleup
#' @rdname scaleup_part
#' @export
scaleup_recruit <- function(
    part_segment, part_total, test_threshold = 10, show_test_stat = FALSE,
    outvar = "recruits"
) {
    scaleup_part(part_segment, part_total, test_threshold, 
                 show_test_stat, outvar)
}

#' Format estimated metrics for input to Dashboard
#' 
#' This function expects a data frame produced by an salic estimation function (
#' \code{\link{est_part}}, \code{\link{est_recruit}}, \code{\link{est_churn}}).
#' It returns a data frame with additional formatting that allows stacking all results
#' into a single table.
#' 
#' @param df data frame: Input table (3 variables) with estimated metrics
#' @param timeframe character: value to store in the 'timeframe' variable of 
#' the output (e.g, 'full-year', 'mid-year')
#' @param group character: value to sore in the 'group' variable of the 
#' output (e.g., 'all_sports', 'fish', 'hunt')
#' @param rename_input character: generic names for input variables as they
#' will appear in the output
#' @family dashboard functions
#' @seealso Salic Function Reference: \code{\link{salic}}
#' @export
#' @examples
#' library(dplyr)
#' data(metrics)
#' 
#' # format a table
#' metrics$participants$res
#' 
#' x <- format_result(metrics$participants$res, "full-year", "all_sports")
#' x
#' 
#' # combine formatted tables
#' y <- format_result(metrics$participants$tot, "full-year", "all_sports")
#' bind_rows(y, x)
#' 
#' # apply formatting across all segments
#' x <- lapply(metrics$participants, function(x) format_result(x, "full-year", "sports"))
#' bind_rows(x)
#'    
#' # apply across all metrics & segments
#' bind_rows(
#'     lapply(metrics$participants, function(x) format_result(x, "full-year", "sports")),
#'     lapply(metrics$recruits, function(x) format_result(x, "full-year", "sports")),
#'     lapply(metrics$churn, function(x) format_result(x, "full-year", "sports"))
#' )
format_result <- function(
    df, timeframe, group, rename_input = c("category", "year", "value")
) {
    # expecting exactly 3 columns in the input data frame
    out <- df
    names(out) <- rename_input
    
    # stored as input variable names >> placed in output variable values
    segment <- colnames(df)[1]
    metric <- colnames(df)[3]
    
    # adding variables to represent structure in a single table
    out$segment <- segment
    out$metric <- metric
    out$timeframe <- timeframe
    out$group <- group
    out$category <- as.character(out$category)
    
    # modify segment names
    out <- out %>% dplyr::mutate(
        segment = dplyr::case_when(
            segment == "tot" ~ "All",
            segment == "res" ~ "Residency",
            segment == "sex" ~ "Gender",
            segment == "agecat" ~ "Age",
            segment == "county" ~ "County",
            TRUE ~ segment
        )
    )
    out %>% dplyr::select(
        .data$timeframe, .data$group, .data$segment, .data$year,  
        .data$category, .data$metric, .data$value
    )
}
southwick-associates/salic documentation built on Nov. 5, 2019, 9:13 a.m.