R/expected_diagnostic.R

Defines functions expected_diagnostic

Documented in expected_diagnostic

#' Diagnostic Plots for Model Fit
#' 
#' Check mean model fit via diagnostic figures of the model components
#' 
#' @param expected The output from `excess_counts` with `keep.components = TRUE`
#' @param start First day to show
#' @param end Last day to show
#' @param color Color for the expected curve
#' @param alpha alpha blending for points
#' 
#' @return A list with six ggplot objects: 
#' `population` is a time series plot of the population.
#' `seasonal` is a plot showing the estimated seasonal effect.
#' `trend` is a time series plot showing the estimated trend.
#' `weekday` is a plot of the estimated weekday effects if they were estimated.
#' `expected`is a time series plot of the expected values.
#' `residual` is a time series plot of the residuals.
#' 
#' @examples
#' 
#' library(dplyr)
#' library(lubridate)
#' library(ggplot2)
#' 
#' flu_season <- seq(make_date(2017, 12, 16), make_date(2018, 1, 16), by = "day")
#'
#' exclude_dates <- c(flu_season, seq(make_date(2020, 1, 1), today(), by = "day"))
#' 
#' res  <- cdc_state_counts %>%
#'  filter(state == "Massachusetts") %>%
#'  compute_expected(exclude = exclude_dates,
#'                   keep.components = TRUE)
#'                   
#' p <- expected_diagnostic(expected = res, alpha = 0.50)
#' 
#' p$population
#' p$seasonal
#' p$trend
#' p$expected
#' p$residual

#' @export
#' @import dplyr
#' @importFrom stats qnorm sd
#' @importFrom ggplot2 ggplot geom_ribbon ylab xlab geom_point coord_cartesian geom_line geom_errorbar

expected_diagnostic <- function(expected, 
                                start  = NULL,
                                end    = NULL,
                                color  = "#D22B2B",
                                alpha  = 0.50){
  
  # -- Check that expected is a `compute_expected` object
  if(!"compute_expected" %in% class(expected)) stop("The expected argument needs to be the output of the computed_expected function.")
  
  # -- Check if keep.components == TRUE
  if(!"keep.components" %in% names(attributes(expected))) stop("The components were not found. Run the computed_expected function with keep.components = TRUE")
  
  # -- Check if the data frequency is daily or weekly
  if(!attr(expected$counts, "frequency") %in% c(365, 52, 12)) stop("This function assumes monthly, weekly or daily data. This dataset has ", attr(expected$counts, "frequency"), " counts per year.")
  
  # -- We use ggplot for the figures
  requireNamespace("ggplot2")
  
  # -- If NULL, set start and end dates
  if(is.null(start)) start <- min(expected$counts$date)
  if(is.null(end))   end   <- max(expected$counts$date)
  
  # -- Extracting data from the `expected` argument
  dat <- with(expected$counts,
              tibble(date, 
                     outcome,
                     expected,
                     log_expected_se,
                     population = round(population), 
                     excluded))
  
  # -- Average number of deaths in non-excluded dates
  avg <- mean(filter(dat, excluded == FALSE)$outcome)
  
  # -- Seasonal data: ensuring dimensions match 
  if(attr(expected$counts, "frequency") %in% 365) {
    seasonal_dat <- dat %>%
      filter(excluded == FALSE) %>%
      group_by(yday(date)) %>%
      summarize(avg_outcome = mean(outcome)) %>%
      ungroup() %>%
      setNames(c("t", "avg_outcome")) %>%
      filter(t != 366)
    
  } else if(attr(expected$counts, "frequency") %in% 52) {
    seasonal_dat <- dat %>%
      filter(excluded == FALSE) %>%
      group_by(week(date)) %>%
      summarize(avg_outcome = mean(outcome)) %>%
      ungroup() %>%
      setNames(c("t", "avg_outcome")) %>%
      filter(t != 53)
    
  } else if(attr(expected$counts, "frequency") %in% 12) {
    seasonal_dat <- dat %>%
      filter(excluded == FALSE) %>%
      group_by(month(date)) %>%
      summarize(avg_outcome = mean(outcome)) %>%
      ungroup() %>%
      setNames(c("t", "avg_outcome"))
  }
  
  # -- Seasonal viz
  p_seasonal <- seasonal_dat %>%
    mutate(s = expected$seasonal$s) %>%
    ggplot(aes(t, avg_outcome)) +
    geom_point(alpha = alpha) +
    geom_line(aes(t, (s + 1) * avg), 
              color = color, 
              size  = 0.80) +
    scale_y_continuous(labels = scales::comma) +
    labs(x = "Days of the year",
         y = "Counts",
         title = "Seasonal Component")
  
  # -- Extracting the trend component
  trend <- tibble(date = expected$counts$date, trend = expected$trend) %>%
    left_join(select(dat, date, population), by = "date")
  
  # -- Getting yearly average death counts
  trend_obs <- dat %>%
    mutate(year = year(date)) %>%
    group_by(year) %>%
    summarize(outcome = mean(outcome)) %>%
    ungroup() %>%
    mutate(date = lubridate::make_date(year, 07, 01))
  
  # -- Long term trend viz
  p_trend <- ggplot() +
    geom_point(aes(date, outcome), 
               size  = 3,
               alpha = 0.80,
               data  = trend_obs) +
    geom_point(aes(date, outcome), 
               size  = 3,
               pch   = 1,
               data  = trend_obs) +
    geom_line(aes(date, trend * population / (1000 * attr(expected$counts, "frequency"))), 
              color = color, 
              size  = 0.80,
              data  = trend) +
    scale_y_continuous(labels = scales::comma) +
    labs(x = "Date",
         y = "Counts",
         title = "Long-term trend")
  
  # -- If weekday.effect = TRUE, then generate the figure
  if(attr(expected$counts, "weekday.effect")){
    
    # -- Wrangling weekday data
    weekday_dat <- dat %>%
      mutate(wday = wday(date)) %>%
      left_join(expected$weekday, by = c("wday" = "weekday")) %>%
      mutate(effect = avg * (1 + effect),
             wday   = wday(date, label = TRUE))
    
    # -- Wrangling average weekday data
    weekday_avg_dat <- dat %>%
      filter(excluded == FALSE) %>%
      mutate(wday = wday(date, label = TRUE)) %>%
      group_by(wday) %>%
      summarize(se      = sd(outcome) / sqrt(n()), 
                outcome = mean(outcome)) %>%
      ungroup()
    
    # -- Weekday viz
    p_weekday <- ggplot() +
      geom_errorbar(aes(wday, 
                        ymin = outcome - 2*se,
                        ymax = outcome + 2*se),
                    width = 0.01,
                    data  = weekday_avg_dat) +
      geom_point(aes(wday, outcome), 
                 shape = 18,
                 size  = 4,
                 color = "white",
                 data  = weekday_avg_dat) +
      geom_point(aes(wday, outcome), 
                 shape = 18,
                 size  = 4,
                 alpha = 0.80,
                 data  = weekday_avg_dat) +
      geom_point(aes(wday, outcome), 
                 pch  = 5,
                 size = 3,
                 data = weekday_avg_dat) +
      geom_point(aes(wday, effect),
                 color = color,
                 size  = 3,
                 data  = unique(select(weekday_dat, wday, effect))) +
      geom_point(aes(wday, effect),
                 pch  = 1,
                 size = 3,
                 data = unique(select(weekday_dat, wday, effect))) +
      labs(x = "Day of the week",
           y = "Counts",
           title = "Weekday Component")
  } else {
    p_weekday <- NULL
  }
  
  # -- Mean viz
  p_mean <- expected$counts %>%
    filter(date >= start & date <= end) %>%
    mutate(lwr = exp(log(expected) - 2 * log_expected_se),
           upr = exp(log(expected) + 2 * log_expected_se)) %>%
    ggplot(aes(date, outcome)) +
    geom_point(alpha = alpha) +
    geom_ribbon(aes(ymin = lwr,
                    ymax = upr),
                fill  = color, alpha = 0.5) +
    geom_line(aes(date, expected), 
              color = color,
              size  = 0.80) +
    scale_y_continuous(labels = scales::comma) +
    labs(x = "Date",
         y = "Counts",
         title = "Expected Mortality Counts")
  
  # -- Residual viz
  p_residual <- expected$counts %>%
    filter(date >= start & date <= end) %>%
    mutate(difference  = outcome - expected,
           expected_se = expected * log_expected_se,
           lwr         = -2 * expected_se,
           upr         = 2 * expected_se) %>%
    ggplot(aes(date, difference)) +
    geom_point(alpha = alpha) +
    geom_ribbon(aes(ymin = lwr,
                    ymax = upr),
                fill  = color, alpha = 0.5) +
    geom_line(aes(date, 0),
              color = color,
              size  = 0.80) +
    scale_y_continuous(labels = scales::comma) +
    labs(x = "Date",
         y = "Observed - expected",
         title = "Residual Plot")
  
  # -- Population plot
  p_population <- ggplot(aes(date, population), data = filter(dat, date >= start & date <= end)) +
    geom_line(size = 0.80) +
    scale_y_continuous(labels = scales::comma) +
    labs(y     = "Population",
         x     = "Date",
         title = "Estimated Population Size")
  
  # -- Add ons to poplation plot depending on the frequency
  if(attr(expected$counts, "frequency") %in% 365) {
    p_population <- p_population +
      geom_point(aes(date, population), 
                 size  = 2,
                 shape = 21,
                 color = "black",
                 fill  = color,
                 data  = filter(dat, month(date) == 7, day(date) == 1, date >= start, date <= end))
  } else if(attr(expected$counts, "frequency") %in% 52) {
    p_population <- p_population +
      geom_point(aes(date, population), 
                 size  = 2,
                 shape = 21,
                 color = "black",
                 fill  = color,
                 data  = filter(dat, week(date)==26, date >= start & date <= end))
  } else if(attr(expected$counts, "frequency") %in% 12) {
    p_population <- p_population +
      geom_point(aes(date, population), 
                 size  = 2,
                 shape = 21,
                 color = "black",
                 fill  = color,
                 data  = filter(dat, month(date)==7, date >= start & date <= end))
  }
  
  # -- To return
  return(list(population = p_population,
              seasonal   = p_seasonal,
              trend      = p_trend,
              weekday    = p_weekday,
              expected   = p_mean,
              residual   = p_residual))
}

Try the excessmort package in your browser

Any scripts or data that you put into this service are public.

excessmort documentation built on Oct. 11, 2021, 9:06 a.m.