R/population.R

Defines functions multiple_sevar discover_outlying_rates plot_rates pop_rate plot_history compare_sevar plot_sevar extract_se yearly_summary

Documented in compare_sevar discover_outlying_rates extract_se multiple_sevar plot_history plot_rates plot_sevar pop_rate yearly_summary

#' Make a yearly summary table of a land use variable
#'
#' @param df A \code{data_frame} of zones, such as is stored in the sqlite
#'   database.
#' @param group The variable to use for grouping the zones (for instance,
#'   "County").
#'
#' @param var The variable to sum (for instance, "POPULATION")
#'
#' @return A \code{data_frame} with the grouping variable in each row, and the
#' value of the variable at each time stage going across the columns.
#'
#' @export
yearly_summary <- function(df, group, var){
  df %>%
    dplyr::group_by_(group, "year") %>%
    dplyr::mutate_("var" = var) %>%
    dplyr::summarize(var = sum(var)) %>%
    dplyr::collect(n=Inf) %>%
    tidyr::spread(year, var, fill = NA)
}

#' Extract SE data from DB
#'
#'
#' @param db The scenario sqlite database.
#' @param color_var Field to color by: either "MPO" or "COUNTY".
#' @param color_levels A character vector of the color variable specifiying
#'   which levels to include.
#' @param controls Plot against the control totals. Defaults to TRUE, cannot
#'   currently run with FALSE.
#' @param index Whether to show the variables as indexed against the base year.
#'
#' @export
#' @return a data frame
extract_se <- function(db, color_var = NULL,
                       color_levels = NULL, controls = TRUE, index = FALSE){

  if(is.null(color_var)){color_var = "COUNTY"}

  if(controls){ # if controls are asked for, then make sure that they are available
    if(color_var != "COUNTY"){
      warning("Controls only available for county-level population forecasts.")
      controls <- FALSE
    }
  }

  # county plot with controls
  if(color_var == "COUNTY"){
    # Pull se data
    df <- dplyr::tbl(db, "AZONE") %>%
      dplyr::select(AZONE, POPULATION, EMPLOYMENT, TOTALHHS, TSTEP) %>%
      # join information for county and state
      dplyr::left_join(
        dplyr::tbl(db, "ALLZONES") %>%
          dplyr::select(AZONE = Azone, COUNTY = COUNTY, state = STATE)
      ) %>%

      # dplyr::summarize to the county level
      dplyr::group_by(COUNTY, state, TSTEP) %>%
      dplyr::summarize(
        population = sum(POPULATION, na.rm=TRUE),
        employment = sum(EMPLOYMENT, na.rm=TRUE),
        totalhh = sum(TOTALHHS, na.rm=TRUE)
      ) %>%
      dplyr::mutate(year = as.numeric(TSTEP) + 1990) %>%
      dplyr::ungroup() %>% dplyr::collect(n=Inf)

    # if no levels specified, then keep all
    if(!is.null(color_levels)){
      df <- dplyr::filter(df, COUNTY %in% color_levels)
    }

    df <- df %>%
      dplyr::select(color_var = COUNTY, year, population, employment) %>%
      tidyr::gather(var, y, population:employment) %>%
      dplyr::mutate(data = "SWIM")

    # add COUNTY controls
    if(controls){
      ct <- county_controls %>%
        dplyr::rename(color_var = county) %>%
        dplyr::filter(color_var %in% df$color_var) %>%
        dplyr::filter(year > 2005) %>%
        dplyr::select(color_var, year, var, y) %>%
        dplyr::mutate(data = "OEA Forecast")

      df <- rbind(df, ct)
    }


  } else {

    grouping <- dplyr::tbl(db, "BZONE") %>%
      dplyr::select_("BZONE", "color_var" = color_var)

    # get levels of facet_var if none given
    if(is.null(color_levels)){
      color_levels <- grouping %>% dplyr::group_by(color_var) %>% dplyr::collect(n=Inf) %>%
        dplyr::slice(1) %>% .$color_var

      color_levels <- color_levels[which(color_levels != "EXTSTA")]
    }

    # Pull se data
    df <- dplyr::tbl(db, "AZONE") %>%
      dplyr::select(AZONE, POPULATION, EMPLOYMENT, TOTALHHS, TSTEP) %>%
      # join information for county and state
      dplyr::left_join(
        dplyr::tbl(db, "ALLZONES") %>% rename(AZONE = Azone), by = "AZONE") %>%
      dplyr::left_join(grouping, by = "BZONE") %>%
      dplyr::filter(color_var %in% color_levels) %>%

      # dplyr::summarize to the MPO level
      dplyr::group_by(color_var, TSTEP) %>%
      dplyr::summarize(
        population = sum(POPULATION, na.rm=TRUE),
        employment = sum(EMPLOYMENT, na.rm=TRUE),
        totalhh = sum(TOTALHHS, na.rm=TRUE)
      ) %>%
      dplyr::mutate(year = as.numeric(TSTEP) + 1990) %>%
      dplyr::ungroup() %>% dplyr::collect(n=Inf) %>%

      dplyr::select(color_var, year, population, employment) %>%
      tidyr::gather(var, y, population:employment) %>%
      dplyr::mutate(data = "SWIM")
  }

  if(index){
    df <- df %>%
      dplyr::group_by_("color_var", "var", "data") %>%
      dplyr::mutate(y = calc_index(y))
  }

  return(df)

}

#' Make a plot of population and employment
#'
#' @inheritDotParams extract_se
#'
#' @return A \code{ggplot2} plot object showing the modeled change in employment
#'   and population over time.
#'
#' @export
plot_sevar <- function(...){

  df <- extract_se(...)
  dots <- list(...)

  # county plot with controls
  if(dots$color_var == "COUNTY"){

    p <- ggplot2::ggplot(df) +
      ggplot2::geom_line(ggplot2::aes_string(
        x = "year", y = "y", color = "color_var", lty = "data"))

    if(dots$controls){
      p <- p + ggplot2::scale_linetype_manual("Data", values = c("dashed", "solid"))
    }

  } else { # MPO plot without controls
    p <- ggplot2::ggplot(df) +
      geom_line(ggplot2::aes(x = year, y = y, color = color_var))
  }


  # ggplot2::theme, etc
  p +
    ggplot2::scale_color_discrete(dots$color_var) +
    ggplot2::facet_grid(. ~ var, scale = "free_y") +
    ggplot2::xlab("Year") + ggplot2::ylab(ifelse(dots$index, "Index Relative to Base", "Count")) +
    ggplot2::theme_bw() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 30))
}


#' Compare population and employment
#'
#' @param db1 The swim database for the "Reference" scenario.
#' @param db2 The swim database for the "Current" scenario.
#' @inheritDotParams extract_se color_var:index
#'
#' @return A \code{ggplot2} plot object showing the modeled change in employment
#'   and population over time.
#'
#' @export
compare_sevar <- function(db1, db2, ...){

  dots <- list(...)

  seref <- extract_se(db = db1, ..., controls = FALSE) %>% dplyr::rename(ref = y)
  secom <- extract_se(db = db2, ..., controls = FALSE) %>% dplyr::rename(com = y)

  df <- dplyr::left_join(seref, secom) %>%
    dplyr::mutate(diff = (com - ref) / ref * 100)

  ggplot2::ggplot(df,
         ggplot2::aes_string(x = "year", y = "diff", color = "var")) +
    ggplot2::geom_path() +
    ggplot2::facet_wrap(~color_var) +
    ggplot2::xlab("Year") + ggplot2::ylab("Percent difference (current - reference).") +
    ggplot2::scale_color_discrete(dots$color_var) +
    ggplot2::theme_bw() +
    ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 30))
}

#' Plot population with controls and historical data.
#'
#' @param db Swim database.
#' @param counties If not null (default), a character vector of counties to plot.
#'
#' @return a ggplot2 object
#' @export
plot_history <- function(db, counties = NULL) {

  df <- extract_se(db, color_var = "COUNTY", color_levels = counties, controls = TRUE) %>%
    dplyr::filter(var == "population") %>%
    dplyr::select(-var) %>%
    dplyr::bind_rows(., historical_pop %>% dplyr::mutate(data = "Census")) %>%
    dplyr::filter(color_var %in% counties)

  ggplot2::ggplot(df, ggplot2::aes(x = year, y = y, color = color_var, lty = data)) +
    ggplot2::geom_path() +
    ggplot2::xlab("Year") + ggplot2::ylab("Population") +
    ggplot2::scale_linetype_manual("Data", values = c("solid", "dotted", "longdash")) +
    ggplot2::scale_color_discrete("County") +
    ggplot2::theme_bw() +
    ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 30))
}

#' Determine annualized population growth rates.
#'
#' This function calculates the annualized growth rate between Census years and
#' between SWIM population synthesizer runs.
#'
#' @param db Swim database.
#' @param counties If not null (default), a character vector of counties to plot.
#'
#' @export
pop_rate <- function(db, counties = NULL) {


  # get the population in every year
  df <- extract_se(db, "COUNTY", counties, controls = TRUE) %>%
    dplyr::filter(var == "population") %>%
    dplyr::filter(data == "SWIM") %>%
    dplyr::select(-var) %>%
    rbind_list(., historical_pop %>% dplyr::mutate(data = "Census")) %>%
    dplyr::arrange(year)

  if(!is.null(counties)){
    # if a list of counties is given, dplyr::filter to that
    df <- df %>% dplyr::filter(county %in% counties)
  } else {
    # only show counties in oregon
    df <- df %>%
      dplyr::filter(
        county %in% unique(zones_data %>% dplyr::filter(STATE == "OR") %>% .$COUNTY)
      )
  }

  # function to interpolate between values
  interpolate <- function(x, year){
    zoo::na.approx(x, year)
  }

  #expand to include all years
  df <- dplyr::tbl_df(expand.grid(
    year = min(df$year):max(df$year),
    county = unique(df$county)
  )) %>%
    dplyr::mutate(
      data = ifelse(year < 2010, "Census", "SWIM")
    ) %>%
    dplyr::left_join(df) %>%

    # Interpolate population and calculate annual growth rate
    dplyr::group_by(county) %>%
    dplyr::mutate(
      pop = interpolate(y, year),
      rate = (lead(pop) - pop) / pop * 100
    )

  return(df)

}

#' Plot annualized growth rates
#'
#' This function returns a box plot comparing the annualized population growth
#' rates in a set of counties. This can help identify problems that the model
#' may be having relative to historical trends.
#'
#' @param db the Swim database.
#' @param counties A character vector of counties to display.
#'
#' @export
plot_rates <- function(db, counties) {

  rates <- pop_rate(db, counties)

  ggplot2::ggplot(rates, ggplot2::aes(x = county, y = rate, color = data)) +
    geom_boxplot() +
    geom_point(position = "jitter") +
    ggplot2::xlab("County") + ggplot2::ylab("Annualized Growth Rates") +
    ggplot2::theme_bw() + ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 30))

}

#' Discover outlying counties
#'
#' A major question we face is identifying whether the modeled population growth
#' rate in a region is "reasonable." To date reasonableness has been subjective,
#' but this function provides quantitative methodologies to calculate this.
#'
#' @param db The swim database
#' @param counties A character vector of counties to calculate.
#' @param method The method by which unreasonable counties are identified. See
#'   "Details".
#'
#' @return A data frame with information on each county's conformity with
#'   historical trends.
#'
#' @details If \code{method = "count"} (default), the number of years for which
#'   the annualized population growth rate lies outside of the historical
#'   confidence interval. If \code{method = "pval"}, a data frame of swim scenario
#'   years with with the $p$-value of the annualized growth rate on the
#'   distribution defined by historical data.
#'
#' @export
#' @seealso \code{\link{pop_rate}}
#'
discover_outlying_rates <- function(db, counties = NULL,
                                    method = c("count", "pval")){


  # get annualized population growth rate in each year, and stash in column
  rates <- pop_rate(db, counties) %>%
    dplyr::group_by(county) %>%
    dplyr::mutate(
      census = ifelse(data == "Census", rate, NA),
      swim   = ifelse(data == "SWIM", rate, NA)
    )

  if(method == "pval"){

    get_pval <- function(x, y){
      # get pvalue for each model year given historical rate distribution
      stats::pnorm(y, mean(x, na.rm = TRUE), stats::sd(x, na.rm = TRUE))
    }

    r <- rates %>%
      dplyr::mutate(
        pval = get_pval(census, swim)
      ) %>%
      dplyr::filter(!is.na(y)) %>%
      dplyr::filter(data == "SWIM") %>%
      dplyr::select(county, year, data, pop, pval)

  } else if(method == "count"){

    count_insideci <- function(x, y) {
      y <- y[!is.na(y)]

      # what percent of y falls inside confidence interval of f?
      low <- mean(x, na.rm = TRUE) - 1.96 * stats::sd(x, na.rm = TRUE)
      hi  <- mean(x, na.rm = TRUE) + 1.96 * stats::sd(x, na.rm = TRUE)

      inside <- ifelse(y >= low & y <= hi, TRUE, FALSE)

      return(sum(inside) / length(y))

    }

    r <- rates %>%
      dplyr::group_by(county) %>%
      dplyr::summarize(
        inside_ci = count_insideci(census, swim)
      )
  }

  return(r)

}


#' Compare Population or Employment across multiple scenarios.
#'
#' Scenarios (represented by elements of \code{dbset}) are represented as colors, while \code{facet_var} specifies the
#' variable to use for facets.
#'
#' @param dbset A list of connections to SWIM databases.
#' @param db_names A character vector naming the scenarios.
#' @param variable One of \code{c("population", "employment")} defining which
#'   socioeconomic variable to include in
#' @param facet_var Field to facet by: either "MPO" or "COUNTY".
#' @param facet_levels A character vector of the facet variable specifying
#'   which levels to include.

#' @inherit extract_se params
#'
#' @return a ggplot2 object.
#'
#' @export
multiple_sevar <- function(dbset, db_names,
                           variable = c("population", "employment"),
                           facet_var = NULL, facet_levels=NULL,
                           controls = TRUE, index = FALSE) {

  # get the population table for every scenario.
  names(dbset) <- db_names
  df <- bind_rows(
    lapply(seq_along(dbset), function(i)
      extract_se(dbset[[i]], color_var = facet_var, color_levels=facet_levels) %>%
        dplyr::mutate(scenario = names(dbset)[[i]]) %>%
        dplyr::filter(var == variable)
    )
  ) %>%
    dplyr::mutate_("facet_var" = names(.)[1])


  # add control data if desired
  if(controls){
    p <- ggplot2::ggplot(
      data = df %>%
        # only need control once
        dplyr::filter(data != "Control" | scenario == names(dbset)[1]) %>%
        dplyr::mutate(scenario = ifelse(data == "Control", "Control", scenario)),
      ggplot2::aes(x = year, y = y, color = scenario, lty = data)
    ) +
      ggplot2::scale_linetype_manual("source",
                            values = c("dotted", rep("solid", length(dbset))))
  } else {
    p <- ggplot2::ggplot(
      data = df %>%
        dplyr::filter(data != "Control"),
      ggplot2::aes(x = year, y = y, color = scenario)
    )

  }

  p <- p + ggplot2::geom_path() +
    ggplot2::facet_wrap(~ facet_var, scale = "free_y") +
    ggplot2::ylab(variable) + ggplot2::xlab("Year") +
    ggplot2::theme_bw() +
    ggplot2::theme(axis.text.x = ggplot2::element_text(angle = 30))


  return(p)

}
pbsag/swimr documentation built on Feb. 28, 2019, 10:01 p.m.