whysospurious: Why So Spurious?

Documented in print.ggplot_suppressed tylervigen_plot2

#' Spurious Data Collated by Tyler Vigen
#'
#' A dataset containing information collated by Tyler Vigen from various
#' sources to showcase spurious relationships.
#'
#' @docType data
#'
#' @format A data frame with 15 rows and 31 variables:
#' \describe{
#'   \item{year}{year of given observation}
#'
#'   \item{science_spending}{US spending on science, space, and technology in billions of dollars
#'   (\href{https://www.census.gov/compendia/statab/cats/science_technology/expenditures_research_development.html}{U.S. Office of Management and Budget})}
#'
#'   \item{hanging_suicides}{suicides by hanging, strangulation, and suffocation
#'   (\href{https://wonder.cdc.gov/}{Centers for Disease Control & Prevention})}
#'
#'   \item{pool_fall_drownings}{number of people who drowned by falling into a pool
#'   (\href{https://wonder.cdc.gov/}{Centers for Disease Control & Prevention})}
#'
#'   \item{cage_films}{films Nicholas Cage appeared in
#'   (\href{https://www.imdb.com/name/nm0000115/}{Internet Movie Database})}
#'
#'   \item{cheese_percap}{per capita cheese consumption
#'   (\href{https://www.census.gov/compendia/statab/2012/tables/12s0217.xls}{Department of Agriculture})}
#'
#'   \item{bed_deaths}{number of people who died by becoming tangled in their bedsheets
#'   (\href{https://wonder.cdc.gov/}{Centers for Disease Control & Prevention})}
#'
#'   \item{maine_divorce_rate}{divorce rate in Maine
#'   (\href{https://www.census.gov/compendia/statab/cats/births_deaths_marriages_divorces/marriages_and_divorces.html}{National Vital Statistics Reports})}
#'
#'   \item{margarine_percap}{per capita consumption of margarine
#'   (\href{https://www.census.gov/compendia/statab/2012/tables/12s0217.xls}{U.S. Department of Agriculture})}
#'
#'   \item{miss_usa_age}{age of miss america
#'   (\href{https://en.wikipedia.org/wiki/Miss_America}{Wikipedia})}
#'
#'   \item{steam_murders}{number of murders by steam, hot vapours, or hot objects
#'   (\href{https://wonder.cdc.gov/}{Centers for Disease Control and Prevention})}
#'
#'   \item{arcade_revenue}{total revenue generated by arcades in billions of dollars
#'   (\href{https://www.census.gov/compendia/statab/2012/tables/12s1228.xls}{U.S. Census Bureau})}
#'
#'   \item{computer_science_doctorates}{computer science doctorates awarded in the US
#'   (\href{https://www.nsf.gov/statistics/infbrief/nsf11305/}{National Science Foundation})}
#'
#'   \item{noncom_space_launches}{worldwide non-commercial space launches
#'   (\href{https://www.census.gov/compendia/statab/2012/tables/12s0822.xls}{Federal Aviation Administration})}
#'
#'   \item{sociology_doctorates}{sociology doctorates awarded (US)
#'   (\href{https://www.nsf.gov/statistics/infbrief/nsf11305/}{National Science Foundation})}
#'
#'   \item{mozzarella_percap}{per capita consumtion of mozzarella cheese
#'   (\href{https://www.census.gov/compendia/statab/2012/tables/12s0217.xls}{U.S. Department of Agriculture})}
#'
#'   \item{civil_engineering_doctorates}{number of civil engineering doctorates awarded
#'   (\href{https://www.nsf.gov/statistics/infbrief/nsf11305/}{National Science Foundation})}
#'
#'   \item{fishing_drownings}{people who drowned after falling out of a fishing boat
#'   (\href{https://wonder.cdc.gov/}{Centers for Disease Control & Prevention})}
#'
#'   \item{kentucky_marriage_rate}{marriage rate in Kentucky
#'   (\href{https://www.census.gov/compendia/statab/cats/births_deaths_marriages_divorces/marriages_and_divorces.html}{National Vital Statistics Reports})}
#'
#'   \item{train_collision_deaths}{drivers killed in collisions with railway train
#'   (\href{https://wonder.cdc.gov/}{Centers for Disease Control & Prevention})}
#'
#'   \item{oil_imports_norway}{US crude oil imports from Norway
#'   (\href{https://www.census.gov/compendia/statab/2012/tables/12s0937.pdf}{Department of Energy})}
#'
#'   \item{chicken_percap}{per capita consumption of chicken
#'   (\href{https://www.census.gov/compendia/statab/2012/tables/12s0217.xls}{U.S. Department of Agriculture})}
#'
#'   \item{oil_imports_total}{total US crude oil imports
#'   (\href{https://www.census.gov/compendia/statab/2012/tables/12s0937.pdf}{Department of Energy})}
#'
#'   \item{pool_drownings}{number of people who drowned while in a swimming-pool
#'   (\href{https://wonder.cdc.gov/}{Centers for Disease Control & Prevention})}
#'
#'   \item{nuclear_power}{power generated by US nuclear power plants
#'   (\href{https://www.census.gov/compendia/statab/2012/tables/12s0937.pdf}{Department of Energy})}
#'
#'   \item{japanese_cars_sold}{Japanese passenger cars sold in the US
#'   (\href{https://www.census.gov/compendia/statab/2008/tables/08s1026.xls}{U.S. Bureau of Transportation Statistics})}
#'
#'   \item{motor_vehicle_suicides}{suicides by crashing of motor vehicle
#'   (\href{https://wonder.cdc.gov/}{Centers for Disease Control & Prevention})}
#'
#'   \item{spelling_bee_word_length}{letters in winning Word of Scripps National Spelling Bee
#'   (\href{https://www.spellingbee.com/champions-and-their-winning-words}{National Spelling Bee})}
#'
#'   \item{spider_deaths}{number of people killed by venomous spiders
#'   (\href{https://wonder.cdc.gov/}{Centers for Disease Control & Prevention})}
#'
#'   \item{math_doctorates}{math doctorates awarded
#'   (\href{https://www.nsf.gov/statistics/infbrief/nsf11305/}{National Science Foundation})}
#'
#'   \item{uranium}{uranium stored at US nuclear power plants
#'   (\href{https://www.census.gov/compendia/statab/2012/tables/12s0937.pdf}{Department of Engergy})}
#' }
#' @source \url{https://www.tylervigen.com/spurious-correlations}
"tylervigen"



#' Explicitly draw plot
#'
#' @description The ggplot2 code necessary to make plots in Tyler Vigen's style
#' require aggressive parameters for the splines which produce warning messages.
#' This custom print function suppresses these messages.
#'
#' @param x object of class ggplot_suppressed
#' @param ... other arguments not used by this method
#'
#' @return Invisibly returns the result of ggplot_build(), which is a list with components that contain the plot itself, the data, information about the scales, panels etc.
#' @export
print.ggplot_suppressed <- function(x, ...) {
  # current class vector
  xc <- class(x)
  # remove ggplot_suppressed
  attr(x, 'class') <- xc[-which(xc == "ggplot_suppressed")]
  suppressWarnings(print(x, ...))
}



#' Internal Color Palette
#'
#' @description Internal color palette for the package.
#'
#' @return list of named hex color codes
colors <- list("joker_green" = "#274000",
               "joker_purple" = "#54003E",
               "joker_red" = "#CC1005")



#' Tyler Vigen Plot 2
#'
#' @description Number of people who drowned by falling into a pool correlates with Films Nicolas Cage appeared in (r = 0.67).
#' @references
#'   \href{https://wonder.cdc.gov/}{Centers for Disease Control & Prevention}
#'
#'   \href{https://www.imdb.com/name/nm0000115/}{Internet Movie Database}
#' @keywords drowning, pool, cage
#' @export
tylervigen_plot2 <- function() {
  suppressWarnings(
    whysospurious::tylervigen %>%
      dplyr::select(year, cage_films, pool_fall_drownings) %>%
      dplyr::filter(complete.cases(.)) %>%
      dplyr::transmute(
        year = year,
        series_a = (cage_films - 0) / (6 - 0),
        series_b = (pool_fall_drownings - 80) / (140 - 80)
      ) %>%
      tidyr::pivot_longer(tidyr::starts_with("series")) %>%
      ggplot2::ggplot(ggplot2::aes(
        x = year, y = value, color = name
      )) +
      ggplot2::geom_point() +
      ggplot2::geom_smooth(
        method = "loess",
        formula = y ~ x,
        span = 0.4,
        se = FALSE
      ) +
      ggplot2::scale_x_continuous(
        ggplot2::element_blank(),
        breaks = seq(1999, 2009, 2),
        sec.axis = ggplot2::sec_axis(
          ~ .,
          name = ggplot2::element_blank(),
          breaks = seq(1999, 2009, 2)
        )
      ) +
      ggplot2::scale_y_continuous(
        "Swimming pool drownings",
        breaks = seq(0, 1, 1 / 3),
        labels = seq(80, 140, 20),
        limits = c(0, 1),
        sec.axis = ggplot2::sec_axis(
          ~ .,
          name = "Nicolas Cage Films",
          breaks = seq(0, 1, 1 / 3),
          labels = seq(0, 6, 2)
        )
      ) +
      ggplot2::scale_color_manual(
        ggplot2::element_blank(),
        labels = c("Nicholas Cage Films",
                   "Swimming pool drownings"),
        values = c("black",
                   colors$joker_red)
      ) +
      ggplot2::labs(
        title = "Number of people who drowned by falling into a pool",
        subtitle = expression(paste(
          "correlates with ",
          bold("Films Nicolas Cage appeared in"),
          " (r = 0.67)"
        )),
        caption = "Data Sources: Centers for Disease Control & Prevention and Internet Movie Database"
      ) +
      ggplot2::theme(
        plot.title = ggplot2::element_text(color = colors$joker_red),
        plot.caption = ggplot2::element_text(hjust = 0, color = "grey"),
        legend.position = "bottom",
        axis.text.x.top = ggplot2::element_text(color = colors$joker_red),
        axis.title.y.left = ggplot2::element_text(color = colors$joker_red),
        axis.text.y.left = ggplot2::element_text(color = colors$joker_red)
      )
  ) -> tvp
  attr(tvp, 'class') <- c("ggplot_suppressed", class(tvp))
  return(tvp)
}



#' Tyler Vigen Plot 3
#'
#' @description Per capita cheese consumption correlates with Number of people who died by becoming tangled in their bedsheets (r = 0.95).
#' @references
#'   \href{https://www.census.gov/compendia/statab/2012/tables/12s0217.xls}{Department of Agriculture}
#'
#'   \href{https://wonder.cdc.gov/}{Centers for Disease Control & Prevention}
#' @keywords cheese, death, bed, sheet
#' @export
tylervigen_plot3 <- function() {
  suppressWarnings(
    whysospurious::tylervigen %>%
      dplyr::select(year, bed_deaths, cheese_percap) %>%
      dplyr::filter(complete.cases(.)) %>%
      dplyr::transmute(
        year = year,
        series_a = (bed_deaths - 200) / (800 - 200),
        series_b = (cheese_percap - 28.5) / (33.5 - 28.5)
      ) %>%
      tidyr::pivot_longer(tidyr::starts_with("series")) %>%
      ggplot2::ggplot(ggplot2::aes(
        x = year, y = value, color = name
      )) +
      ggplot2::geom_point() +
      ggplot2::geom_smooth(
        method = "loess",
        formula = y ~ x,
        span = 0.4,
        se = FALSE
      ) +
      ggplot2::scale_x_continuous(
        ggplot2::element_blank(),
        breaks = seq(2000, 2009, 3),
        sec.axis = ggplot2::sec_axis(
          ~ .,
          name = ggplot2::element_blank(),
          breaks = seq(2000, 2009, 3)
        )
      ) +
      ggplot2::scale_y_continuous(
        "Cheese consumed",
        breaks = seq(0, 1, 1 / 3),
        labels = seq(28.5, 33, 1.5),
        limits = c(0, 1),
        sec.axis = ggplot2::sec_axis(
          ~ .,
          name = "Bedsheet tanglings",
          breaks = seq(0, 1, 1 / 3),
          labels = seq(200, 800, 200)
        )
      ) +
      ggplot2::scale_color_manual(
        ggplot2::element_blank(),
        labels = c("Bedsheet tanglings",
                   "Cheese consumed"),
        values = c("black",
                   colors$joker_red)
      ) +
      ggplot2::labs(
        title = "Per capita cheese consumption",
        subtitle = expression(paste(
          "correlates with ",
          bold("Death by bedsheet entanglement"),
          " (r = 0.95)"
        )),
        caption = "Data Sources: U.S. Department of Agriculture and Centers for Disease Control & Prevention"
      ) +
      ggplot2::theme(
        plot.title = ggplot2::element_text(color = colors$joker_red),
        plot.caption = ggplot2::element_text(hjust = 0, color = "grey"),
        legend.position = "bottom",
        axis.text.x.top = ggplot2::element_text(color = colors$joker_red),
        axis.title.y.left = ggplot2::element_text(color = colors$joker_red),
        axis.text.y.left = ggplot2::element_text(color = colors$joker_red)
      )
  ) -> tvp
  attr(tvp, 'class') <- c("ggplot_suppressed", class(tvp))
  return(tvp)
}



#' Tyler Vigen Plot 9
#'
#' @description People who drowned after falling out of a fishing boat correlates with marriage rate in Kentucky (r = 0.95).
#' @references
#'   \href{https://wonder.cdc.gov/}{Centers for Disease Control & Prevention}
#'
#'   \href{https://www.census.gov/compendia/statab/cats/births_deaths_marriages_divorces/marriages_and_divorces.html}{National Vital Statistics Reports}
#' @keywords drowning, fishing, boat, marriage, Kentucky
#' @export
tylervigen_plot9 <- function() {
  whysospurious::tylervigen %>%
    dplyr::select(year, kentucky_marriage_rate, fishing_drownings) %>%
    dplyr::transmute(
      year = year,
      series_a = (kentucky_marriage_rate - 7) / (11 - 7),
      series_b = (fishing_drownings - 0) / (20 - 0)
    ) %>%
    tidyr::pivot_longer(tidyr::starts_with("series")) %>%
    ggplot2::ggplot(ggplot2::aes(x = year, y = value, color = name)) +
    ggplot2::geom_point() +
    ggplot2::geom_smooth(
      method = "loess",
      formula = y ~ x,
      span = 0.4,
      se = FALSE
    ) +
    ggplot2::scale_x_continuous(
      ggplot2::element_blank(),
      breaks = seq(1999, 2010, 1),
      limits = c(1999, 2010),
      sec.axis = ggplot2::sec_axis(
        ~ .,
        name = ggplot2::element_blank(),
        breaks = seq(1999, 2010, 1)
      )
    ) +
    ggplot2::scale_y_continuous(
      "Fishing boat deaths",
      breaks = seq(0, 1, 1 / 2),
      labels = seq(0, 20, 10),
      limits = c(0, 1),
      sec.axis = ggplot2::sec_axis(
        ~ .,
        name = "Kentucky marriages",
        breaks = seq(0, 1, 1 / 4),
        labels = paste(seq(7, 11, 1), "per 1,000")
      )
    ) +
    ggplot2::scale_color_manual(
      ggplot2::element_blank(),
      labels = c("Kentucky marriages",
                 "Fishing boat deaths"),
      values = c("black",
                 colors$joker_red)
    ) +
    ggplot2::labs(title = "People who drowned after falling out of a fishing boat",
                  subtitle = expression(paste(
                    "correlates with ",
                    bold("Marriage rate in Kentucky"),
                    " (r = 0.95)"
                  )),
                  caption = "Data Sources: Centers for Disease Control & Prevention and National Vital Statistics Reports") +
    ggplot2::theme(
      plot.title = ggplot2::element_text(color = colors$joker_red),
      plot.caption = ggplot2::element_text(hjust = 0, color = "grey"),
      legend.position = "bottom",
      axis.text.x.top = ggplot2::element_text(color = colors$joker_red),
      axis.title.y.left = ggplot2::element_text(color = colors$joker_red),
      axis.text.y.left = ggplot2::element_text(color = colors$joker_red)
    ) -> tvp
  attr(tvp, 'class') <- c("ggplot_suppressed", class(tvp))
  return(tvp)
}