R/data.R
In rfars: Download and Analyze Crash Data

# NOTE(review): the variable count in @format was aligned with the nine \item
# entries listed below; confirm both counts against dim(geo_relations).
#' Synonym table for various geographical scales
#'
#' A dataset providing different ways to refer to states, counties, and
#' NHTSA regions.
#'
#' @format A data frame with 3,142 rows and 9 variables:
#' \describe{
#'   \item{fips_state}{2-digit FIPS code indicating a state}
#'   \item{fips_county}{3-digit FIPS code indicating a county within a state}
#'   \item{fips_tract}{6-digit FIPS code indicating a tract within a county}
#'   \item{state_name_abbr}{2-character, capitalized state abbreviation}
#'   \item{state_name_full}{fully spelled and case-sensitive state name}
#'   \item{county_name_abbr}{abbreviated county name (usually minus the word 'County')}
#'   \item{county_name_full}{fully spelled and case-sensitive county name}
#'   \item{region}{fully spelled out and case-sensitive NHTSA region and constituent states}
#'   \item{region_abbr}{abbreviated NHTSA region (ne, mw, s, w)}
#' }
#' @source \url{https://www.census.gov/geographies/reference-files/2015/demo/popest/2015-fips.html}
"geo_relations"



#' FARS Codebook
#'
#' A table describing each FARS variable name, value, and corresponding value label.
#'
#' @format A data frame with 15,951 rows and 19 variables:
#' \describe{
#'    \item{source}{The source of the data (either FARS or GES/CRSS).}
#'    \item{file}{The data file that contains the given variable.}
#'    \item{name_ncsa}{The original name of the data element.}
#'    \item{name_rfars}{The modified data element name used in rfars.}
#'    \item{label}{The label of the data element itself (not its constituent values).}
#'    \item{Definition}{The data element's definition, pulled from the Analytical User's Manual.}
#'    \item{Additional Information}{Additional information on the data element, pulled from the Analytical User's Manual.}
#'    \item{value}{The original value of the data element.}
#'    \item{value_label}{The de-coded value label.}
#'    \item{2014}{Indicator: 1 if valid for 2014, NA otherwise.}
#'    \item{2015}{Indicator: 1 if valid for 2015, NA otherwise.}
#'    \item{2016}{Indicator: 1 if valid for 2016, NA otherwise.}
#'    \item{2017}{Indicator: 1 if valid for 2017, NA otherwise.}
#'    \item{2018}{Indicator: 1 if valid for 2018, NA otherwise.}
#'    \item{2019}{Indicator: 1 if valid for 2019, NA otherwise.}
#'    \item{2020}{Indicator: 1 if valid for 2020, NA otherwise.}
#'    \item{2021}{Indicator: 1 if valid for 2021, NA otherwise.}
#'    \item{2022}{Indicator: 1 if valid for 2022, NA otherwise.}
#'    \item{2023}{Indicator: 1 if valid for 2023, NA otherwise.}
#'    }
#'
#' @details This codebook serves as a useful reference for researchers using FARS data.
#'    The 'source' variable is intended to help combine with the gescrss_codebook.
#'    Data elements are relatively stable but are occasionally discontinued, created anew,
#'    or modified. The year columns ('2014' through '2023') indicate the availability
#'    of data elements and differentiate between different definitions over time.
#'    Users should always check for discontinuities when tabulating cases.
#'
#'    The 'file' variable indicates the file in which the given data element
#'    originally appeared. Here, 'files' refers to
#'    the SAS files downloaded from NHTSA. Most data elements stayed in their original
#'    file. Those that did not were moved to the multi_ files. For example, 'weather'
#'    originates from the 'accident' file, but appears in the multi_acc data object
#'    created by rfars.
#'
#'    The 'name_ncsa' variable describes the data element's name as assigned
#'    by NCSA (the organization within NHTSA that manages the database). To maximize
#'    compatibility between years and ease of use for programming, 'name_rfars'
#'    provides a cleaned naming convention (via janitor::clean_names()).
#'
#'    Each data element has a 'label', a more human-readable version of the
#'    element name. For example, the label for 'road_fnc' is 'Roadway Function Class'.
#'    These are not definitions but may provide enough information to help users
#'    conduct their analysis. Consult the
#'    \href{https://crashstats.nhtsa.dot.gov/Api/Public/ViewPublication/813706}{Analytical User's Manual}
#'    for definitions and further details.
#'
#'    'Definition' and 'Additional Information' were extracted from
#'    the \href{https://crashstats.nhtsa.dot.gov/Api/Public/ViewPublication/813706}{Analytical User's Manual}.
#'
#'    Each data element has multiple 'value'-'value_label' pairs: 'value' represents
#'    the original, non-human-readable value (usually a number), and 'value_label'
#'    represents the corresponding text value. For example, for 'road_fnc', 1 (the 'value')
#'    corresponds to 'Rural-Principal Arterial-Interstate' (the 'value_label'), 2 corresponds to
#'    'Rural-Principal Arterial-Other', etc.
#'
#' @source Codebooks are automatically generated by extracting SAS format catalogs
#'    (.sas7bcat files) and VALUE statements from .sas files during data processing,
#'    then consolidating variable names, labels, and value-label mappings across all
#'    years into searchable reference tables. Source files are published by NHTSA and available
#'    \href{https://www.nhtsa.gov/file-downloads?p=nhtsa/downloads/}{here}.
#'
#' @seealso \code{\link{gescrss_codebook}}
#'
#' @examples
#' head(rfars::fars_codebook)
"fars_codebook"



#' GESCRSS Codebook
#'
#' A table describing each GESCRSS variable name, value, and corresponding value label.
#'
#' @format A data frame with 34,662 rows and 19 variables:
#' \describe{
#'    \item{source}{The source of the data (either FARS or GESCRSS).}
#'    \item{file}{The data file that contains the given variable.}
#'    \item{name_ncsa}{The original name of the data element.}
#'    \item{name_rfars}{The modified data element name used in rfars.}
#'    \item{label}{The label of the data element itself (not its constituent values).}
#'    \item{Definition}{The data element's definition, pulled from the Analytical User's Manual.}
#'    \item{Additional Information}{Additional information on the data element, pulled from the Analytical User's Manual.}
#'    \item{value}{The original value of the data element.}
#'    \item{value_label}{The de-coded value label.}
#'    \item{2014}{Indicator: 1 if valid for 2014, NA otherwise.}
#'    \item{2015}{Indicator: 1 if valid for 2015, NA otherwise.}
#'    \item{2016}{Indicator: 1 if valid for 2016, NA otherwise.}
#'    \item{2017}{Indicator: 1 if valid for 2017, NA otherwise.}
#'    \item{2018}{Indicator: 1 if valid for 2018, NA otherwise.}
#'    \item{2019}{Indicator: 1 if valid for 2019, NA otherwise.}
#'    \item{2020}{Indicator: 1 if valid for 2020, NA otherwise.}
#'    \item{2021}{Indicator: 1 if valid for 2021, NA otherwise.}
#'    \item{2022}{Indicator: 1 if valid for 2022, NA otherwise.}
#'    \item{2023}{Indicator: 1 if valid for 2023, NA otherwise.}
#'    }
#'
#' @details This codebook serves as a useful reference for researchers using GES/CRSS data.
#'    The 'source' variable is intended to help combine with the fars_codebook.
#'    Data elements are relatively stable but are occasionally discontinued, created anew,
#'    or modified. The year columns ('2014' through '2023') indicate the availability
#'    of data elements and differentiate between different definitions over time.
#'    Users should always check for discontinuities when tabulating cases.
#'
#'    The 'file' variable indicates the file in which the given data element
#'    originally appeared. Here, 'files' refers to
#'    the SAS files downloaded from NHTSA. Most data elements stayed in their original
#'    file. Those that did not were moved to the multi_ files. For example, 'weather'
#'    originates from the 'accident' file, but appears in the multi_acc data object
#'    created by rfars.
#'
#'    The 'name_ncsa' variable describes the data element's name as assigned
#'    by NCSA (the organization within NHTSA that manages the database). To maximize
#'    compatibility between years and ease of use for programming, 'name_rfars'
#'    provides a cleaned naming convention (via janitor::clean_names()).
#'
#'    Each data element has a 'label', a more human-readable version of the
#'    element name. For example, the label for 'harm_ev' is 'First Harmful Event'.
#'    These are not definitions but may provide enough information to help users
#'    conduct their analysis. Consult the
#'    \href{https://crashstats.nhtsa.dot.gov/Api/Public/ViewPublication/813707}{CRSS Analytical User's Manual}
#'    for definitions and further details.
#'
#'    'Definition' and 'Additional Information' were extracted from
#'    the \href{https://crashstats.nhtsa.dot.gov/Api/Public/ViewPublication/813707}{CRSS Analytical User's Manual}.
#'
#'    Each data element has multiple 'value'-'value_label' pairs: 'value' represents
#'    the original, non-human-readable value (usually a number), and 'value_label'
#'    represents the corresponding text value. For example, for 'harm_ev', 1 (the 'value')
#'    corresponds to 'Rollover/Overturn' (the 'value_label'), 2 corresponds to
#'    'Fire/Explosion', etc.
#'
#' @source Codebooks are automatically generated by extracting SAS format catalogs
#'    (.sas7bcat files) and VALUE statements from .sas files during data processing,
#'    then consolidating variable names, labels, and value-label mappings across all
#'    years into searchable reference tables. Source files are published by NHTSA and available
#'    \href{https://www.nhtsa.gov/file-downloads?p=nhtsa/downloads/}{here}.
#'
#' @seealso \code{\link{fars_codebook}}
#'
#' @examples
#' head(rfars::gescrss_codebook)
"gescrss_codebook"


# NOTE(review): the examples below rely on a 'source' column, so it is now
# listed under @format. That makes ten \item entries against the stated
# "9 variables"; confirm with dim(annual_counts) whether 'month' is actually
# present in this annual dataset and adjust the count or the list accordingly.
#' Annual Crash Counts by Risk Factors
#'
#' Pre-computed annual crash counts from FARS (fatal crashes) and CRSS (general
#' crash estimates) databases for 2014-2023, broken down by various risk factors and
#' vulnerable road user categories.
#'
#' @format A tibble with 340 rows and 9 variables:
#' \describe{
#'   \item{source}{Database the count comes from (FARS or CRSS)}
#'   \item{year}{Year (2014-2023)}
#'   \item{month}{Month, if included in interval, as the three-letter abbreviation and an ordered factor (Jan=1, Feb=2, etc.)}
#'   \item{what}{Count unit - currently only "crashes"}
#'   \item{states}{Geographic scope - "all" for national-level data}
#'   \item{region}{Regional scope - "all" for national-level data}
#'   \item{urb}{Urban/rural classification - "all" for combined data}
#'   \item{who}{Person type - "all" for all person types}
#'   \item{involved}{Risk factor or crash type. Options include:
#'     \describe{
#'       \item{"any"}{All crashes (general counts)}
#'       \item{"each"}{Each factor listed below, separately}
#'       \item{"alcohol"}{Alcohol-involved crashes}
#'       \item{"bicyclist"}{Crashes involving bicyclists}
#'       \item{"distracted driver"}{Distracted driving crashes}
#'       \item{"drugs"}{Drug-involved crashes}
#'       \item{"hit and run"}{Hit-and-run crashes}
#'       \item{"large trucks"}{Large truck-involved crashes}
#'       \item{"motorcycle"}{Motorcycle crashes}
#'       \item{"older driver"}{Crashes involving older drivers}
#'       \item{"pedalcyclist"}{Crashes involving pedalcyclists}
#'       \item{"pedbike"}{Pedestrian and bicyclist crashes combined}
#'       \item{"pedestrian"}{Pedestrian crashes}
#'       \item{"police pursuit"}{Police pursuit-related crashes}
#'       \item{"roadway departure"}{Roadway departure crashes}
#'       \item{"rollover"}{Rollover crashes}
#'       \item{"speeding"}{Speed-related crashes}
#'       \item{"young driver"}{Crashes involving young drivers}
#'     }
#'   }
#'   \item{n}{Count of crashes. FARS counts represent actual fatal crashes;
#'             CRSS counts represent weighted estimates of all crashes}
#' }
#'
#' @details
#' This dataset provides quick access to national-level annual crash counts
#' without needing to download and process the full datasets. It combines data
#' from two NHTSA databases:
#'
#' \describe{
#'   \item{\strong{FARS}}{Fatal crashes (actual counts)}
#'   \item{\strong{CRSS}}{General crashes (weighted estimates)}
#' }
#'
#' The data can be reproduced using the \code{counts()} function on downloaded
#' FARS and CRSS data with \code{involved = "any"} and \code{involved = "each"}
#' parameters.
#'
#' @examples
#' \dontrun{
#' # View total crashes over time by data source
#' library(dplyr)
#' library(ggplot2)
#'
#' annual_counts %>%
#'   filter(involved == "any") %>%
#'   ggplot(aes(x = year, y = n, fill = source)) +
#'   geom_col(position = "dodge") +
#'   labs(title = "Annual Crash Counts by Data Source",
#'        x = "Year", y = "Number of Crashes")
#'
#' # Compare risk factor trends in fatal crashes
#' annual_counts %>%
#'   filter(source == "FARS",
#'          involved %in% c("alcohol", "speeding", "distracted driver")) %>%
#'   ggplot(aes(x = year, y = n, color = involved)) +
#'   geom_line() +
#'   labs(title = "Fatal Crash Trends by Risk Factor",
#'        x = "Year", y = "Fatal Crashes")
#' }
#'
#' @seealso \code{\link{counts}} for generating custom counts from downloaded data
#' @keywords datasets
"annual_counts"