#' @keywords AirNow
#' @export
#' @import dplyr
#' @import MazamaCoreUtils
#' @import MazamaSpatialUtils
#' @title Create dataframes of AirNow site location metadata
#' @param parameters vector of names of desired pollutants or NULL for all pollutants
#' @param pwfslDataIngestSource identifier for the source of monitoring data, e.g. \code{'AIRNOW'}
#' @param addGoogleMeta logical specifying whether to use Google elevation and reverse geocoding services (currently ignored)
#' @return List of 'meta' dataframes with site metadata for unique parameters (e.g. "PM2.5", "NOX").
#' @description The \code{airnow_createMetaDataframes()} function uses the \code{airnow_downloadSites()} function
#' to download site metadata from AirNow and restructures that data into a format that is compatible
#' with the PWFSLSmoke package \emph{ws_monitor} data model.
#' The \code{meta} dataframe in the \emph{ws_monitor} data model has metadata associated with monitoring
#' site locations for a specific parameter and must contain at least the following columns:
#' \itemize{
#' \item{monitorID -- per deployment unique ID}
#' \item{longitude -- decimal degrees E}
#' \item{latitude -- decimal degrees N}
#' \item{elevation -- height above sea level in meters}
#' \item{timezone -- olson timezone}
#' \item{countryCode -- ISO 3166-1 alpha-2}
#' \item{stateCode -- ISO 3166-2 alpha-2}
#' }
#' The \code{meta} dataframe will have rownames matching \code{monitorID}.
#' This function takes a dataframe obtained from AirNowTech's
#' \code{monitoring_site_locations.dat} file, splits it up into separate dataframes,
#' one for each parameter, and performs the following cleanup:
#' \itemize{
#' \item{convert incorrect values to \code{NA} e.g. longitude=0 & latitude=0}
#' \item{add timezone information}
#' }
#' Parameters included in AirNow data include at least the following list:
#' \enumerate{
#' \item{BARPR}
#' \item{BC}
#' \item{CO}
#' \item{NO}
#' \item{NO2}
#' \item{NO2Y}
#' \item{NO2X}
#' \item{NOX}
#' \item{NOOY}
#' \item{OC}
#' \item{OZONE}
#' \item{PM10}
#' \item{PM2.5}
#' \item{PRECIP}
#' \item{RHUM}
#' \item{SO2}
#' \item{SRAD}
#' \item{TEMP}
#' \item{UV-AETH}
#' \item{WD}
#' \item{WS}
#' }
#' Setting \code{parameters=NULL} will generate a separate dataframe for each of the above parameters.
#' @seealso \link{airnow_downloadSites}
#' @examples
#' \dontrun{
#' # Fail gracefully if any resources are not available
#' try({
#' metaList <- airnow_createMetaDataframes(parameters = "PM2.5")
#' }, silent = FALSE)
#' }
airnow_createMetaDataframes <- function(
  parameters = NULL,
  pwfslDataIngestSource = 'AIRNOW',
  addGoogleMeta = TRUE
) {

  # Downloads the AirNow sites table, cleans it up, restricts it to CA/US/MX,
  # adds spatial metadata via addMazamaMetadata() and returns a list of 'meta'
  # dataframes keyed by parameter name.
  #
  # NOTE: 'addGoogleMeta' is kept for interface compatibility but is currently
  # NOTE: ignored -- the code that would use it is still a TODO (see below).

  logger.debug(" ----- airnow_createMetaDataframes() ----- ")

  # ----- Data Download -------------------------------------------------------

  logger.trace("Downloading AirNow sites metadata ...")

  # Create the tibble that holds the AirNow sites metadata
  # NOTE(review): the trailing argument of this try() call was missing from the
  # NOTE(review): reviewed source; 'silent = TRUE' is assumed -- confirm.
  result <- try({
    airnowTbl <- airnow_downloadSites()
  }, silent = TRUE)

  if ( inherits(result, "try-error") ) {
    err_msg <- geterrmessage()
    logger.error("Unable to obtain sites tibble: %s",err_msg)
    stop(paste0("Unable to obtain sites tibble: ",err_msg))
  }

  logger.trace("Downloaded %d rows of AirNow sites metadata", nrow(airnowTbl))

  # > names(airnowTbl)
  #  [1] "AQSID"          "parameterName"  "siteCode"       "siteName"       "status"
  #  [6] "agencyID"       "agencyName"     "EPARegion"      "latitude"       "longitude"
  # [11] "elevation"      "GMTOffsetHours" "countryCode"    "FIPSCMSACode"   "CMSAName"
  # [16] "FIPSMSACode"    "MSAName"        "FIPSStateCode"  "stateCode"      "GNISCountyCode"
  # [21] "countyName"     "GNISCityCode"   "cityName"

  # ----- Parameter selection -------------------------------------------------

  # Convert "O3" to "OZONE" as is used in all AirNow data files.
  # This happens before validation so that a request for either name is
  # matched against the same, normalized set of parameter names.
  airnowTbl$parameterName[airnowTbl$parameterName %in% "O3"] <- "OZONE"
  parameters[parameters %in% "O3"] <- "OZONE"

  # Remember what was asked for so that messages can report it
  requestedParameters <- parameters
  availableParameters <- unique(airnowTbl$parameterName)

  if ( is.null(parameters) ) {

    parameters <- sort(availableParameters)

  } else {

    # Guarantee that passed in parameters actually exist.
    # The invalid ones must be identified BEFORE 'parameters' is narrowed down,
    # otherwise the difference is always empty and the warning never fires.
    invalidParameters <- dplyr::setdiff(requestedParameters, availableParameters)
    parameters <- dplyr::intersect(requestedParameters, availableParameters)

    if ( length(invalidParameters) > 0 ) {
      logger.warn("Requested parameters not found in AirNow sites metadata: %s", paste0(invalidParameters, collapse=", "))
    }

  }

  # Filter for parameters
  airnowTbl <- dplyr::filter(airnowTbl, airnowTbl$parameterName %in% parameters)

  if ( nrow(airnowTbl) == 0 ) {
    # Report the requested names; the validated vector is empty at this point
    parametersString <- paste(requestedParameters, collapse=", ")
    logger.error("No available sites for: %s",parametersString)
    stop(paste0("No available sites for: ",parametersString))
  }

  # ----- Data cleanup --------------------------------------------------------

  # Remove ' ' from the end of MSAName
  airnowTbl$MSAName <- stringr::str_trim(airnowTbl$MSAName)

  # Remove bad stateCodes
  mask <- airnowTbl$stateCode %in% c('N/A')
  airnowTbl$stateCode[mask] <- as.character(NA)

  # Remove bad countyNames
  mask <- airnowTbl$countyName %in% c('N/A')
  airnowTbl$countyName[mask] <- as.character(NA)

  # Convert countyName from all caps to title case
  airnowTbl$countyName <- stringr::str_to_title(airnowTbl$countyName)

  # NOTE:  Don't stringr::str_to_title(siteName) because it might include all caps identifiers like "USFS"

  # Remove bad locations
  # which() drops comparisons that evaluate to NA so that records with a
  # missing coordinate are not reported as "NA" in the list of bad AQSIDs.
  badLocationIndices <- which(airnowTbl$longitude == 0 & airnowTbl$latitude == 0)
  badLocationIDs <- paste(airnowTbl$AQSID[badLocationIndices], collapse=", ")
  logger.trace("Replacing (0,0) locations with (NA,NA) for AQSIDs: %s", badLocationIDs)
  airnowTbl$longitude[badLocationIndices] <- as.numeric(NA)
  airnowTbl$latitude[badLocationIndices] <- as.numeric(NA)

  # Remove bad elevations (zero seems to be used as a missing value flag)
  # NOTE:  Negative elevations are also set to NA by this test.
  badElevationIndices <- which(airnowTbl$elevation <= 0.0)
  airnowTbl$elevation[badElevationIndices] <- as.numeric(NA)
  airnowTbl$elevation <- round(airnowTbl$elevation, 0) # round to whole meters

  # ----- Subset and add Mazama metadata and USGS elevation -------------------

  # Restrict to North America
  CANAMEX <- c('CA','US','MX')
  airnowTbl <- dplyr::filter(airnowTbl, airnowTbl$countryCode %in% CANAMEX)

  # Keep the codes reported by AirNow for the sanity check below
  airnowCountryCode <- airnowTbl$countryCode
  airnowStateCode <- airnowTbl$stateCode

  # Do spatial searching only for unique locations to speed things up
  sitesUnique <- airnowTbl[!duplicated(airnowTbl$AQSID),]
  sitesUnique <- addMazamaMetadata(sitesUnique, 'longitude', 'latitude', countryCodes = CANAMEX)

  # TODO:  Handle addGoogleMeta
  # if ( addGoogleMeta ) {
  #   # Add elevation, siteName and countyName
  #   sitesUnique <- addGoogleElevation(sitesUnique, 'longitude', 'latitude')
  #   sitesUnique <- addGoogleAddress(sitesUnique, 'longitude', 'latitude')
  # }

  # Now add the per-AQSID Mazama metadata to the larger dataframe

  # NOTE:  We need to remove the columns from airnowTbl that we replace with left_join()
  airnowTbl$countryCode <- NULL
  airnowTbl$stateCode <- NULL

  airnowTbl <- dplyr::left_join(airnowTbl, sitesUnique[,c('AQSID','countryCode','stateCode','timezone')], by='AQSID')

  # Sanity check
  # The mismatches are found with which() rather than tested with
  # if ( any(a != b) ) because that test evaluates to NA, and stops with an
  # error, when a countryCode is missing and no other record differs.
  indices <- which(airnowTbl$countryCode != airnowCountryCode)

  if ( length(indices) > 0 ) {

    AQSIDString <- paste0(airnowTbl$AQSID[indices], collapse=", ")
    logger.trace('addMazamaMetadata() changed the countryCode for AQSIDs: %s', AQSIDString)

    # NOTE:  Neither the simpleCountriesEEZ nor the NaturalEarthAdm1 datasets used in MazamaSpatialUtils
    # NOTE:  is of high enough resolution to accurately assign states and countries for sites on
    # NOTE:  wiggly rivers near boundaries.

    # NOTE:  Although there are some obvious mismatches between locations and sitenames in the AirNow data,
    # NOTE:  we will use their countryCode and stateCode where they disagree with MazamaSpatialUtils.

    airnowTbl$countryCode[indices] <- airnowCountryCode[indices]
    airnowTbl$stateCode[indices] <- airnowStateCode[indices]

  }

  # NOTE:  Best guess is that at this point all monitors currently reporting
  # NOTE:  from US.MX are actually in US.TX
  # NOTE:  CA.CC monitors are tougher
  # NOTE:  The AirNow sites file has at least half a dozen mismatches between lat-lon and associated location information

  mask <- airnowTbl$countryCode %in% 'US' & airnowTbl$stateCode %in% 'MX'
  airnowTbl$stateCode[mask] <- 'TX'

  # Set any remaining invalid stateCodes to NA
  mask <- airnowTbl$stateCode %in% c('CC')
  airnowTbl$stateCode[mask] <- as.character(NA)

  # ----- Data Reshaping ------------------------------------------------------

  logger.trace("Reshaping AirNow sites metadata ...")

  # Create empty list (no pre-allocation needed when lists are referenced by key instead of integer)
  dfList <- list()

  # Separate the data by parameter
  for ( parameter in parameters ) {

    # Create a tbl with unique sites for this parameter
    # NOTE(review): the second stage of the original pipeline was missing from
    # NOTE(review): the reviewed source. De-duplicating on AQSID is assumed
    # NOTE(review): because monitorID, which is built from AQSID, is used for
    # NOTE(review): the rownames below and those must be unique -- confirm.
    tbl <- dplyr::filter(airnowTbl, airnowTbl$parameterName == parameter)
    tbl <- tbl[!duplicated(tbl$AQSID),]

    # Our tibble now contains the following columns:
    #
    # > names(airnowTbl)
    #  [1] "AQSID"          "parameterName"  "siteCode"       "siteName"       "status"
    #  [6] "agencyID"       "agencyName"     "EPARegion"      "latitude"       "longitude"
    # [11] "elevation"      "GMTOffsetHours" "FIPSCMSACode"   "CMSAName"       "FIPSMSACode"
    # [16] "MSAName"        "FIPSStateCode"  "GNISCountyCode" "countyName"     "GNISCityCode"
    # [21] "cityName"       "countryCode"    "stateCode"      "timezone"

    # The PWFSLSmoke v1.0 data model contains the following parameters
    #
    # > names(meta)
    #  [1] "monitorID"             "longitude"             "latitude"              "elevation"
    #  [5] "timezone"              "countryCode"           "stateCode"             "siteName"
    #  [9] "agencyName"            "countyName"            "msaName"               "monitorType"
    # [13] "siteID"                "instrumentID"          "aqsID"                 "pwfslID"
    # [17] "pwfslDataIngestSource" "telemetryAggregator"   "telemetryUnitID"

    meta <- createEmptyMetaDataframe(nrow(tbl))

    # Assign data where we have it
    meta$longitude <- as.numeric(tbl$longitude)
    meta$latitude <- as.numeric(tbl$latitude)
    meta$elevation <- as.numeric(tbl$elevation)
    meta$timezone <- as.character(tbl$timezone)
    meta$countryCode <- as.character(tbl$countryCode)
    meta$stateCode <- as.character(tbl$stateCode)
    meta$siteName <- as.character(tbl$siteName)
    meta$countyName <- as.character(tbl$countyName)
    meta$msaName <- as.character(tbl$MSAName)
    meta$agencyName <- as.character(tbl$agencyName)
    meta$monitorType <- as.character(NA)
    meta$siteID <- as.character(tbl$AQSID)
    meta$instrumentID <- "01" # Monitors are not identified and we should only have a single site-monitor
    meta$aqsID <- as.character(tbl$AQSID)
    meta$pwfslID <- as.character(NA)
    meta$pwfslDataIngestSource <- as.character(pwfslDataIngestSource)
    meta$telemetryAggregator <- as.character(NA)
    meta$telemetryUnitID <- as.character(NA)

    meta$monitorID <- paste(meta$siteID, meta$instrumentID, sep='_')

    # Assign rownames
    rownames(meta) <- meta$monitorID

    dfList[[parameter]] <- meta

  }

  # NOTE(review): the end of the function was missing from the reviewed source;
  # NOTE(review): returning the list of 'meta' dataframes is assumed -- confirm.
  return(dfList)

}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.