This document shows how to obtain the pollutant data from the Air Quality Ontario (AQO) website, convert it to the desired format, calculate p-values, and produce the corresponding plots in the paper.
First let's load the R packages needed to perform all calculations.
```r
# load packages
## require(tidyverse)
require(aq2020)
require(tidyr)
require(dplyr)
require(rvest)
require(lubridate)
```
This will create a data frame with the following columns:

- `station`: The station name.
- `station_id`: The numeric station ID (for webscraping).
- `pollutant`: The pollutant name (`O3`, `PM25`, `NO2`, `CO`, `SO2`).
- `poll_id`: The numeric pollutant ID (for webscraping).
- `has_poll`: `TRUE`/`FALSE`, depending on whether the given station has records for the given pollutant.

This data frame has already been saved in the aq2020 package and can be accessed as follows:
```r
pollutant_info
```
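For example, since `has_poll` flags which station/pollutant combinations actually have records, we can list the stations measuring a given pollutant. A small usage sketch, relying only on the columns documented above:

```r
# example: stations with CO records
pollutant_info %>%
  filter(pollutant == "CO", has_poll) %>%
  pull(station)
```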
The following code shows how to construct this data frame from scratch and save it as a CSV file.
**Warning:** The dataset `pollutant_info` in the aq2020 package may differ from a more recent webscrape produced by the code below, because the AQO periodically updates its station IDs.
```r
# pollutant names and codes
pollutants <- c(O3 = 122, PM25 = 124, NO2 = 36, CO = 46, SO2 = 9)

# station names and ids
stations <- c(
  Barrie = 47045,
  Belleville = 54012,
  Brantford = 21005,
  Burlington = 44008,
  Brampton = 46090,
  Chatham = 13001,
  Cornwall = 56051,
  Grand_Bend = 15020,
  Guelph = 28028,
  Hamilton_Downtown = 29000,
  Hamilton_West = 29118,
  Hamilton_Mountain = 29214,
  Kingston = 52023,
  Kitchener = 26060,
  London = 15026,
  Milton = 44029,
  Mississauga = 46108,
  Newmarket = 48006,
  North_Bay = 75010,
  Oakville = 44017,
  Ottawa_Downtown = 51001,
  Oshawa = 45027,
  Parry_Sound = 49005,
  Peterborough = 59006,
  Port_Stanley = 16015,
  Sarnia = 14111,
  Sault_Ste_Marie = 71078,
  St_Catharines = 27067,
  Sudbury = 77233,
  Thunder_Bay = 63203,
  Toronto_Downtown = 31103,
  Toronto_East = 33003,
  Toronto_West = 35125,
  Windsor_Downtown = 12008,
  Windsor_West = 12016,
  Toronto_North = 34021
)

# keep only the combinations of stations and pollutants
# for which there is data on the AQO website
station_polls <- sapply(stations, function(station_id) {
  ## station <- paste0("http://www.airqualityontario.com/history/station.php?stationid=",
  ##                   station_id,
  ##                   "&submit_station=Choose+Station")
  station <- paste0("http://www.airqualityontario.com/history/station.php?stationid=",
                    station_id)
  station <- read_html(station)
  station %>%
    html_node("#right_column table.resourceTable") %>%
    html_table(header = FALSE) %>%
    as_tibble() %>%
    filter(X1 == "Pollutants Measured:") %>%
    pull(X2)
})
statpoll_hasdata <- sapply(c("O3", "PM2[.]5", "NO2", "CO", "SO2"),
                           function(poll) {
                             grepl(poll, station_polls)
                           })

# combine into a tibble
statpoll <- as_tibble(statpoll_hasdata) %>%
  mutate(station = names(stations),
         station_id = as.numeric(stations)) %>%
  pivot_longer(O3:SO2, names_to = "pollutant", values_to = "has_poll") %>%
  mutate(pollutant = gsub("\\[[.]\\]", "", pollutant),
         poll_id = as.numeric(pollutants[pollutant])) %>%
  select(station, station_id, pollutant, poll_id, has_poll)

# uncomment the line below to save this data as a CSV file
## write_csv(statpoll, path = "aq2020-pollutant_info.csv")
```
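Given the warning above, it can be worth diffing a fresh scrape against the packaged table before relying on either. A minimal sketch, assuming the `statpoll` tibble constructed by the code above:

```r
# rows in the fresh scrape that do not match the packaged `pollutant_info`
# (zero rows means the AQO station/pollutant information is unchanged)
anti_join(statpoll, pollutant_info,
          by = c("station", "station_id", "pollutant", "poll_id", "has_poll"))
```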
For each station and pollutant combination for which there is data on the AQO website, the following code pulls this data from the website by year, with the option to pick only specific months. Here we'll take all available data from 2017-2020.
The first step is to save each webscrape as a file of the form `Barrie_NO2_2020_1_12.rds`: a compressed RDS file containing, in this example, the NO2 data from the Barrie station for 2020, for months January (1) through December (12).
**Warning:** The webscraping below may not produce the same results as those saved in the aq2020 package, because the AQO periodically updates the data on its website.
```r
# where to save raw queries
data_path <- file.path("data", "pollutant", "raw")

# helper function to create file names for saving data
get_filename <- function(path, ..., ext = "rds") {
  fname <- paste0(c(...), collapse = "_")
  file.path(path, paste0(fname, ".", ext))
}

# setup webscrape query information
query_dates <- tibble(
  year = 2017:2020,
  start_month = 1,
  end_month = 12
)

# combine with station/pollutant information
query_info <- pollutant_info %>%
  filter(has_poll) %>% # keep only stat/poll combos for which there is data
  mutate(by = rep(1, n())) %>%
  full_join(query_dates %>% mutate(by = rep(1, n()))) %>%
  select(year, start_month, end_month, station, station_id, pollutant, poll_id)
nquery <- nrow(query_info)

# helper function to:
#
# - webscrape a query (given station/pollutant/year)
# - format consistently (see [aq2020::get_aqoyr()])
# - save as `data_path/station_pollutant_year_startmonth_endmonth.rds`
#
# if any of the queries fail, it saves the file with the content `NA`.
get_query <- function(query) {
  message("Station: ", query$station,
          ", Pollutant: ", query$pollutant,
          ", Year: ", query$year)
  query_yr <- tryCatch(
    get_aqoyr(station_id = query$station_id,
              poll_id = query$poll_id,
              year = query$year,
              start_month = query$start_month,
              end_month = query$end_month,
              fill = TRUE),
    error = function(e) NA)
  # save data
  saveRDS(query_yr,
          file = get_filename(path = data_path,
                              query$station, query$pollutant, query$year,
                              query$start_month, query$end_month))
}

# number of seconds to wait between website scrapes
wait_time <- c(min = .5, max = 1)

# query subset
# in this case, we just need 2020 as the other years
# were saved in a previous run
sub_query <- which(query_info$year == 2020)

system.time({
  for(ii in sub_query) {
    query <- query_info[ii,]
    get_query(query)
    # wait between queries for a random amount of time
    Sys.sleep(runif(1, min = wait_time["min"], max = wait_time["max"]))
  }
})
```
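As a quick check of the naming convention, calling `get_filename()` with the Barrie/NO2 example from above reproduces the file name described earlier (expected output shown as a comment):

```r
# file name for the Barrie NO2 data, 2020, months 1-12
get_filename(data_path, "Barrie", "NO2", 2020, 1, 12)
## [1] "data/pollutant/raw/Barrie_NO2_2020_1_12.rds"
```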
Now let's check that all station/pollutant datasets have been successfully queried for years 2017-2020:
```r
# check if we have all yearly datasets
sub_query <- which(query_info$year %in% 2017:2020)
bad_ind <- sapply(sub_query, function(ii) {
  query <- query_info[ii,]
  ## message("Station: ", query$station,
  ##         ", Pollutant: ", query$pollutant,
  ##         ", Year: ", query$year)
  query_yr <- readRDS(get_filename(path = data_path,
                                   query$station, query$pollutant, query$year,
                                   query$start_month, query$end_month))
  all(is.na(query_yr))
})

# station/pollutant/year combinations for which the webscrape failed
query_info[sub_query[bad_ind],] %>%
  print(n = Inf)
```
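Before investigating further, any failures caused by transient network errors (rather than genuinely missing data) can simply be rerun; a minimal sketch using the objects defined above:

```r
# retry only the failed station/pollutant/year queries
for(ii in sub_query[bad_ind]) {
  get_query(query_info[ii, ])
  Sys.sleep(runif(1, min = wait_time["min"], max = wait_time["max"]))
}
```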
We can see that the webscrape failed in a few years for a few of the station/pollutant combinations. We can manually check the AQO website to see what's going on. For example, to see what happened in Milton for O3 in 2017, take the following URL:
http://www.airqualityontario.com/history/index.php?c=Academic&s={station_id}&y={year}&p={poll_id}&m={start_month}&e={end_month}&t=html&submitter=Search&i=1
and replace the following terms:

- `{station_id}`: The station ID for Milton, which is 44029.
- `{year}`: The year in question, which in this case is 2017.
- `{poll_id}`: The pollutant ID, which in this case is 122.
- `{start_month}`: The start month as a number, which is 1 (January).
- `{end_month}`: The end month as a number, which is 12 (December).

So the URL above becomes:
http://www.airqualityontario.com/history/index.php?c=Academic&s=44029&y=2017&p=122&m=1&e=12&t=html&submitter=Search&i=1
At the time of this writing, following the URL above produces the message:
> Sorry no results found for Ozone at Milton(44029) between January and December of the year 2017.
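To avoid assembling such URLs by hand, the template can be filled in from a row of `query_info`. The helper below is a hypothetical convenience for spot checks, not part of the aq2020 package:

```r
# hypothetical helper: build the AQO history URL for one row of `query_info`
aqo_url <- function(query) {
  paste0("http://www.airqualityontario.com/history/index.php?c=Academic",
         "&s=", query$station_id,
         "&y=", query$year,
         "&p=", query$poll_id,
         "&m=", query$start_month,
         "&e=", query$end_month,
         "&t=html&submitter=Search&i=1")
}

# reproduces the Milton/O3/2017 URL above
aqo_url(list(station_id = 44029, year = 2017, poll_id = 122,
             start_month = 1, end_month = 12))
```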
A raw data file from the webscrape looks like this:
```r
# look at the first query
query <- query_info[1,]
query_yr <- readRDS(get_filename(path = data_path,
                                 query$station, query$pollutant, query$year,
                                 query$start_month, query$end_month))

# display
message("Station: ", query$station,
        ", Pollutant: ", query$pollutant,
        ", Year: ", query$year)
as_tibble(query_yr)
```
Cleaning the raw concentration data involves the following steps:

- Combine the per-station/pollutant/year files into a single data frame.
- Replace the missing-value codes `-999` and `9999` by `NA`.
- Average the hourly measurements (`H01`-`H24`) into daily means, yielding the columns `Date`, `Station`, `Pollutant`, and `Concentration`.

The result of these processing steps is available in the aq2020 package in the `pollutant_data` object:
```r
pollutant_data
```
**Warning:** The 2019 and 2020 data are slightly different now (Jan 23, 2021) from what they were when last scraped (July 7, 2020). The original data are saved in the folder `/Users/mlysy/Documents/proj/chemODE/covid/data/raw`.
```r
# merge webscrape files
pollutant_data <- lapply(which(!bad_ind), function(ii) {
  query <- query_info[ii,]
  readRDS(get_filename(path = data_path,
                       query$station, query$pollutant, query$year,
                       query$start_month, query$end_month)) %>%
    as_tibble() %>%
    mutate(Station = query$station,
           Pollutant = query$pollutant) %>%
    select(Date, Station, Pollutant, H01:H24)
}) %>% bind_rows()

# process: convert dates, recode missing values, average hours into daily means
pollutant_data <- pollutant_data %>%
  mutate(Date = ymd(Date)) %>%
  mutate_at(vars(H01:H24), .funs = ~na_if(., 9999)) %>%
  mutate_at(vars(H01:H24), .funs = ~na_if(., -999)) %>%
  pivot_longer(cols = H01:H24,
               names_to = "Hour",
               values_to = "Concentration") %>%
  group_by(Date, Station, Pollutant) %>%
  summarize(Concentration = mean(Concentration, na.rm = TRUE),
            .groups = "drop")
rownames(pollutant_data) <- NULL
```
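As a sanity check on the processing above, the daily data should contain at most one row per date/station/pollutant combination; a quick sketch:

```r
# should return zero rows: duplicates in the daily averages
pollutant_data %>%
  count(Date, Station, Pollutant) %>%
  filter(n > 1)
```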