In diegovalle/rsinaica: Download Data from Mexico's Air Quality Information System

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

We can use rsinaica to find out which city is the most PM₁₀-polluted in all of Mexico.

First, we load the packages:

## Auto-install required R packages
packs <- c("dplyr", "ggplot2", "gghighlight", "lubridate", "anomalize", 
           "aire.zmvm", "tidyr", "zoo", "plotly", "rsinaica")
success <- suppressWarnings(sapply(packs, require, character.only = TRUE))
if (length(names(success)[!success])) {
  install.packages(names(success)[!success])
  sapply(names(success)[!success], require, character.only = TRUE)
}

Download

Then we download the data for the whole year of 2017 using the sinaica_param_data function. Since the maximum data range we can download is 1 month, we have to use a little mapply magic to download the entire year.

# Download all PM10 pollution data in 2017
pm10_2017 <- bind_rows(
  mapply(sinaica_param_data,
         "PM10",
         seq(as.Date("2017-01-01"), as.Date("2017-12-01"), by = "month"),
         seq(as.Date("2017-02-01"), as.Date("2018-01-01"), by = "month") - 1,
         SIMPLIFY = FALSE)
)

There a few cities that collect PM₁₀ data manually (they collect it through a filter and send it to be weighted to an external lab, sometimes in another country). Let's also download that data:

# Download all manually collected PM10 pollution data in 2017
pm10_2017m <- bind_rows(
  mapply(sinaica_param_data,
         "PM10",
         seq(as.Date("2017-01-01"), as.Date("2017-12-01"), by = "month"),
         seq(as.Date("2017-02-01"), as.Date("2018-01-01"), by = "month") - 1,
         "Manual",
         SIMPLIFY = FALSE)
)
pm10_2017 <-  bind_rows(pm10_2017, pm10_2017m)

This is what the data looks like:

knitr::kable(head(pm10_2017))

Cleanup

Once we've downloaded the data we filter values below 1 µg/m³ since they're probably calibration errors. And we only include stations that reported for more than 80% of days (292). We also have to take into account that PM₁₀ data is measured as a 24 hour rolling average.

# pm10_2017[which(pm10_2017$value_actual != pm10_2017$value_original),]
# pm10_2017[which(!is.na(pm10_2017$date_validated)),]

## filter stations that didn't report at least 47 weeks of the year
df_filtered <- pm10_2017 %>%
  mutate(value = if_else(value < 1, NA_real_, value)) %>%
  group_by(network_name) %>%
  filter(!is.na(value)) %>%
  mutate(nweeks = n_distinct(week(date))) %>%
  filter(nweeks >= 47) %>%
  select(-nweeks) %>%
  ungroup()



df_max <- df_filtered %>%
  complete(station_id,
           hour = 0:23,
           date = as.character(seq(as.Date("2017-01-01"), as.Date("2017-12-31"), by = "day"))) %>%
  group_by(station_id, network_name) %>%
  arrange(station_id, date, hour) %>%
  mutate(roll24 = rollapply(value, 24, mean, na.rm = TRUE, partial = 18, 
                            fill = NA, align = "right")) %>%
  ungroup() %>%
  #summarise(mean = mean(value, na.rm = TRUE)) %>%
  group_by(date, network_name) %>%
  summarise(max = max(roll24, na.rm = TRUE)) %>%
  ungroup() %>%
  add_count(network_name) %>%
  ## Only include stations that reported for more than 80% of days (292)
  filter(n >= (365*.8))  %>%
  select(-n) %>%
  filter(is.finite(max)) %>%
  arrange(date)

When plotting the daily 24 hour average maximums we can see that there are still some obvious errors in the data.

ggplot(df_max, aes(as.Date(date), max)) +
  geom_line(size = .3, color = "black") +
  facet_wrap(~ network_name, ncol = 3) +
  ggtitle(expression(paste("Maximum daily ", 
                           PM[10], " values in 2017"))) +
  xlab("date") +
  ylab(expression(paste("daily maximum 24 average of ", PM[10], 
                        " (", mu,"g/", m^3, ")"))) +
  theme_bw() +
  theme(axis.text.x=element_text(angle=60,hjust=1))

It looks like we can safely remove values above 500 µg/m³ and get rid of the Aguascalientes and Monclova networks.

Anomalies

We can also use the anomalize package to detect extreme values, but actually figuring out if they are errors is a little bit more tricky since fires can temporarily spike PM₁₀ levels as often happens during the winter holidays when people burn trash and tires, and set off fireworks. I've opted to remove them, since I'm interested in the average of the whole year these PM₁₀ outliers are unlikely to have a substantial effect on the rankings.

df_max <- df_max %>%
  filter(!network_name %in% c("Aguascalientes", "Monclova")) %>%
  filter(max <= 500) %>%
  ungroup() %>%
  group_by(network_name) %>%
  mutate(date = as.Date(date)) %>%
  time_decompose(max, method = "stl") %>%
  anomalize(remainder, method = "iqr", alpha = 0.015) %>%
  time_recompose()


## Anomaly Visualization
df_max %>% plot_anomalies(time_recomposed = TRUE, ncol = 3, alpha_dots = 0.25) +
  labs(title = "Tidyverse Anomalies", subtitle = "STL + GESD Methods") 

df_max <- filter(df_max, anomaly != "Yes")

Most PM₁₀-Polluted City

And here is the most PM₁₀-polluted city in Mexico: Monterrey

ggplot(df_max, aes(as.Date(date), observed,
                   group = network_name, color = network_name)) +
  geom_line()  +
  theme_bw() +
  ggtitle(expression(paste("Pollution measuring network with highest ", 
                           PM[10], " pollution values in 2017")),
          subtitle = "Based on the mean of the highest 24-hour rolling average daily maximums. Source: SINAICA") +
  xlab("date") +
  ylab(expression(paste(PM[10], " ", mu,"g/", m^3))) +
  gghighlight(mean(observed, na.rm = TRUE), max_highlight = 1L)

Top polluted cities

knitr::kable(
  df_max %>%
    group_by(network_name) %>%
    summarise(mean = mean(observed, na.rm = TRUE)) %>%
    arrange(-mean) %>%
    head(10)
  )

The top 3 most PM₁₀-polluted cities in Mexico:

top3 <- filter(df_max, network_name %in% c("Torreón",
                                           "Guadalajara",
                                           "Monterrey"))[, 1:3]

top3 <- spread(top3, network_name, observed)
top3$date <- as.Date(as.Date(top3$date))
x <- list(
  title = "date"
)
y <- list(
  title = "daily maximum 24 average of PM10"
)

plot_ly(as.data.frame(top3), x = ~date, y = ~Monterrey, name = "Monterrey" ,
        type = 'scatter', mode = 'lines', line = list(color = '#e41a1c'), width = .5) %>%
  add_trace(y = ~Guadalajara, name = 'Guadalajara', mode = 'lines', line = list(color = '#377eb8'), width = .5) %>%
  add_trace(y = ~Torreón, name = 'Torreón', mode = 'lines', line = list(color = '#4daf4a'), width = .5) %>%
  layout(title = "Top 3 most PM10-polluted cities in Mexico", xaxis = x, yaxis = y)

Days with bad air quality

Number of days with bad air quality (Índice IMECA MALO)

df_max %>%
  group_by(network_name) %>%
  filter(observed > 75.5) %>%
  summarise(count = n()) %>%
  arrange(-count) %>%
  head() %>%
  knitr::kable()

Number of days with very bad air quality (Índice IMECA MUY MALO).

df_max %>%
  group_by(network_name) %>%
  filter(observed > 215.5) %>%
  summarise(count = n()) %>%
  arrange(-count) %>%
  head() %>%
  knitr::kable()