knitr::opts_chunk$set( collapse = TRUE, comment = "#>" )
We can use rsinaica
to find out which city is the most PM10-polluted in all of Mexico.
First, we load the packages:
## Auto-install required R packages packs <- c("dplyr", "ggplot2", "gghighlight", "lubridate", "anomalize", "aire.zmvm", "tidyr", "zoo", "plotly", "rsinaica") success <- suppressWarnings(sapply(packs, require, character.only = TRUE)) if (length(names(success)[!success])) { install.packages(names(success)[!success]) sapply(names(success)[!success], require, character.only = TRUE) }
Then we download the data for the whole year of 2017 using the sinaica_param_data
function. Since the maximum data range we can download is 1 month, we have to use a little mapply magic to download the entire year.
# Download all PM10 pollution data in 2017 pm10_2017 <- bind_rows( mapply(sinaica_param_data, "PM10", seq(as.Date("2017-01-01"), as.Date("2017-12-01"), by = "month"), seq(as.Date("2017-02-01"), as.Date("2018-01-01"), by = "month") - 1, SIMPLIFY = FALSE) )
There a few cities that collect PM10 data manually (they collect it through a filter and send it to be weighted to an external lab, sometimes in another country). Let's also download that data:
# Download all manually collected PM10 pollution data in 2017 pm10_2017m <- bind_rows( mapply(sinaica_param_data, "PM10", seq(as.Date("2017-01-01"), as.Date("2017-12-01"), by = "month"), seq(as.Date("2017-02-01"), as.Date("2018-01-01"), by = "month") - 1, "Manual", SIMPLIFY = FALSE) ) pm10_2017 <- bind_rows(pm10_2017, pm10_2017m)
This is what the data looks like:
knitr::kable(head(pm10_2017))
Once we've downloaded the data we filter values below 1 µg/m³ since they're probably calibration errors. And we only include stations that reported for more than 80% of days (292). We also have to take into account that PM10 data is measured as a 24 hour rolling average.
# pm10_2017[which(pm10_2017$value_actual != pm10_2017$value_original),] # pm10_2017[which(!is.na(pm10_2017$date_validated)),] ## filter stations that didn't report at least 47 weeks of the year df_filtered <- pm10_2017 %>% mutate(value = if_else(value < 1, NA_real_, value)) %>% group_by(network_name) %>% filter(!is.na(value)) %>% mutate(nweeks = n_distinct(week(date))) %>% filter(nweeks >= 47) %>% select(-nweeks) %>% ungroup() df_max <- df_filtered %>% complete(station_id, hour = 0:23, date = as.character(seq(as.Date("2017-01-01"), as.Date("2017-12-31"), by = "day"))) %>% group_by(station_id, network_name) %>% arrange(station_id, date, hour) %>% mutate(roll24 = rollapply(value, 24, mean, na.rm = TRUE, partial = 18, fill = NA, align = "right")) %>% ungroup() %>% #summarise(mean = mean(value, na.rm = TRUE)) %>% group_by(date, network_name) %>% summarise(max = max(roll24, na.rm = TRUE)) %>% ungroup() %>% add_count(network_name) %>% ## Only include stations that reported for more than 80% of days (292) filter(n >= (365*.8)) %>% select(-n) %>% filter(is.finite(max)) %>% arrange(date)
When plotting the daily 24 hour average maximums we can see that there are still some obvious errors in the data.
ggplot(df_max, aes(as.Date(date), max)) + geom_line(size = .3, color = "black") + facet_wrap(~ network_name, ncol = 3) + ggtitle(expression(paste("Maximum daily ", PM[10], " values in 2017"))) + xlab("date") + ylab(expression(paste("daily maximum 24 average of ", PM[10], " (", mu,"g/", m^3, ")"))) + theme_bw() + theme(axis.text.x=element_text(angle=60,hjust=1))
It looks like we can safely remove values above 500 µg/m³ and get rid of the Aguascalientes and Monclova networks.
We can also use the anomalize
package to detect extreme values, but actually figuring out if they are errors is a little bit more tricky since fires can temporarily spike PM10 levels as often happens during the winter holidays when people burn trash and tires, and set off fireworks. I've opted to remove them, since I'm interested in the average of the whole year these PM10 outliers are unlikely to have a substantial effect on the rankings.
df_max <- df_max %>% filter(!network_name %in% c("Aguascalientes", "Monclova")) %>% filter(max <= 500) %>% ungroup() %>% group_by(network_name) %>% mutate(date = as.Date(date)) %>% time_decompose(max, method = "stl") %>% anomalize(remainder, method = "iqr", alpha = 0.015) %>% time_recompose() ## Anomaly Visualization df_max %>% plot_anomalies(time_recomposed = TRUE, ncol = 3, alpha_dots = 0.25) + labs(title = "Tidyverse Anomalies", subtitle = "STL + GESD Methods") df_max <- filter(df_max, anomaly != "Yes")
And here is the most PM10-polluted city in Mexico: Monterrey
ggplot(df_max, aes(as.Date(date), observed, group = network_name, color = network_name)) + geom_line() + theme_bw() + ggtitle(expression(paste("Pollution measuring network with highest ", PM[10], " pollution values in 2017")), subtitle = "Based on the mean of the highest 24-hour rolling average daily maximums. Source: SINAICA") + xlab("date") + ylab(expression(paste(PM[10], " ", mu,"g/", m^3))) + gghighlight(mean(observed, na.rm = TRUE), max_highlight = 1L)
knitr::kable( df_max %>% group_by(network_name) %>% summarise(mean = mean(observed, na.rm = TRUE)) %>% arrange(-mean) %>% head(10) )
The top 3 most PM10-polluted cities in Mexico:
top3 <- filter(df_max, network_name %in% c("Torreón", "Guadalajara", "Monterrey"))[, 1:3] top3 <- spread(top3, network_name, observed) top3$date <- as.Date(as.Date(top3$date)) x <- list( title = "date" ) y <- list( title = "daily maximum 24 average of PM10" ) plot_ly(as.data.frame(top3), x = ~date, y = ~Monterrey, name = "Monterrey" , type = 'scatter', mode = 'lines', line = list(color = '#e41a1c'), width = .5) %>% add_trace(y = ~Guadalajara, name = 'Guadalajara', mode = 'lines', line = list(color = '#377eb8'), width = .5) %>% add_trace(y = ~Torreón, name = 'Torreón', mode = 'lines', line = list(color = '#4daf4a'), width = .5) %>% layout(title = "Top 3 most PM10-polluted cities in Mexico", xaxis = x, yaxis = y)
Number of days with bad air quality (Índice IMECA MALO)
df_max %>% group_by(network_name) %>% filter(observed > 75.5) %>% summarise(count = n()) %>% arrange(-count) %>% head() %>% knitr::kable()
Number of days with very bad air quality (Índice IMECA MUY MALO).
df_max %>% group_by(network_name) %>% filter(observed > 215.5) %>% summarise(count = n()) %>% arrange(-count) %>% head() %>% knitr::kable()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.