# Set knitr options
knitr::opts_chunk$set(echo = FALSE,          # echo code so reader can see what is happening
                      warning = FALSE,
                      message = FALSE,
                      fig_caption = TRUE,
                      fig_height = 6,        # default, make it bigger to stretch vertical axis
                      fig_width = 8,
                      fig_width = 8,         # full width
                      tidy = TRUE)           # tidy up code in case echo = TRUE
# Load Packages ----
rmdLibs <- c("ggplot2",
            "kableExtra",
            "plotly",
            "skimr")

dkUtils::loadLibraries(rmdLibs)

# Adjust knitr options if required
knitr::opts_chunk$set(echo = TRUE)

# Log compile time:
startTime <- proc.time()

# Parameters ----
# set xlim for plotly to reduce plot size & load speed
xlimMinDateTime <- lubridate::as_datetime("2020-01-01 00:00:00")
xlimMaxDateTime <- lubridate::as_datetime("2020-07-01 00:00:00")
xlimMinDate <- lubridate::as_date("2020-01-01")
xlimMaxDate <- lubridate::as_date("2020-07-01")

oneYearAgo <- lubridate::as_datetime(now() - (365*24*60*60))

# Functions ----
makeDotPlot <- function(dt, xVar, yVar, byVar, yLab){
  p <- ggplot2::ggplot(dt, aes(x = get(xVar), 
                               y = get(yVar),
                               colour = get(byVar),
                               alpha = 1/100 # https://ggplot2.tidyverse.org/reference/geom_point.html
                               )
                       ) +
    geom_point(size = 1, # small
               shape = 4 # small cross shape
               ) +
    scale_color_discrete(name = eval(byVar)) +
    guides(alpha = FALSE) + # remove alpha legend 
    labs(x = "Time",
         y = eval(yLab)) +  
    theme(legend.position="bottom") +
    guides(colour = guide_legend(nrow = 2)) # forces 2 rows to legend
  return(p)
}

makeTilePlot <- function(dt, xVar, yVar, fillVar, yLab){
  p <- ggplot2::ggplot(dt, aes(x = get(xVar), 
                               y = get(yVar),
                               fill = get(fillVar)
                               )
                       ) +
    geom_tile() +
    scale_fill_continuous(low = "green", high = "red", name = "Value") +
    labs(x = "Time",
         y = yLab) +  
    theme(legend.position="bottom")
  return(p)
}

About

Contributions

Please note that authorship is alphabetical. Contributions are listed below - see github for details and who to blame for what :-).

Code

Citation

If you wish to refer to any of the material from this report please cite as:

Report circulation:

Report purpose:

This work is (c) r format(Sys.time(), format = "%Y") the University of Southampton.

Introduction

Data downloaded from http://southampton.my-air.uk. See also https://www.southampton.gov.uk/environmental-issues/pollution/air-quality/.

Southampton City Council collects various forms of air quality data at the sites shown in \@ref(tab:showSites). WHO publishes information on the health consequences and "acceptable" exposure levels for each of these.

lDT <- data.table::melt(origDataDT,
                     id.vars=c("site","dateTimeUTC"),
                     measure.vars = c("co","no2","nox","oz","pm10","pm2_5","so2"),
                     value.name = "value" # varies 
  )

lDT[, obsDate := lubridate::date(dateTimeUTC)]

# remove NA
#lDT <- lDT[!is.na(value)]
t <- lDT[!is.na(value),.(from = min(dateTimeUTC),
           to = max(dateTimeUTC),
           nObs = .N), keyby = .(site, variable)]

kableExtra::kable(t, caption = "Dates data != NA available by site and measure",
                  digits = 2) %>%
  kable_styling()

Summarise data

Summarise previously downloaded and processed data... Note that this may not be completely up to date.

t <- lDT[, .(mean = mean(value, na.rm = TRUE)
            ), keyby = .(site, variable)]

kableExtra::kable(t, caption = "Mean values per site (NaN indicates not measured)") %>%
  kable_styling()

Table \@ref(tab:summariseData) gives an indication of the availability of the different measures.

Analysis

In this section we present graphical analysis of the previoulsy downloaded data. Note this is just a snapshot of the data available.

Nitrogen Dioxide

yLab <- "Nitrogen Dioxide (ug/m3)"
dt <- lDT[variable == "no2"]
t <- dt[, .(mean = mean(value, na.rm = TRUE),
            sd = sd(value, na.rm = TRUE),
            min = min(value, na.rm = TRUE),
            max = max(value, na.rm = TRUE)), keyby = .(site)]
kableExtra::kable(t, caption = "Summary of NO2 data") %>%
  kable_styling()

Table \@ref(tab:No2Summary) suggests that there may be a few (r nrow(dt[value < 0])) negative values. These are summarised in \@ref(tab:testNegNo2) while Figure \@ref(fig:testNegNo2) shows the availability and levels of the pollutant data over time.

t <- head(dt[value <0], 10)
kableExtra::kable(t, caption = "Negative NO2 values (up to first 10)") %>%
  kable_styling()

t <- table(dt[value < 0]$site)
kableExtra::kable(t, caption = "Negative NO2 values (count by site)") %>%
  kable_styling()

# dt,xvar, yvar,fillVar, yLab
p <- makeTilePlot(dt, xVar = "dateTimeUTC", yVar = "site",
                  fillVar = "value",
                  yLab = yLab)

p

Figure \@ref(fig:plotNo2Daily) shows daily mean values for all sites over time and includes smoother trend lines for each site.

plotDT <- dt[!is.na(value), .(mean = mean(value, na.rm = TRUE)),
             keyby = .(obsDate, site)]

p <- makeDotPlot(plotDT, 
                 xVar = "obsDate", 
                 yVar = "mean", 
                 byVar = "site", 
                 yLab = paste0("Mean daily ", yLab)
                 )

p <- p +
  geom_smooth() + # add smoothed line
  labs(caption = "Trend line = Generalized additive model (gam) with integrated smoothness estimation")

if(doPlotly){
  p
  plotly::ggplotly(p + xlim(xlimMinDate, xlimMaxDate)) # interactive, xlimited # interactive
} else {
  p
}
dt <- dt[dateTimeUTC > oneYearAgo]

t <- dt[value > hourlyNo2Threshold_WHO][order(-value)]

kableExtra::kable(caption = paste0("Values greater than WHO threshold (NO2 > ", 
                                   hourlyNo2Threshold_WHO , ", last 12 months)"), 
                  head(t, 10)) %>%
  kable_styling()


p <- makeDotPlot(dt[!is.na(value)], 
                 xVar = "dateTimeUTC", 
                 yVar = "value", 
                 byVar = "site", 
                 yLab = yLab)

p <- p + geom_hline(yintercept = hourlyNo2Threshold_WHO) +
  labs(caption = "Reference line = WHO hourly threshold")


plotly::ggplotly(p) # for interaction

Figure \@ref(fig:plotNo2Hourly) shows hourly values for all sites for the last 12 months. In this period there were r nrow(t) hours when the hourly Nitrogen Dioxide level breached the relevant WHO hourly threshold (r hourlyNo2Threshold_WHO). The worst 10 cases (if any) are shown in Table \@ref(tab:plotNo2Hourly).

Clearly there are winter peaks, the mean daily values show less variance (and less extremes) than the hourly data and there has also been a decreasing trend over time.

Sulphour Dioxide

yLab <- "Sulphour Dioxide (ug/m3)"
dt <- lDT[variable == "so2"]
t <- dt[, .(mean = mean(value, na.rm = TRUE),
            sd = sd(value, na.rm = TRUE),
            min = min(value, na.rm = TRUE),
            max = max(value, na.rm = TRUE)), keyby = .(site)]
kableExtra::kable(t, caption = "Summary of SO2 data") %>%
  kable_styling()

Figure \@ref(fig:testNegSo2) shows the availability and levels of the pollutant data over time.

t <- head(dt[value <0], 10)
kableExtra::kable(t, caption = "Negative SO2 values (up to first 10)") %>%
  kable_styling()

t <- table(dt[value < 0]$site)
kableExtra::kable(t, caption = "Negative SO2 values (count by site)") %>%
  kable_styling()

# dt,xvar, yvar,fillVar, yLab
p <- makeTilePlot(dt, xVar = "dateTimeUTC", yVar = "site",
                  fillVar = "value",
                  yLab = yLab)

p

Figure \@ref(fig:plotSo2Daily) shows daily mean values for all sites over time and includes smoother trend lines for each site.

plotDT <- dt[!is.na(value), .(mean = mean(value, na.rm = TRUE)),
             keyby = .(obsDate, site)]

p <- makeDotPlot(plotDT, 
                 xVar = "obsDate", 
                 yVar = "mean", 
                 byVar = "site", 
                 yLab = paste0("Mean daily ", yLab)
                 )

# dailySo2Threshold_WHO

p <- p + geom_hline(yintercept = dailySo2Threshold_WHO) +
  geom_smooth() + # add smoothed line
  labs(caption = "Trend line = Generalized additive model (gam) with integrated smoothness estimation")

if(doPlotly){
  p
  plotly::ggplotly(p + xlim(xlimMinDate, xlimMaxDate)) # interactive, xlimited # interactive
} else {
  p
}
dt <- dt[dateTimeUTC > oneYearAgo]

t <- dt[value > hourlyNo2Threshold_WHO][order(-value)]

kableExtra::kable(caption = paste0("Values greater than WHO threshold (NO2 > ", 
                                   hourlyNo2Threshold_WHO , ", last 12 months)"), 
                  head(t, 10)) %>%
  kable_styling()


p <- makeDotPlot(dt[!is.na(value)], 
                 xVar = "dateTimeUTC", 
                 yVar = "value", 
                 byVar = "site", 
                 yLab = yLab)

p <- p + geom_hline(yintercept = hourlyNo2Threshold_WHO) +
  labs(caption = "Reference line = WHO hourly threshold")


plotly::ggplotly(p) # for interaction

Figure \@ref(fig:plotNo2Hourly) shows hourly values for all sites for the last 12 months. In this period there were r nrow(t) hours when the hourly Nitrogen Dioxide level breached the relevant WHO hourly threshold (r hourlyNo2Threshold_WHO). The worst 10 cases (if any) are shown in Table \@ref(tab:plotNo2Hourly).

Clearly there are winter peaks, the mean daily values show less variance (and less extremes) than the hourly data and there has also been a decreasing trend over time.

PM 10

PM 10 data: has more sensors and wider coverage than PM2.5

yLab <- "PM 10 (ug/m3)"
dt <- lDT[variable == "pm10"]

t <- dt[, .(mean = mean(value, na.rm = TRUE),
            sd = sd(value, na.rm = TRUE),
            min = min(value, na.rm = TRUE),
            max = max(value, na.rm = TRUE)), keyby = .(site)]
kableExtra::kable(t, caption = "Summary of pm10 data") %>%
  kable_styling()

Table \@ref(tab:pm10Summary) suggests that there may be a few (r nrow(dt[value < 0])) negative values. These are shown in \@ref(tab:testNegPM10) while \@ref(fig:testNegPM10) shows data availability and PM 10 levels over time at each site.

t <- head(dt[value <0], nrow(dt[value < 0]))
kableExtra::kable(head(t), caption = "Negative PM10 values - first 6") %>%
  kable_styling()

p <- makeTilePlot(lDT[variable == "pm10"], xVar = "dateTimeUTC", yVar = "site",
                  fillVar = "value",
                  yLab = yLab)

p
plotDT <- dt[!is.na(value), .(mean = mean(value, na.rm = TRUE)),
             keyby = .(obsDate, site)]

extremePm10Daily <- plotDT[mean > dailyPm10Threshold_WHO][order(-mean)]

kableExtra::kable(caption = paste0("10 highest values greater than WHO threshold (PM 10 > ", 
                                   dailyPm10Threshold_WHO , ")"), 
                  digits = 2,
                  head(extremePm10Daily, 10)) %>%
  kable_styling()

p <- makeDotPlot(plotDT, 
                 xVar = "obsDate", 
                 yVar = "mean", 
                 byVar = "site", 
                 yLab = paste0("Mean daily ", yLab)
                 )

p <- p + 
  geom_hline(yintercept = dailyPm10Threshold_WHO) +
  geom_smooth() + # add smoothed line
  labs(caption = "Trend line = Generalized additive model (gam) with integrated smoothness estimation\nReference line = WHO PM10 mean daily threshold")

if(doPlotly){
  p
  plotly::ggplotly(p + xlim(xlimMinDate, xlimMaxDate)) # interactive, xlimited # interactive
} else {
  p
}

nDaysOverThreshold <- uniqueN(extremePm10Daily$obsDate)
nDays <- uniqueN(plotDT$obsDate) # need to count days not site-days

Figure \@ref(fig:plotPM10Daily) shows daily values for all sites across the entire dataset and indicates the r nDaysOverThreshold days (r round(100*nDaysOverThreshold/nDays,1)%) that breached the WHO PM10 daily mean exposure threshold (r dailyPm10Threshold_WHO) - see Table \@ref(tab:plotPM10Daily).

dt <- dt[dateTimeUTC > oneYearAgo]
t <- dt[value > 100][order(-value)]

kableExtra::kable(caption = "10 highest hourly values (PM 10 > 100)", head(t)) %>%
  kable_styling()

p <- makeDotPlot(dt[!is.na(value)], 
                 xVar = "dateTimeUTC", 
                 yVar = "value", 
                 byVar = "site", 
                 yLab = yLab)

p <- p + labs(caption = "NB: There is no WHO PM10 hourly threshold")

if(doPlotly){
  p
  plotly::ggplotly(p + xlim(xlimMinDateTime, xlimMaxDateTime)) # interactive, xlimited # interactive
} else {
  p
}

Figure \@ref(fig:plotPM10Hourly) shows hourly PM 10 values for all sites over the last 12 months and suggests there may be outliers (see Table \@ref(tab:plotPM10Hourly)).

PM 2.5

yLab <- "PM 2.5 (ug/m3)"
dt <- lDT[variable == "pm2_5"]
t <- dt[!is.na(value), .(mean = mean(value, na.rm = TRUE),
            sd = sd(value, na.rm = TRUE),
            min = min(value, na.rm = TRUE),
            max = max(value, na.rm = TRUE)), keyby = .(site)]
kableExtra::kable(t, caption = "Summary of pm2_5 data") %>%
  kable_styling()

Table \@ref(tab:pm25Summary) suggests that there may be a few (r nrow(dt[value < 0])) negative values. These are shown in Table \@ref(tab:testNegPM25) while Figure \@ref(fig:testNegPM25) shows data availability and PM 2.5 levels over time at each site.

t <- head(dt[value <0], nrow(dt[value < 0]))
kableExtra::kable(head(t), caption = "Negative pm2_5 values - first 6") %>%
  kable_styling()

p <- makeTilePlot(dt, xVar = "dateTimeUTC", yVar = "site",
                  fillVar = "value",
                  yLab = yLab)

p
plotDT <- dt[!is.na(value), .(mean = mean(value, na.rm = TRUE)),
             keyby = .(obsDate, site)]

extremePm25Daily <- plotDT[mean > dailyPm2.5Threshold_WHO][order(-mean)]

kableExtra::kable(caption = paste0("6 highest values greater than WHO threshold (PM 2.5 > ", 
                                   dailyPm2.5Threshold_WHO , ")"), 
                  digits = 2,
                  head(extremePm25Daily)) %>%
  kable_styling()

p <- makeDotPlot(plotDT, 
                 xVar = "obsDate", 
                 yVar = "mean", 
                 byVar = "site", 
                 yLab = paste0("Mean daily ", yLab)
                 )

p <- p + 
  geom_hline(yintercept = dailyPm2.5Threshold_WHO) +
  geom_smooth() + #add smoothed line
  labs(caption = "Trend line = Generalized additive model (gam) with integrated smoothness estimation\nReference line = WHO daily PM2.5 threshold")


if(doPlotly){
  p
  plotly::ggplotly(p + xlim(xlimMinDate, xlimMaxDate)) # interactive, xlimited # interactive
} else {
  p
}

nDaysOverThreshold <- uniqueN(extremePm25Daily$obsDate)
nDays <- uniqueN(plotDT$obsDate) # need to count days not site-days

Figure \@ref(fig:plotPM25Daily) shows daily values for all sites across the dataset and indicates that the WHO PM2_5 daily mean exposure threshold (r dailyPm2.5Threshold_WHO) was breached on r nDaysOverThreshold days (r round(100*nDaysOverThreshold/nDays,1) %). The 6 worst cases are shown in Table \@ref(tab:plotPM25Daily).

dt <- dt[dateTimeUTC > oneYearAgo]
t <- dt[value > 50][order(-value)]

kableExtra::kable(caption = "Extreme hourly values (PM 2.5 > 50, last 12 months, worst 6)", head(t)) %>%
  kable_styling()

p <- makeDotPlot(dt[!is.na(value)], 
                 xVar = "dateTimeUTC", 
                 yVar = "value", 
                 byVar = "site", 
                 yLab = yLab)

p <- p + labs(caption = "NB: There is no WHO PM2.5 hourly threshold")

if(doPlotly){
  p
  plotly::ggplotly(p + xlim(xlimMinDate, xlimMaxDate)) # interactive, xlimited # interactive
} else {
  p
}

Figure \@ref(fig:plotPM25Hourly) shows hourly values for all sites for the last 12 months while Table \@ref(tab:plotPM25Hourly) reports the 6 worst hours.

Observations

Annex

Original data

skimr::skim(origDataDT)

Long form of original data

skimr::skim(lDT)

Nitrogen Dioxide

t <- lDT[variable == "no2"]

skimr::skim(t)
t <- lDT[variable == "no2" & value > hourlyNo2Threshold_WHO][order(-value)]

kableExtra::kable(caption = paste0("Values greater than WHO threshold (NO2 > ", 
                                   hourlyNo2Threshold_WHO , ")"), 
                  t) %>%
  kable_styling()

PM 10

t <- lDT[variable == "pm10"]

skimr::skim(t)
kableExtra::kable(caption = paste0("PM 10 values greater than WHO threshold (NO2 > ", 
                                   hourlyNo2Threshold_WHO , ")"), 
                  extremePm10Daily) %>%
  kable_styling()

PM 2.5

t <- lDT[variable == "pm2_5"]

skimr::skim(t)
kableExtra::kable(caption = paste0("PM 2.5 values greater than WHO threshold (NO2 > ", 
                                   hourlyNo2Threshold_WHO , ")"), 
                  extremePm25Daily) %>%
  kable_styling()

Runtime

Report generated using knitr in RStudio with r R.version.string running on r R.version$platform (r Sys.info()[3]).

t <- proc.time() - startTime

elapsed <- t[[3]]

Analysis completed in r elapsed seconds ( r round(elapsed/60,2) minutes).

R packages used:

References



CfSOtago/airQual documentation built on Nov. 13, 2020, 8:08 a.m.