R/aqStats.R
In openair: Tools for the Analysis of Air Pollution Data

Documented in aqStats

#' Calculate summary statistics for air pollution data by year
#'
#' Calculate a range of air pollution-relevant statistics by year.
#'
#' This function calculates a range of common and air pollution-specific
#' statistics from a data frame. The statistics are calculated on an annual
#' basis and the input is assumed to be hourly data. The function can cope with
#' several sites and years e.g. using \code{type = "site"}. The user can control
#' the output by setting \code{transpose} appropriately.
#'
#' Note that the input data is assumed to be in mass units e.g. ug/m3 for all
#' species except CO (mg/m3).
#'
#' The following statistics are calculated:
#'
#' \itemize{ \item \bold{data.capture} --- percentage data capture over a full
#' year.
#'
#' \item \bold{mean} --- annual mean.
#'
#' \item \bold{minimum} --- minimum hourly value.
#'
#' \item \bold{maximum} --- maximum hourly value.
#'
#' \item \bold{median} --- median value.
#'
#' \item \bold{max.daily} --- maximum daily mean.
#'
#' \item \bold{max.rolling.8} --- maximum 8-hour rolling mean.
#'
#' \item \bold{max.rolling.24} --- maximum 24-hour rolling mean.
#'
#' \item \bold{percentile.95} --- 95th percentile. Note that several percentiles
#' can be calculated.
#'
#' \item \bold{roll.8.O3.gt.100} --- number of days when the daily maximum
#' rolling 8-hour mean ozone concentration is >100 ug/m3. This is the target
#' value.
#'
#' \item \bold{roll.8.O3.gt.120} --- number of days when the daily maximum
#' rolling 8-hour mean ozone concentration is >120 ug/m3. This is the Limit
#' Value not to be exceeded > 10 days a year.
#'
#' \item \bold{AOT40} --- is the accumulated amount of ozone over the threshold
#' value of 40 ppb for daylight hours in the growing season (April to
#' September). Note that \code{latitude} and \code{longitude} can also be passed
#' to this calculation.
#'
#' \item \bold{hours.gt.200} --- number of hours NO2 is more than 200 ug/m3.
#'
#' \item \bold{days.gt.50} --- number of days PM10 is more than 50 ug/m3. }
#'
#' For the rolling means, the user can supply the option \code{align}, which can
#' be "centre" (default), "left" or "right". See \code{rollingMean} for more
#' details.
#'
#' There can be small discrepancies with the AURN due to the treatment of
#' rounding data. The \code{aqStats} function does not round, whereas AURN data
#' can be rounded at several stages during the calculations.
#'
#' @param mydata A data frame containing a \code{date} field of hourly data.
#' @param pollutant The name of a pollutant e.g. \code{pollutant = c("o3",
#'   "pm10")}.
#' @param type \code{type} allows [timeAverage()] to be applied to cases where
#'   there are groups of data that need to be split and the function applied to
#'   each group. The most common example is data with multiple sites identified
#'   with a column representing site name e.g. \code{type = "site"}. More
#'   generally, \code{type} should be used where the date repeats for a
#'   particular grouping variable.
#' @param data.thresh The data capture threshold in %. No values are calculated
#'   if data capture over the period of interest is less than this value.
#'   \code{data.thresh} is used for example in the calculation of daily mean
#'   values from hourly data. If there are less than \code{data.thresh}
#'   percentage of measurements available in a period, \code{NA} is returned.
#' @param percentile Percentile values to calculate for each pollutant.
#' @param transpose The default is to return a data frame with columns
#'   representing the statistics. If \code{transpose = TRUE} then the results
#'   have columns for each pollutant-site combination.
#' @param ... Other arguments, currently unused.
#' @export
#' @author David Carslaw
#' @examples
#'
#' ## Statistics for 2004. NOTE! these data are in ppb/ppm so the
#' ## example is for illustrative purposes only
#' aqStats(selectByDate(mydata, year = 2004), pollutant = "no2")
#'
#'
aqStats <- function(mydata,
                    pollutant = "no2",
                    type = "default",
                    data.thresh = 0,
                    percentile = c(95, 99),
                    transpose = FALSE,
                    ...) {
  ## get rid of R check annoyances
  year <- rolling8value <- NULL
  daylight <- NULL
  . <- NULL
  
  # variables we need
  vars <- c("date", pollutant, type)
  
  # cut data by type
  mydata <- cutData(mydata, type)
  
  # check we have teh variables
  mydata <- checkPrep(mydata,
                      vars,
                      "default",
                      remove.calm = FALSE,
                      strip.white = FALSE)
  
  
  # reorganise data
  mydata <-
    gather(mydata, key = pollutant, value = value, pollutant) %>%
    mutate(year = year(date))
  
  
  vars <- c(type, "pollutant", "year")
  
  # calculate the statistics
  results <- mydata %>%
    group_by(across(vars)) %>%
    do(calcStats(., data.thresh = data.thresh,
                 percentile = percentile, ...))
  
  ## transpose if requested
  if (transpose) {
    results <- gather(results,
                      key = variable,
                      value = value,-c(type, pollutant, year, date))
    
    if (type != "default") {
      results <- unite(results, site_pol, type, pollutant)
      
    } else {
      results <- unite(results, site_pol, pollutant)
      
    }
    
    results <- spread(results, site_pol, value)
    
    ## sort out names
    names(results) <- gsub("\\_", " ", names(results))
  }
  
  return(results)
  
}

# function to calculate statistics
calcStats <- function(mydata, data.thresh, percentile, ...) {
  rolling8value <- NULL # keep R check happy
  
  # check to see if dates duplicate
  if (length(unique(mydata$date)) != length(mydata$date))
    warning("Duplicate dates detected, more than one site? Use type = 'site'",
            call. = FALSE)
  
  ## pre-defined list of pollutants that need special treatment
  thePolls <- c("no2", "o3", "pm10", "co")
  
  ## fill any missing hours
  start.date <- floor_date(min(mydata$date), "year")
  end.date <- ceiling_date(max(mydata$date), "year") - 3600
  
  ## find time interval of data and pad any missing times
  interval <- find.time.interval(mydata$date)
  all.dates <-
    data.frame(date = seq(start.date, end.date, by = interval))
  
  # pad out names where needed
  if (nrow(mydata) != nrow(all.dates)) {
    mydata <- full_join(mydata, all.dates, by = "date")
    mydata[setdiff(names(mydata), c("date", "value"))] <-
      mydata[1, setdiff(names(mydata), c("date", "value"))]
  }
  
  
  # statistics
  
  Mean <- timeAverage(
    mydata,
    avg.time = "year",
    statistic = "mean",
    data.thresh,
    print.int = FALSE
  ) %>%
    rename(mean = value)
  
  Min <- timeAverage(
    mydata,
    avg.time = "year",
    statistic = "min",
    data.thresh,
    print.int = FALSE
  ) %>%
    rename(min = value)
  
  Max <- timeAverage(
    mydata,
    avg.time = "year",
    statistic = "max",
    data.thresh,
    print.int = FALSE
  ) %>%
    rename(max = value)
  
  maxDaily <- timeAverage(
    mydata,
    avg.time = "day",
    statistic = "mean",
    data.thresh,
    print.int = FALSE
  ) %>%
    timeAverage(
      avg.time = "year",
      statistic = "max",
      data.thresh,
      print.int = FALSE
    ) %>%
    rename(max_daily = value)
  
  Median <- timeAverage(mydata,
                        avg.time = "year",
                        statistic = "median",
                        data.thresh) %>%
    rename(median = value)
  
  dataCapture <- group_by(mydata, year) %>%
    summarise(date = min(date),
              dat.cap = 100 * length(na.omit(value)) /
                length(value))
  
  
  rollMax8 <- group_by(mydata, year) %>%
    do(
      rollingMean(
        .,
        pollutant = "value",
        data.thresh = data.thresh,
        width = 8,
        new.name = "value",
        ...
      )
    ) %>%
    do(timeAverage(.,
                   avg.time = "year", statistic = "max",
                   data.thresh)) %>%
    rename(roll_8_max = value)
  
  rollMax24 <- group_by(mydata, year) %>%
    do(
      rollingMean(
        .,
        pollutant = "value",
        data.thresh = data.thresh,
        width = 24,
        new.name = "value",
        ...
      )
    ) %>%
    do(timeAverage(.,
                   avg.time = "year", statistic = "max",
                   data.thresh)) %>%
    rename(roll_24_max = value)
  
  Percentile <- group_by(mydata, year) %>%
    do(
      calcPercentile(
        .,
        avg.time = "year",
        pollutant = "value",
        data.thresh = data.thresh,
        percentile = percentile
      )
    )
  
  
  vars <- c("year", "date")
  
  # specific treatment of pollutants
  
  if (length(grep("o3", mydata$pollutant[1], ignore.case = TRUE)) == 1) {
    rollingO3 <- group_by(mydata, year) %>%
      do(rollingMean(., "value", data.thresh = data.thresh, ...)) %>%
      do(timeAverage(
        .,
        avg.time = "day",
        statistic = "max",
        data.thresh = data.thresh
      )) %>%
      summarise(roll.8.O3.gt.100 = length(which(rolling8value > 100)))
    
    
    rollingO3$date <- Mean$date
    
    rollingO3b <- group_by(mydata, year) %>%
      do(rollingMean(., "value", data.thresh = data.thresh, ...)) %>%
      do(timeAverage(
        .,
        avg.time = "day",
        statistic = "max",
        data.thresh = data.thresh
      )) %>%
      summarise(roll.8.O3.gt.120 = length(which(rolling8value > 120)))
    
    rollingO3b$date <- Mean$date
    
    AOT40 <- group_by(mydata, year) %>%
      do(AOT40(., "value"))
    
    AOT40$date <- Mean$date
    
    o3.results <- list(
      dataCapture,
      Mean,
      Min,
      Max,
      Median,
      maxDaily,
      rollMax8,
      rollMax24,
      Percentile,
      rollingO3,
      rollingO3b,
      AOT40
    )
    o3.results <- Reduce(function(x, y)
      merge(x, y,
            by = vars,
            all = TRUE), o3.results)
    
    results <- o3.results
    results
  }
  
  if (length(grep("no2", mydata$pollutant[1], ignore.case = TRUE)) == 1) {
    hours <- group_by(mydata, year) %>%
      summarise(hours = length(which(value > 200)))
    
    hours$date <- Mean$date
    
    no2.results <- list(
      dataCapture,
      Mean,
      Min,
      Max,
      Median,
      maxDaily,
      rollMax8,
      rollMax24,
      Percentile,
      hours
    )
    
    no2.results <- Reduce(function(x, y)
      merge(x, y,
            by = vars,
            all = TRUE), no2.results)
    
    results <- no2.results
    results
  }
  
  if (length(grep("pm10", mydata$pollutant[1], ignore.case = TRUE)) == 1) {
    days <- group_by(mydata, year) %>%
      do(timeAverage(.,
                     avg.time = "day", statistic = "mean", data.thresh)) %>%
      summarise(days = length(which(value > 50)))
    
    
    days$date <- Mean$date
    
    pm10.results <- list(
      dataCapture,
      Mean,
      Min,
      Max,
      Median,
      maxDaily,
      rollMax8,
      rollMax24,
      Percentile,
      days
    )
    
    pm10.results <- Reduce(function(x, y)
      merge(x, y,
            by = vars,
            all = TRUE), pm10.results)
    
    results <- pm10.results
    results
  }
  
  if (length(grep("co", mydata$pollutant[1], ignore.case = TRUE)) == 1) {
    co.results <- list(dataCapture,
                       Mean,
                       Min,
                       Max,
                       Median,
                       maxDaily,
                       rollMax8,
                       rollMax24,
                       Percentile)
    co.results <- Reduce(function(x, y)
      merge(x, y,
            by = vars,
            all = TRUE), co.results)
    results <- co.results
    results
  }
  
  
  ## see if pollutant string in any pre-defined ones
  ## if not calculate basic stats
  if (all(is.na(sapply(thePolls, function(x)
    grep(x, mydata$pollutant[1], ignore.case = TRUE)) > 0))) {
    results <- list(dataCapture,
                    Mean,
                    Min,
                    Max,
                    Median,
                    maxDaily,
                    rollMax8,
                    rollMax24,
                    Percentile)
    
    results <- Reduce(function(x, y)
      merge(x, y,
            by = vars,
            all = TRUE), results)
    
    
    results <- results
  }
  
  
  results
}

AOT40 <- function(mydata, pollutant, ...) {
  ## note the assumption is the O3 is in ug/m3
  daylight <- NULL
  
  ## need daylight hours in growing season (April to September)
  mydata <- selectByDate(mydata, month = 4:9)
  mydata <- cutData(mydata, "daylight", ...)
  mydata <- subset(mydata, daylight == "daylight")
  AOT40 <-
    ifelse(mydata[[pollutant]] - 80 < 0, 0, mydata[[pollutant]] - 80)
  AOT40 <- sum(AOT40, na.rm = TRUE) * 0.50 ## for ppb
  AOT40 <- tibble(AOT40)
  return(AOT40)
}