rOstluft.plot: Ploting air quality data

Documented in summary_wind_2d

#' Summarise y values over binned wind data, split into u and v components.
#'
#' Input data should be original unbinned data including wind direction and wind velocity;
#' binning is done 2-dimensional over cartesian u and v wind vectors
#'
#' @param data a data.frame or tibble containing the data (wide format).
#' requires input data including at least three columns carrying information regarding:
#' * wind direction (in °)
#' * wind velocity
#' * z-values (e.g. air pollutant concentration)
#' @param ws symbol giving the wind velocity parameter name (wind velocity preferably in m/s)
#' @param wd symbol giving the wind direction parameter name  in degrees
#' @param z symbol giving the parameter name to be summarised
#' @param groupings additional groupings. Use helper [grp()] to create
#' @param fun function or list of functions for summary.
#' @param fun.args a list of extra arguments to pass to fun.
#' @param nmin numeric, minimum number of values for fun, if n < nmin: NA is returned
#' @param ws_max numeric or Inf, maximum wind velocity for binning: above ws_max, z is set NA
#' @param bins numeric, number of bins over the range of values if `!groups %in% c("u", "v")`
#' @param smooth TRUE/FALSE, applies if groups = c("u", "v"); should smoothing of summary results should be performed
#' using [fit_gam_surface()]?
#' @param k numeric, applies if smooth = TRUE; degree of smoothing in smooth term in [fit_gam_surface()]
#' @param extrapolate TRUE/FALSE, applies if smooth = TRUE; [fit_gam_surface()] returns extrapolated (predicted) values for u, v coordinates that otherwise would have have NA for summarised z
#' if extrapolate = TRUE, those values are returned (to a certain degree depending on the value of dist)
#' @param dist numeric, fraction of 1, applies if smooth = TRUE and extrapolate = TRUE; maximum distance to next coordinate-pair at which the result of
#' fit_gam_surface(z) should be returned
#'
#' @return a tibble with summarised data along u and v wind vectors
#'
#' @section Computed variables:
#'
#' * a tibble is returned, binned over u and v, with variables:
#' - wd: wind direction corresponding to midpoint value of u and v
#' - ws: wind velocity corresponding to midpoint value of u and v
#' - u: midpoints of bins over u (from input wd and ws)
#' - v: midpoints of bins over v (from input wd and ws)
#' - z: result from fun(z, ...)
#'
#' @export
#'
#' @examples
#' library(ggplot2)
#' fn <- rOstluft.data::f("Zch_Stampfenbachstrasse_2010-2014.csv")
#' data <- rOstluft::read_airmo_csv(fn)
#' data <- rOstluft::rolf_to_openair(data)
#'
#' # summary NO2
#' summary_wind_2d(data, ws, wd, NOx, smooth = FALSE)
#'
#' # multiple stats: Pass function, by name, reference, as function or one sided formula
#' funs <- list(
#'   "mean",
#'   "median" = function(x) median(x, na.rm = TRUE),
#'   "q95" = ~ stats::quantile(., probs = 0.95)
#' )
#'
#' summary_wind_2d(data, ws, wd, NOx, fun = funs, smooth = FALSE)
#'
#' # is for some reason fun.args used with multiple functions, use ... to catch
#' # superfluous arguments:
#' funs <- list(
#'   "q95" = function(x, ...) stats::quantile(x, probs = 0.95),
#'   "mean"
#' )
#'
#' summary_wind_2d(data, ws, wd, NOx, fun = funs, fun.args = list(na.rm = TRUE),
#'                 smooth = FALSE)
#'
#' # additional groupings
#' summary_wind_2d(data, ws, wd, NOx, groupings = grp(site), smooth = FALSE)
#'
#' # we can use expressions in grp For better readability groupings is
#' # defined outside of the function call
#' groupings = grp("site", year = lubridate::year(date))
#'
#' summary_wind_2d(data, ws, wd, NOx, groupings = groupings, smooth = FALSE)
#'
#' # smoothing
#' df1 <- summary_wind_2d(data, ws, wd, NOx, bins = 100^2, smooth = FALSE)
#' df2 <- summary_wind_2d(data, ws, wd, NOx, bins = 100^2, extrapolate = FALSE)
#' df3 <- summary_wind_2d(data, ws, wd, NOx, bins = 100^2, smooth = TRUE)
#'
#' df <- dplyr::bind_rows(
#'   "smooth = F" = df1,
#'   "smooth = T, extrapolate = F" = df2,
#'   "smooth = T, extrapolate = T" = df3,
#'   .id = "smoothing"
#' )
#'
#' ggplot(df, aes(x = u, y = v, fill = NOx)) +
#'   coord_fixed(expand = FALSE) +
#'   lims(x = c(-7.5, 7.5), y = c(-7.5, 7.5)) +
#'   geom_raster() +
#'   scale_fill_viridis_c(na.value = NA) +
#'   facet_wrap(vars(smoothing), ncol = 2)
#'
#' # for a small number of bins reduce k
#' summary_wind_2d(data, ws, wd, NO2, bins = 5^2, smooth = TRUE, k = 5)
summary_wind_2d <- function(data, ws, wd, z, groupings = grp(), fun = "mean", fun.args = list(), nmin = 3, ws_max = NA,
                            bins = 10^2, smooth = TRUE, k = 100, extrapolate = TRUE, dist = 0.1) {

  ws <- rlang::ensym(ws)
  wd <- rlang::ensym(wd)
  z <- rlang::ensym(z)

  # rename z if needed. we can't apply summarize functions on grouping columns!
  # for ws and wd we do auto renaming.
  if (ws == z) {
    z <- rlang::sym(stringr::str_c(rlang::as_string(ws), ".stat"))
    data <- dplyr::mutate(data, !!z := !!ws)
  }

  if (wd == z) {
    z <- rlang::sym(stringr::str_c(rlang::as_string(wd), ".stat"))
    data <- dplyr::mutate(data, !!z := !!wd)
  }

  fun <- auto_name(c(fun, "n" = function(x, ...) {sum(!is.na(x))}))

  data <- dplyr::mutate(data,
      u = !!ws * sin(pi * !!wd / 180),
      v = !!ws * cos(pi * !!wd / 180)
  )

  uv_max <- ifelse(is.na(ws_max), max(abs(c(data$u, data$v)), na.rm = TRUE), ws_max)
  nbins <- round(sqrt(bins), 0)
  binwidth <- 2 * uv_max / nbins
  uv_breaks <- seq(-uv_max, uv_max, by = binwidth)

  data <- dplyr::mutate(data,
    u = cut(.data$u, breaks = uv_breaks, include.lowest = TRUE),
    v = cut(.data$v, breaks = uv_breaks, include.lowest = TRUE)
  )

  # filter NAs, ith stats::na.omit() every row containing a NA value will be removed
  data <- dplyr::filter(data, !(is.na(.data$u) | is.na(.data$v) | is.na(!!z)))

   # apply the summarize function regarding the addiotional grouping columns
  data <- dplyr::group_by(data, .data$u, .data$v, !!!groupings)

  data <- dplyr::summarise_at(data,
    .vars = dplyr::vars(!!z),
    .funs = fun,
    !!!fun.args
  )
  data <- dplyr::ungroup(data)

  # calculate freqency per groupings
  data <- dplyr::group_by(data, !!!rlang::syms(names(groupings)))
  data <- dplyr::mutate(data, freq = .data$n / sum(.data$n, na.rm = TRUE))
  data <- dplyr::ungroup(data)

  data <- dplyr::filter(data, .data$n >= nmin)
  data <- tidyr::gather(data, "stat", !!z, -"u", -"v", -"n", -"freq",
                        -dplyr::one_of(names(groupings)))

  # ensure that for every combination of u, v, stat and groupings a value is present
  # predict in fit_gam_surface needs a complete grid
  # geom_raster doesn't like missing raster points
  cols <- c("u", "v", "stat", names(groupings))
  data <- tidyr::complete(data, !!!rlang::syms(cols), fill = list(n = 0, freq = 0))

  # convert the uv factors to the numeric mid point
  data <- dplyr::mutate(data,
    u = -uv_max + binwidth * as.numeric(.data$u) - binwidth / 2,
    v = -uv_max + binwidth * as.numeric(.data$v) - binwidth / 2
  )

  if (isTRUE(smooth)) {
    data <- dplyr::group_by(data, .data$stat, !!!rlang::syms(names(groupings)))
    data <- dplyr::group_modify(data,
      ~ fit_gam_surface(.x, x = "u", y = "v", z = z, weights = pmin(3, .x$n) / 3,
                        k = k, extrapolate = extrapolate, dist = dist)
    )
    data <- dplyr::ungroup(data)
  }

  # calculate wd and ws for the midpoints of uv
  data <- dplyr::mutate(data,
    !!wd := uv2wd(.data$u, .data$v),
    !!ws := sqrt(.data$u^2 + .data$v^2)
  )

  # set z for ws > ws_max to NA
  if (!is.na(ws_max)) {
    data <- dplyr::mutate(data, !!z := dplyr::if_else(!!ws > ws_max + binwidth / 2, NA_real_, !!z))
  }


  return(data)
}