R/smoothTrend.R

Defines functions fit_smoothtrend_gam deseason_smoothtrend_data prepare_smoothtrend_data smoothTrend

Documented in smoothTrend

#' Calculate nonparametric smooth trends
#'
#' Use non-parametric methods to calculate time series trends
#'
#' The [smoothTrend()] function provides a flexible way of estimating the trend
#' in the concentration of a pollutant or other variable. Monthly mean values
#' are calculated from an hourly (or higher resolution) or daily time series.
#' There is the option to deseasonalise the data if there is evidence of a
#' seasonal cycle.
#'
#' [smoothTrend()] uses a Generalized Additive Model (GAM) from the
#' [mgcv::gam()] package to find the most appropriate level of smoothing. The
#' function is particularly suited to situations where trends are not monotonic
#' (see discussion with [TheilSen()] for more details on this). The
#' [smoothTrend()] function is particularly useful as an exploratory technique
#' e.g. to check how linear or non-linear trends are.
#'
#' 95% confidence intervals are shown by shading. Bootstrap estimates of the
#' confidence intervals are also available through the `simulate` option.
#' Residual resampling is used.
#'
#' Trends can be considered in a very wide range of ways, controlled by setting
#' `type` - see examples below.
#'
#' @inheritParams timePlot
#' @inheritParams timeAverage
#'
#' @param deseason Should the data be de-deasonalized first? If `TRUE` the
#'   function `stl` is used (seasonal trend decomposition using loess). Note
#'   that if `TRUE` missing data are first imputed using a linear regression by month because `stl` cannot handle missing data. In this case the plot shows where the missing data have been imputed as a grey filled circle.
#'
#' @param statistic Statistic used for calculating monthly values. Default is
#'   `"mean"`, but can also be `"percentile"`. See [timeAverage()] for more
#'   details.
#'
#' @param percentile Percentile value(s) to use if `statistic = "percentile"` is
#'   chosen. Can be a vector of numbers e.g. `percentile = c(5, 50, 95)` will
#'   plot the 5th, 50th and 95th percentile values together on the same plot.
#'
#' @param simulate Should simulations be carried out to determine the
#'   Mann-Kendall tau and p-value. The default is `FALSE`. If `TRUE`, bootstrap
#'   simulations are undertaken, which also account for autocorrelation.
#'
#' @param n Number of bootstrap simulations if `simulate = TRUE`.
#'
#' @param autocor Should autocorrelation be considered in the trend uncertainty
#'   estimates? The default is `FALSE`. Generally, accounting for
#'   autocorrelation increases the uncertainty of the trend estimate sometimes
#'   by a large amount.
#'
#' @param ci Should confidence intervals be plotted? The default is `TRUE`.
#'
#' @param k This is the smoothing parameter used by the [mgcv::gam()] function
#'   in package `mgcv`. By default it is not used and the amount of smoothing is
#'   optimised automatically. However, sometimes it is useful to set the
#'   smoothing amount manually using `k`.
#'
#' @export
#' @return an [openair][openair-package] object
#' @author David Carslaw
#' @family time series and trend functions
#' @examples
#' # trend plot for nox
#' smoothTrend(mydata, pollutant = "nox")
#'
#' # trend plot by each of 8 wind sectors
#' \dontrun{
#' smoothTrend(mydata, pollutant = "o3", type = "wd", ylab = "o3 (ppb)")
#'
#' # several pollutants, no plotting symbol
#' smoothTrend(mydata, pollutant = c("no2", "o3", "pm10", "pm25"), shape = NA)
#'
#' # percentiles
#' smoothTrend(mydata,
#'   pollutant = "o3", statistic = "percentile",
#'   percentile = 95
#' )
#'
#' # several percentiles with control over lines used
#' smoothTrend(mydata,
#'   pollutant = "o3", statistic = "percentile",
#'   percentile = c(5, 50, 95), linewidth = c(1, 2, 1), linetype = c(5, 1, 5)
#' )
#' }
smoothTrend <- function(
  mydata,
  pollutant = "nox",
  avg.time = "month",
  data.thresh = 0,
  statistic = "mean",
  percentile = NA,
  k = NULL,
  deseason = FALSE,
  simulate = FALSE,
  n = 200,
  autocor = FALSE,
  type = "default",
  cols = "brewer1",
  ref.x = NULL,
  ref.y = NULL,
  key.columns = 1,
  key.position = "bottom",
  name.pol = NULL,
  date.breaks = 7,
  date.format = NULL,
  auto.text = TRUE,
  ci = TRUE,
  progress = TRUE,
  plot = TRUE,
  key = NULL,
  ...
) {
  # check key.position
  key.position <- check_key_position(key.position, key)

  # extra.args setup
  extra.args <- capture_dots(...)

  # validation checks
  if (length(pollutant) > 1 && length(percentile) > 1) {
    cli::cli_warn(
      "You cannot choose multiple {.field percentiles} and {.field pollutants}; using {.field percentile} = {percentile[1]}."
    )
    percentile <- percentile[1]
  }

  # set stat to percentile if user provides some
  if (!missing(percentile) && statistic != "percentile") {
    cli::cli_inform(
      "{.field percentiles} provided ({percentile}); setting {.field statistic} to 'percentile'."
    )
    statistic <- "percentile"
  }

  # Style controls w/ defaults
  extra.args$linetype <- extra.args$linetype %||% 1L
  extra.args$linewidth <- extra.args$linewidth %||% 1L
  extra.args$shape <- extra.args$shape %||% 1L
  extra.args$size <- extra.args$size %||% 5L

  # label controls
  extra.args$title <- quickText(extra.args$title %||% "", auto.text)
  extra.args$subtitle <- quickText(extra.args$subtitle %||% "", auto.text)
  extra.args$caption <- quickText(extra.args$caption %||% "", auto.text)
  extra.args$tag <- quickText(extra.args$tag, auto.text)
  extra.args$ylab <- quickText(
    extra.args$ylab %||% paste(pollutant, collapse = ", "),
    auto.text
  )
  extra.args$xlab <- quickText(extra.args$xlab %||% NULL, auto.text)

  # find time interval
  # need this because if user has a data capture threshold, need to know
  # original time interval. better to do this before conditioning
  interval <- find_time_interval(mydata$date)

  # equivalent number of days, used to refine interval for month/year
  days <- as.numeric(strsplit(interval, split = " ")[[1]][1]) / 24 / 3600

  # better interval, most common interval in a year
  interval <-
    dplyr::recode_values(
      days,
      31 ~ "month",
      c(365, 366) ~ "year",
      default = interval
    )

  # prepare data
  mydata <-
    prepare_smoothtrend_data(
      mydata,
      pollutant = pollutant,
      type = type,
      avg.time = avg.time,
      statistic = statistic,
      percentile = percentile,
      data.thresh = data.thresh,
      interval = interval,
      progress = progress,
      ...
    )

  # set new variables
  vars <- c(type, "variable")

  # prep data for modelling
  newdata <-
    map_type(
      mydata,
      type = vars,
      fun = \(df) {
        deseason_smoothtrend_data(
          df,
          deseason = deseason,
          pollutant = pollutant
        )
      },
      .include_default = TRUE
    )

  # fit a smooth model
  fit <-
    map_type(
      newdata,
      type = vars,
      fun = \(df) {
        fit_smoothtrend_gam(
          df,
          x = "date",
          y = "conc",
          k = k,
          ...,
          simulate = simulate,
          n.sim = n,
          autocor = autocor
        )
      },
      .include_default = TRUE
    )

  # set fit date to be POSIXct for plotting
  class(fit$date) <- c("POSIXct", "POSIXt")

  # create a guide for both fill and colour using openair styling
  color_guide <-
    ggplot2::guide_legend(
      theme = ggplot2::theme(
        legend.title.position = ifelse(
          key.position %in% c("left", "right"),
          "top",
          key.position
        ),
        legend.text.position = ifelse(
          key.position %in% c("top", "bottom"),
          "right",
          key.position
        )
      ),
      ncol = if (key.position %in% c("left", "right")) {
        key.columns
      } else {
        if (missing(key.columns)) {
          dplyr::n_distinct(newdata$variable)
        } else {
          key.columns
        }
      }
    )

  # create plot
  thePlot <-
    ggplot2::ggplot(mapping = ggplot2::aes(x = .data$date)) +
    ggplot2::geom_line(
      data = newdata,
      ggplot2::aes(
        color = .data$variable,
        linetype = .data$variable,
        linewidth = ggplot2::stage(
          .data$variable,
          after_scale = .data$linewidth * 1 / 2
        ),
        y = .data$conc
      ),
      show.legend = FALSE,
      lineend = extra.args$lineend %||% "butt",
      linejoin = extra.args$linejoin %||% "round",
      linemitre = extra.args$linemitre %||% 10
    ) +
    ggplot2::geom_point(
      data = newdata,
      ggplot2::aes(
        color = .data$variable,
        shape = .data$variable,
        size = .data$variable,
        y = .data$conc
      ),
      show.legend = dplyr::n_distinct(levels(newdata$variable)) > 1
    ) +
    ggplot2::geom_point(
      data = dplyr::filter(newdata, .data$imputed),
      ggplot2::aes(y = .data$conc),
      size = extra.args$cex * 3,
      shape = 21,
      fill = "grey50",
      colour = "grey20",
      show.legend = FALSE
    ) +
    ggplot2::geom_ribbon(
      data = fit,
      ggplot2::aes(
        fill = .data$variable,
        ymax = .data$upper,
        ymin = .data$lower
      ),
      alpha = ifelse(ci, extra.args$alpha %||% 1 / 3, 0),
      show.legend = FALSE
    ) +
    ggplot2::geom_line(
      data = fit,
      ggplot2::aes(
        color = .data$variable,
        y = .data$pred,
        linewidth = .data$variable,
        linetype = .data$variable
      ),
      show.legend = FALSE,
      lineend = extra.args$lineend %||% "butt",
      linejoin = extra.args$linejoin %||% "round",
      linemitre = extra.args$linemitre %||% 10
    ) +
    layer_ref(ref = ref.x, "x", "datetime", tz = lubridate::tz(newdata$date)) +
    layer_ref(ref = ref.y, "y", "numeric") +
    theme_openair(key.position, extra.args = extra.args) +
    ggplot2::scale_fill_manual(
      values = resolve_colour_opts(
        cols,
        dplyr::n_distinct(levels(newdata$variable))
      ),
      breaks = levels(newdata$variable),
      labels = lapply(name.pol %||% levels(newdata$variable), \(x) {
        label_openair(x, auto_text = auto.text)
      }),
      drop = FALSE,
      aesthetics = c("colour", "fill")
    ) +
    ggplot2::scale_shape_manual(
      values = recycle_to_length(
        extra.args$shape,
        dplyr::n_distinct(levels(newdata$variable))
      ),
      breaks = levels(newdata$variable),
      labels = lapply(name.pol %||% levels(newdata$variable), \(x) {
        label_openair(x, auto_text = auto.text)
      }),
      drop = FALSE
    ) +
    ggplot2::scale_size_manual(
      values = recycle_to_length(
        extra.args$size,
        dplyr::n_distinct(levels(newdata$variable))
      ),
      breaks = levels(newdata$variable),
      labels = lapply(name.pol %||% levels(newdata$variable), \(x) {
        label_openair(x, auto_text = auto.text)
      }),
      drop = FALSE
    ) +
    ggplot2::scale_linewidth_manual(
      values = recycle_to_length(
        extra.args$linewidth,
        dplyr::n_distinct(levels(newdata$variable))
      ),
      breaks = levels(newdata$variable),
      labels = lapply(name.pol %||% levels(newdata$variable), \(x) {
        label_openair(x, auto_text = auto.text)
      }),
      drop = FALSE
    ) +
    ggplot2::scale_linetype_manual(
      values = recycle_to_length(
        extra.args$linetype,
        dplyr::n_distinct(levels(newdata$variable))
      ),
      breaks = levels(newdata$variable),
      labels = lapply(name.pol %||% levels(newdata$variable), \(x) {
        label_openair(x, auto_text = auto.text)
      }),
      drop = FALSE
    ) +
    ggplot2::scale_x_datetime(
      breaks = scales::breaks_pretty(date.breaks),
      date_labels = date.format %||% ggplot2::waiver(),
      limits = extra.args$xlim,
      expand = ggplot2::expansion(c(0.02, 0.02))
    ) +
    ggplot2::scale_y_continuous(
      limits = extra.args$ylim
    ) +
    get_facet(
      type = type,
      extra.args = extra.args,
      auto.text = auto.text,
      drop = TRUE,
      wd.res = extra.args$wd.res %||% 8
    ) +
    ggplot2::labs(
      y = extra.args$ylab,
      x = extra.args$xlab,
      title = extra.args$title,
      subtitle = extra.args$subtitle,
      caption = extra.args$caption,
      tag = extra.args$tag,
      color = NULL,
      fill = NULL,
      shape = NULL,
      linetype = NULL,
      linewidth = NULL,
      size = NULL
    ) +
    ggplot2::guides(
      fill = color_guide,
      color = color_guide,
      linetype = color_guide,
      linewidth = color_guide,
      shape = color_guide,
      size = color_guide
    )

  # plot
  if (plot) {
    plot(thePlot)
  }

  # return
  output <- list(
    plot = thePlot,
    data = list(data = newdata, fit = fit),
    call = match.call()
  )
  class(output) <- "openair"
  invisible(output)
}

#' Prepares smoothtrend data - check, cut, timeavg, reshape
#' @noRd
prepare_smoothtrend_data <- function(
  mydata,
  pollutant,
  type,
  avg.time,
  statistic,
  percentile,
  data.thresh,
  interval,
  progress,
  ...
) {
  # default checks
  vars <- c("date", pollutant)
  mydata <- checkPrep(mydata, vars, type, remove.calm = FALSE)

  # cutData depending on type
  mydata <- cutData(mydata, type, ...)

  # reshape data, make sure there is not a variable called 'variable'
  if ("variable" %in% names(mydata)) {
    mydata <- dplyr::rename(mydata, c(variable = "theVar"))
    type <- "theVar"
  }

  # in the case of multiple percentiles, these are assinged and treated
  # like multiple pollutants
  mydata <- tidyr::pivot_longer(
    mydata,
    cols = dplyr::all_of(pollutant),
    names_to = "variable",
    values_to = "value",
    names_transform = list(
      variable = factor
    )
  )

  if (length(percentile) > 1) {
    vars <- c(type, "variable")

    prefix <- paste0(pollutant, " percentile ")

    mydata <- calcPercentile(
      mydata,
      type = vars,
      pollutant = "value",
      avg.time = avg.time,
      percentile = percentile,
      data.thresh = data.thresh,
      prefix = prefix
    )

    vars <- paste0(prefix, percentile)

    mydata <-
      tidyr::pivot_longer(
        dplyr::select(mydata, -"variable"),
        cols = dplyr::all_of(vars),
        names_to = "variable",
        values_to = "value"
      ) |>
      dplyr::arrange(.data$variable) |>
      dplyr::mutate(variable = factor(.data$variable, levels = vars))
  } else {
    mydata <- suppressWarnings(timeAverage(
      mydata,
      type = c(type, "variable"),
      avg.time = avg.time,
      percentile = percentile,
      statistic = statistic,
      data.thresh = data.thresh,
      interval = interval,
      progress = progress
    ))
  }

  # timeAverage drops type if default
  if (length(type) == 1 && type[1] == "default") {
    mydata$default <- "default"
  }

  # return data
  mydata
}

#' Apply deaseason, if requested
#' @noRd
deseason_smoothtrend_data <- function(mydata, deseason, pollutant) {
  # return if nothing to analyse
  if (all(is.na(mydata$value))) {
    return(data.frame(date = NA, conc = NA))
  }

  # sometimes data have long trailing NAs, so start and end at
  # first and last data
  min.idx <- min(which(!is.na(mydata[, "value"])))
  max.idx <- max(which(!is.na(mydata[, "value"])))
  mydata <- mydata[min.idx:max.idx, ]

  # these subsets may have different dates to overall
  start.year <- get_first_year(mydata$date)
  end.year <- get_last_year(mydata$date)
  start.month <- get_first_month(mydata$date)
  end.month <- get_last_month(mydata$date)

  # can't deseason less than 2 years of data
  if (nrow(mydata) <= 24) {
    deseason <- FALSE
  }

  if (deseason) {
    myts <- stats::ts(
      mydata[["value"]],
      start = c(start.year, start.month),
      end = c(end.year, end.month),
      frequency = 12
    )

    was_na <- is.na(myts)
    if (anyNA(myts)) {
      myts <- fill_ts_gaps(myts, pollutant)
    }

    ssd <- stats::stl(myts, s.window = 11, robust = TRUE, s.degree = 1)

    deseas <- ssd$time.series[, "trend"] + ssd$time.series[, "remainder"]
    deseas <- as.vector(deseas)

    results <- data.frame(
      date = mydata$date,
      conc = as.vector(deseas),
      imputed = was_na,
      stringsAsFactors = FALSE
    )
  } else {
    results <- data.frame(
      date = mydata$date,
      conc = mydata[["value"]],
      imputed = FALSE,
      stringsAsFactors = FALSE
    )
  }

  results
}

#' Fit a GAM for smoothTrend
#' @noRd
fit_smoothtrend_gam <- function(
  thedata,
  x = "date",
  y = "conc",
  k = k,
  ...,
  simulate = FALSE,
  n.sim = 200,
  autocor = FALSE,
  se = TRUE,
  level = 0.95,
  n = 100
) {
  ## panel function to add a smooth line to a plot
  ## Uses a GAM (mgcv) to fit smooth
  ## Optionally can plot 95% confidence intervals and run bootstrap simulations
  ## to estimate uncertainties. Simple block bootstrap is also available for correlated data

  data.orig <- thedata ## return this if all else fails

  id <- which(names(thedata) == x)
  names(thedata)[id] <- "x"
  id <- which(names(thedata) == y)
  names(thedata)[id] <- "y"

  # can only fit numeric, so convert back after fitting
  class_x <- class(thedata$x)

  thedata$x <- as.numeric(thedata$x)

  tryCatch(
    {
      if (!simulate) {
        if (is.null(k)) {
          mod <- suppressWarnings(mgcv::gam(
            y ~ s(x),
            select = TRUE,
            data = thedata,
            ...
          ))
        } else {
          mod <- suppressWarnings(mgcv::gam(
            y ~ s(x, k = k),
            select = TRUE,
            data = thedata,
            ...
          ))
        }

        xseq <- seq(
          min(thedata$x, na.rm = TRUE),
          max(thedata$x, na.rm = TRUE),
          length = n
        )

        ## for uncertainties
        std <- stats::qnorm(level / 2 + 0.5)

        pred <- stats::predict(mod, data.frame(x = xseq), se = se)

        results <- data.frame(
          date = xseq,
          pred = pred$fit,
          lower = pred$fit - std * pred$se,
          upper = pred$fit + std * pred$se
        )
      } else {
        ## simulations required

        sam.size <- nrow(thedata)

        xseq <- seq(
          min(thedata$x, na.rm = TRUE),
          max(thedata$x, na.rm = TRUE),
          length = n
        )

        boot.pred <- matrix(nrow = n, ncol = n.sim) # fix: n rows, not sam.size

        ## set up bootstrap
        block.length <- 1

        if (autocor) {
          block.length <- round(sam.size^(1 / 3))
        }
        index <- samp_boot_block(sam.size, n.sim, block.length)

        ## predict first
        if (is.null(k)) {
          mod <- mgcv::gam(y ~ s(x), data = thedata, ...)
        } else {
          mod <- mgcv::gam(y ~ s(x, k = k), data = thedata, ...)
        }

        residuals <- stats::residuals(mod) ## residuals of the model

        pred.input <- stats::predict(mod, thedata)

        for (i in 1:n.sim) {
          new.data <- data.frame(
            x = thedata$x,
            y = pred.input + residuals[index[, i]]
          )

          mod <- mgcv::gam(y ~ s(x), data = new.data, ...)

          pred <- stats::predict(mod, data.frame(x = xseq))

          boot.pred[, i] <- as.vector(pred)
        }

        ## calculate percentiles
        percentiles <- apply(
          boot.pred,
          1,
          function(x) stats::quantile(x, probs = c(0.025, 0.975))
        )

        results <- as.data.frame(cbind(
          date = xseq,
          pred = rowMeans(boot.pred),
          lower = percentiles[1, ],
          upper = percentiles[2, ]
        ))
      }

      # convert class back to original
      class(results[[x]]) <- class_x
      return(results)
    },
    error = function(e) {
      if (nrow(thedata) >= 6) {
        xseq <- seq(
          min(thedata$x, na.rm = TRUE),
          max(thedata$x, na.rm = TRUE),
          length = n
        )
        mod_lo <- stats::loess(y ~ x, data = thedata)
        pred_lo <- stats::predict(
          mod_lo,
          newdata = data.frame(x = xseq),
          se = TRUE
        )
        std <- stats::qnorm(level / 2 + 0.5)
        results <- data.frame(
          tmp = xseq,
          pred = pred_lo$fit,
          lower = pred_lo$fit - std * pred_lo$se.fit,
          upper = pred_lo$fit + std * pred_lo$se.fit
        )
      } else {
        results <- data.frame(
          tmp = thedata$x,
          pred = NA_real_,
          lower = NA_real_,
          upper = NA_real_,
          stringsAsFactors = FALSE
        )
      }
      names(results)[1] <- x
      class(results[[x]]) <- class_x
      results
    }
  )
}

Try the openair package in your browser

Any scripts or data that you put into this service are public.

openair documentation built on May 20, 2026, 5:07 p.m.