R/quality_control.R

## =============================================================================
#' Flag Based on Static Threshold Values
#'
#' Values of \code{x} are checked against two specified thresholds to obtain
#' their quality control (qc) flags.
#'
#' This function is called by \code{\link{extract_qc}} but can be useful on its
#' own when filtering values of variable according to the 0 - 2 qc flag scheme.
#'
#' Obtained qc flags are assigned in the range 0 - 2 according to these rules:
#'
#' For \code{flag = "higher"}
#' \itemize{
#' \item If \code{x <= thr[1]}, QC flag = 0.
#' \item If \code{x > thr[1] & x <= thr[2]}, QC flag = 1.
#' \item If \code{x > thr[2]}, QC flag = 2.
#' }
#'
#' For \code{flag = "outside"}
#' \itemize{
#' \item If \code{x >= thr[1] & x <= thr[2]}, QC flag = 0.
#' \item If \code{x < thr[1] | x > thr[2]}, QC flag = 2.
#' }
#'
#' For \code{flag = "between"}
#' \itemize{
#' \item If \code{x <= thr[1] | x >= thr[2]}, QC flag = 0.
#' \item If \code{x > thr[1] & x < thr[2]}, QC flag = 2.
#' }
#'
#' For \code{flag = "lower"}
#' \itemize{
#' \item If \code{x >= thr[1]}, QC flag = 0.
#' \item If \code{x < thr[1] & x >= thr[2]}, QC flag = 1.
#' \item If \code{x < thr[2]}, QC flag = 2.
#' }
#'
#' @details
#' This is a modified version of the function of the same name in the R package
#' "openeddy".
#'
#' @return An integer vector with the same length as \code{x}. Its
#'   \code{varnames} and \code{units} attributes are set to  \code{name_out} and
#'   \code{"-"} values, respectively.
#'
#' @param x A numeric atomic type with \code{NULL} \code{dimensions}.
#' @param thr A numeric vector with 2 non-missing values.
#' @param flag A character string. Selects one of the available flagging
#'   approaches. Can be abbreviated. See 'Details'.
#'
flag_thr <- function(x, thr,
                     flag = c("higher", "outside", "between", "lower")) {
  # Check inputs
  check_input(x, c("numeric", "no_dims"))
  check_input(thr, c("numeric", "length_2", "all_finite"))
  flag <- match.arg(flag)
  out <- rep(NA, length(x))
  if (flag == "higher") {
    if (thr[1] > thr[2]) stop("'thr[1]' cannot be higher than 'thr[2]'.")
    out[x <= thr[1]] <- 0L
    out[x >  thr[1]] <- 1L
    out[x >  thr[2]] <- 2L
  }
  if (flag == "outside") {
    if (thr[1] > thr[2]) stop("'thr[1]' cannot be higher than 'thr[2]'.")
    out[x >= thr[1] & x <= thr[2]] <- 0L
    out[x < thr[1] | x > thr[2]] <- 2L
  }
  if (flag == "between") {
    if (thr[1] > thr[2]) stop("'thr[1]' cannot be higher than 'thr[2]'.")
    out[x <= thr[1] | x >= thr[2]] <- 0L
    out[x > thr[1] & x < thr[2]] <- 2L
  }
  if (flag == "lower") {
    if (thr[1] < thr[2]) stop("'thr[1]' cannot be lower than 'thr[2]'.")
    out[x >= thr[1]] <- 0L
    out[x <  thr[1]] <- 1L
    out[x <  thr[2]] <- 2L
  }
  attributes(out) <- list(thr = thr, flag = flag, additive = FALSE, na_as = NA)
  return(out)
}

## =============================================================================
#' Flag Runs of Equal Values
#'
#' Identify and flag values of runs with repeating values in a vector.
#'
#' \code{NA} values are omitted before evaluation of runs. Thus \code{NA}s do
#' not interrupt runs. Flagging is done according to the 0 - 2 quality control
#' flag scheme.
#'
#' @param x A numeric atomic type with NULL dimensions.
#' @param length A numeric value.
#' @param ignore
#'
#' @details
#' This is a modified version of the function of the same name in the R package
#' "openeddy". Primary modifications: added support for ignoring runs of value
#' 0.
#'
#' @return An integer vector with the same length as \code{x}. Its
#'   \code{varnames} and \code{units} attributes are set to  \code{name_out} and
#'   \code{"-"} values, respectively.
#'
flag_runs <- function(x, length = 2, ignore = NULL) {
  # matrix and array is numeric - we do not want them:
  check_input(x, c("numeric", "no_dims"))
  check_input(length, "numeric_val")
  out <- rep(NA, length(x))
  not_NA <- !is.na(x)
  x <- x[not_NA]
  if (!is.null(ignore)) {
    x <- x[x != ignore]
  }
  rl <- rle(x)$lengths
  keep <- rl >= length
  run_start <- (cumsum(rl) + 1 - rl)[keep]
  run_end <- cumsum(rl)[keep]
  df <- data.frame(run_start, run_end)
  runs <- unlist(apply(df, 1, function(x) x[1]:x[2]))
  out[not_NA] <- 0L
  out[not_NA][runs] <- 2L
  attributes(out) <- list(
    length = length, ignore = ignore, additive = FALSE, na_as = NA
  )
  out
}

## =============================================================================
#' Extract Quality Control Information from Coded Values
#'
#' This function is called by \code{\link{extract_QC}} and is not inteded to be
#' used directly. QC information for multiple variables stored in a character
#' vector of certain properties is extracted and interpreted.
#'
#' Each element of \code{x} is expected to have the same structure according to
#' the format specified by \code{units} attribute of \code{x}. \code{units}
#' format string (e.g. \code{"8u/v/w/ts/h2o/co2"}) starts with a prefix
#' (\code{"8"}) that is followed by variable names (\code{"u", "v", "w", "ts",
#' "h2o", "co2"}) that are distinguished by a separator (\code{"/"}). Elements
#' of \code{x} thus consist of either seven components (prefix and six flags for
#' each variable) or one \code{NA} value. Flags can have three values:
#' \itemize{
#' \item Flag 0: measured variable passed the test.
#' \item Flag 1: measured variable failed the test.
#' \item Flag 9: flag could not be obtained for the given variable (\code{NA}).
#' }
#'
#' \code{x} is interpreted in respect to instruments required to measure given
#' fluxes. SA is required for measurements of all fluxes and solely provides
#' data for computation of Tau and H fluxes. IRGA is additionally required for
#' measurements of LE and NEE. To confirm the correct performance of SA, all
#' variables \code{"u", "v", "w", "ts"} must have flag 0.
#' In the case of IRGA, all variables \code{"h2o", "co2"} must have flag 0.
#' Results are reported according to the QC scheme using QC flag range 0 - 2..
#'
#' @return A data frame with columns \code{"sa"} and \code{"irga75"}. Each
#'   column has attributes \code{"varnames"} and \code{"units"} and length equal
#'   to that of \code{x}.
#'
#' @param x An atomic type.
#' @param prefix Character string containing a \code{\link{regular expression}}
#'   identifying the prefix of coded values.
#' @param split Character string containing a \code{\link{regular expression}}
#'   identifying the separator of variable names in \code{units} attribute.
#'
#' @keywords internal
extract_coded <- function(data, var, qc_suffix, prefix = "[8]", split = "[/]") {
  x <- data[, var]
  check_input(x, "atomic")
  units <- tidyflux::units(x)
  vars <- unlist(strsplit(gsub(prefix, "", units), split = split))
  req_vars <- c("u", "v", "w", "ts", "h2o", "co2", "ch4")
  if (!all(req_vars %in% vars)) {
    stop(paste(
      "Coded variables in units of coded vector are missing:",
      paste0(req_vars[!(req_vars %in% vars)], collapse = ", "), "."
    ), call. = FALSE)
  }
  # Helper function - controls column splitting
  separate <- function(x, ...) {
    strsplit(as.character(gsub(..., "", x)), NULL)
  }
  l <- lapply(x, separate, prefix)
  # Helper function - controls NAs in splitted columns
  # 9 is internal code for NA. In case of missing record inserts NAs according
  # to number of variables in 'units'
  handle_na <- function(x, variables) {
    out <- as.numeric(unlist(x))
    out[out == 9] <- NA
    length(out) <- length(variables)
    return(out)
  }
  out <- as.data.frame(t(sapply(l, handle_na, vars)))
  names(out) <- vars
  if ("none" %in% names(out)) out$none <- NULL
  #out$sa <- apply(out[c("u", "v", "w", "ts")], 1, max)
  #out$irga75 <- apply(out[c("h2o", "co2")], 1, max)
  # values above 0 are interpreted as flag 2 for given variable
  out <- out %>% dplyr::mutate_all(dplyr::funs(dplyr::if_else(. == 1, 2L, 0L)))
  for (i in seq_len(ncol(out))) {
    colnames(out)[i] <- paste0("qc_", colnames(out[i]), "_", qc_suffix)
    attributes(out[, i]) <- list(additive = FALSE, na_as = NA)
  }
  return(out)
}

## =============================================================================
#' Extract Quality Control Information
#'
#' QC information stored in columns of data frame \code{x} is extracted and
#' interpreted in respect to instruments or individual fluxes.
#'
#' The data frame \code{x} is expected to have certain properties. It is
#' required that it contains column names according to considered QC checks. See
#' 'Arguments' above. Attribute \code{units} is required for columns
#' \code{"absolute_limits_hf" and "spikes_hf"} to extract the coded QC
#' information. See \code{\link{extract_coded}}.
#'
#' Extracted QC information can be relevant to fluxes measured by given
#' instrument(s), specific flux or is applicable to all fluxes. See 'Naming
#' Strategy' below. Results are reported according to the QC scheme using QC
#' flag range 0 - 2. In cases when extracted variable is checked against
#' thresholds (missfrac, scf, wresid), \code{\link{flag_thr}} is used to assign
#' flag values. First value of \code{missfrac_thr}, \code{scf_thr},
#' \code{w_unrot_thr} or \code{w_rot_thr} sets threshold for flag 1, second
#' value sets threshold for flag 2. If both threshold values are the same, only
#' flag 0 and 2 will be resolved (hard flag).
#'
#' Check of missing data in averaging period (missfrac) takes into account
#' number of valid records used for given averaging period. This number is
#' further reduced by the sum of count of high frequency data spikes out of
#' variables needed to compute covariance. Covariance pairs are w, u (Tau); w,
#' ts (H); w, h2o (LE) and w, co2 (NEE).
#'
#' @section Extracted QC Checks:
#' \itemize{
#' \item Check of plausibility limits (abslim). Test is not additive.
#' \item Check of high frequency data spike percentage in averaging period
#'   against thresholds (spikeshf). Test is not additive.
#' \item Check of missing data in averaging period against thresholds
#'   (missfrac). Test is not additive.
#' \item Check of spectral correction factor against thresholds (scf). Test is
#'   not additive.
#' \item Check of mean unrotated w (double rotation) or w residual (planar fit)
#'   against thresholds (wresid). Additive test.
#' }
#'
#' @section Content and format of columns:
#' \itemize{
#' \item \code{"absolute_limits_hf"}: hard flags (passed or failed the test) for
#'   individual variables for absolute limits. Limits for each variable are set
#'   in the post-processing software which also reports the resulting flags in a
#'   coded format. See \code{\link{extract_coded}}.
#' \item \code{"spikes_hf"}: hard flags (passed or failed the test) for
#'   individual variables for spike test. Threshold for maximum allowed
#'   percentage of spikes within averaging period for all variables is set in
#'   the post-processing software which also reports the resulting flags in a
#'   coded format. See \code{\link{extract_coded}}.
#' \item \code{"file_records"}: number of valid records found in the raw file
#' \item \code{"used_records"}: number of valid records used at given averaging
#'   period. This number can also contain high frequency spikes that should be
#'   excluded.
#' \item \code{"u_spikes", ts_spikes", "h2o_spikes" and "co2_spikes"}: number of
#'   high frequency spikes detected at given averaging period in respective
#'   variable. Values can be set to 0 if \code{"used_records"} already accounts
#'   for spikes.
#' \item \code{"Tau_scf", "H_scf", "LE_scf" and "co2_scf"}: spectral correction
#'   factor for given flux. Values are above 1.
#' \item \code{"w_unrot", "w_rot"}: unrotated and rotated w wind component,
#'   respectively (should be close to 0).
#' }
#'
#' @references
#'   Foken, T., Wichura, B., 1996. Tools for quality
#'   assessment of surface-based flux measurements. Agric. For. Meteorol. 78,
#'   83–105. doi:10.1016/0168-1923(95)02248-1
#'
#'   Mauder, M., Cuntz, M., Drue, C., Graf, A., Rebmann, C., Schmid, H.P.,
#'   Schmidt, M., Steinbrecher, R., 2013. A strategy for quality and uncertainty
#'   assessment of long-term eddy-covariance measurements. Agric. For. Meteorol.
#'   169, 122-135. doi:10.1016/j.agrformet.2012.09.006
#'
#' @return A data frame. Each column has attributes \code{"varnames"} and
#'   \code{"units"} .
#'
#' @param x A data frame with column names representing required variables.
#' @param abslim A logical value. Determines whether plausibility limits check
#'   should be considered. If \code{abslim = TRUE}, column
#'   \code{"absolute_limits_hf"} is required in \code{x}.
#' @param spikesHF A logical value. Determines whether check of spike percentage
#'   in averaging period should be considered. If \code{spikesHF = TRUE}, column
#'   \code{"spikes_hf"} is required in \code{x}.
#' @param missfrac A logical value. Determines whether check of missing data in
#'   averaging period against thresholds should be done.
#' @param scf A logical value. Determines whether check of spectral correction
#'   factor against thresholds should be done. If \code{scf = TRUE}, columns
#'   \code{"tau_scf", "h_scf", "le_scf" and "co2_scf"} are required in \code{x}.
#' @param wresid A logical value. Determines whether check of mean unrotated w
#'   and w residual after planar fit against thresholds should be done. If
#'   \code{wresid = TRUE}, columns \code{"w_unrot"} and \code{"w_rot"} are
#'   required in \code{x}.
#' @param rotation A character string. Specifies the type of coordinate rotation
#'   applied. Allowed values are "double" and "planar fit". Can be abbreviated.
#' @param prefix Character string containing a \code{\link{regular expression}}
#'   identifying the prefix of coded values. See \code{\link{extract_coded}}.
#' @param split Character string containing a \code{\link{regular expression}}
#'   identifying the separator of variable names in \code{units} attribute. See
#'   \code{\link{extract_coded}}.
#' @param missfrac_thr A numeric vector with 2 non-missing values. Represents
#'   thresholds (allowed fraction of missing high frequency data within
#'   averaging period) used if \code{missfrac = TRUE}.
#' @param scf_thr A numeric vector with 2 non-missing values. Represents
#'   thresholds used if \code{scf = TRUE}.
#' @param w_unrot_thr A numeric vector with 2 non-missing values. Represents
#'   thresholds for unrotated w used if \code{wresid = TRUE}.
#' @param w_rot_thr A numeric vector with 2 non-missing values. Represents
#'   thresholds for rotated w used if \code{wresid = TRUE}.
#'
extract_qc <- function(data, abslim = TRUE, spikeshf = TRUE, missfrac = TRUE,
                       sk = TRUE, scf = TRUE, wresid = TRUE, signal = TRUE,
                       precip = TRUE, rotation = c("double", "planar"),
                       prefix = "[8]", split = "[/]",
                       missfrac_thr = c(0.1, 0.1), scf_thr = c(2, 3),
                       wunrot_thr = c(0.35, 0.35), wrot_thr = c(0.1, 0.15),
                       rssi75_thr = c(90, 70), rssi77_thr = c(15, 10),
                       precip_thr = c(0, 0)) {
  # Basic check of input =======================================================
  data_names <- colnames(data)
  check_input(data, "data_frame")
  req_vars <- character()
  if (abslim) req_vars <- c(req_vars, "absolute_limits_hf")
  if (spikeshf) req_vars <- c(req_vars, "spikes_hf")
  if (missfrac) {
    req_vars <- c(
      req_vars, "file_records", "used_records", "u_sphf", "w_sphf",
      "ts_sphf", "co2_sphf", "h2o_sphf", "ch4_sphf"
    )
  }
  if (scf) {
    req_vars <- c(req_vars, "tau_scf", "h_scf", "le_scf", "co2_scf", "ch4_scf")
  }
  if (wresid) req_vars <- c(req_vars, "wunrot", "wrot")
  if (signal) req_vars <- c(req_vars, "rssi75", "rssi77")
  if (precip) req_vars <- c(req_vars, "p")
  if (length(req_vars) > 0 && !all(req_vars %in% data_names)) {
    stop(paste(
      "Missing",
      paste0(req_vars[!(req_vars %in% data_names)], collapse = ", "), "."
    ), call. = FALSE)
  }
  units <- tidyflux::units(data, names = TRUE)
  if (abslim && units["absolute_limits_hf"] == "-") {
    stop(
      "Missing 'absolute_limits_hf' format in its units attribute.",
      call. = FALSE
    )
  }
  if (spikeshf && units["spikes_hf"] == "-") {
    stop("Missing 'spikes_hf' format in its units attribute.", call. = FALSE)
  }
  if (sk && units["skewness_kurtosis_hf"] == "-") {
    stop(
      "Missing 'skewness_kurtosis_hf' format in its units attribute.",
      call. = FALSE
    )
  }
  # Create output dataframe for saving qc flags
  out <- data[, 0]

  # Extract any qc flags that already exist in input dataset
  qc_names <- stringr::str_subset(names(data), "qc")
  qc_df <- out[, 0]
  qc_df[, qc_names] <- data[, qc_names]
  for (i in seq_len(ncol(qc_df))) {
    attributes(qc_df[, i]) <- list(additive = FALSE, na_as = NA)
  }
  out <- dplyr::bind_cols(out, qc_df)

  # abslim creates individual flags based on EddyPro absolute_limits_hf column =
  if (abslim) {
    abslim_df <- extract_coded(
      data, "absolute_limits_hf", "abslimhf", prefix, split
    )
    out <- dplyr::bind_cols(out, abslim_df)
  }

  # sk creates individual flags based on EddyPro skewness_kurtosis_hf column ===
  if (sk) {
    sk_df <- extract_coded(data, "skewness_kurtosis_hf", "sk", prefix, split)
    out <- dplyr::bind_cols(out, sk_df)
  }

  # spikeshf creates individual flags based on EddyPro spikes_hf column ========
  if (spikeshf) {
    sphf_df <- extract_coded(data, "spikes_hf", "sphf", prefix, split)
    out <- dplyr::bind_cols(out, sphf_df)
    # TODO: >100 spikes filtering (Mammarella2014)
  }

  # missfrac creates flag for records with too few high frequency readings =====
  if (missfrac) {
    # Flag according to argument 'missfrac_thr' (missing fraction thresholds):
    # missfrac > missfrac_thr[1]: flag = 1, missfrac > missfrac_thr[2]: flag = 2
    if (!all(is.na(data$file_records))) {
      ur <- data$used_records
      mfr <- max(data$file_records, na.rm = TRUE)  # maximum file records (mfr)
      # mf computes missing fraction for particular flux
      mff <- function(data, ur, mfr) {
        1 - (ur - apply(data, 1, sum, na.rm = TRUE)) / mfr
      }
      mf_df <- out[, 0]
      mf_vars <- c("u", "v", "w", "ts", "h2o", "co2", "ch4")
      for (i in seq_along(mf_vars)) {
        sphf <- paste0(mf_vars[i], "_sphf")
        mf <- mff(data[sphf], ur, mfr)
        name <- paste0("qc_", mf_vars[i], "_mf")
        mf_df[, name] <- flag_thr(mf, missfrac_thr)
      }
      out <- dplyr::bind_cols(out, mf_df)
    } else {
      warning("'data$file_records' is all NA, skipped missfrac.", call. = FALSE)
    }
  }

  # scf creates flag for records with excessive spectral correction ============
  if (scf) {
    # Flag according to argument 'scf_thr' (spectral correction factor
    # thresholds): scf > scf_thr[1]: flag = 1, scf > scf_thr[2]: flag = 2
    nin <- c("tau_scf", "h_scf", "le_scf", "co2_scf", "ch4_scf")
    nout <- c("qc_tau_scf", "qc_h_scf", "qc_le_scf", "qc_nee_scf", "qc_ch4_scf")
    scf_df <- out[, 0]
    for (i in seq_along(nout)) {
      scf_df[, nout[i]] <- flag_thr(data[, nin[i]], scf_thr)
    }
    out <- dplyr::bind_cols(out, scf_df)
  }

  # wresid creates overall flag for records with probable advection ============
  if (wresid) {
    rotation <- match.arg(rotation)
    message(paste("wresid:", rotation, "rotation: using",
                  ifelse(rotation == "double", "wunrot_thr", "wrot_thr")))
    if (rotation == "double") {
      # In case of double rotation abs(wunrot) should be < 0.35 m/s
      out$qc_w_resid <- flag_thr(abs(data$wunrot), wunrot_thr)
    } else {
      # Flag correction according to residual absolute w after planar fit
      # abs(w) > 0.10 m s-1: flag incresead by +1, abs(w) > 0.15 m s-1: flag = 2
      out$qc_w_resid <- flag_thr(abs(data$wrot), wrot_thr)
    }
  }

  # signal creates flag for records below sensor stability limits ==============
  if (signal) {
    out$qc_irga75_ss <- flag_thr(data$rssi75, rssi75_thr, "lower")
    out$qc_irga77_ss <- flag_thr(data$rssi77, rssi77_thr, "lower")
  }

  # precip creates overall flag for records with rain interference =============
  if (precip) {
    out$qc_all_p <- flag_thr(data$p, precip_thr)
    # temporary fix to flag p events AND records immediately after p events
    out$qc_all_p_lead <- flag_thr(dplyr::lag(data$p), precip_thr)
    attr(out$qc_all_p_lead, "note") <- "flagged if lag(qc_all_p) = 2"
  }

  return(out)
}

## =============================================================================
#' Apply Flux Interdependency Quality Control Scheme
#'
#' Interdependency of H, LE and NEE QC flags due to corrections/conversions.
#'
#' Flux interdependency is an additive QC flag correction. Results follow the QC
#' scheme using QC flag range 0 - 2. Returned data frame follows the 'Naming
#' Strategy' described in \code{\link{extract_QC}}. Returned QC flags have QC
#' suffix interdep.
#'
#' To convert buoyancy flux to sensible heat flux (SND or Schotanus correction),
#' reliable measurements of LE must be available. To correct LE and NEE
#' estimated by open-path IRGA (\code{IRGA = "open"}) for the effects of density
#' fluctuations due to temperature and humidity fluctuations (WPL or Webb
#' correction), reliable measurements of H must be available. To perform WPL
#' correction of NEE estimated with any kind of IRGA, reliable measurements of
#' LE must be available. Thus following set of rules apply:
#'
#' If \code{IRGA = "en_closed"} \itemize{ \item If \code{qc_LE == 2 |
#' is.na(qc_LE) == TRUE}: qc_H and qc_NEE flags are increased by 1.} If
#' \code{IRGA = "open"} \itemize{ \item if \code{qc_LE == 2 | is.na(qc_LE) ==
#' TRUE}: qc_H flags are increased by 1. \item If \code{qc_H == 2 | is.na(qc_H)
#' == TRUE}: qc_LE flags are increased by 1. \item If \code{qc_H == 2 |
#' is.na(qc_H) == TRUE | qc_LE == 2 | is.na(qc_LE) == TRUE}: qc_NEE flags are
#' increased by 1.}
#'
#' @section References: Mauder, M., Cuntz, M., Drue, C., Graf, A., Rebmann, C.,
#'   Schmid, H.P., Schmidt, M., Steinbrecher, R., 2013. A strategy for quality
#'   and uncertainty assessment of long-term eddy-covariance measurements.
#'   Agric. For. Meteorol. 169, 122-135. doi:10.1016/j.agrformet.2012.09.006
#'
#' @return A data frame. Each column has attributes \code{"varnames"} and
#'   \code{"units"}.
#'
#' @param qc_le An atomic type containing numeric or NA values. Combination of
#'   all available QC flags for LE.
#' @param qc_h An atomic type containing numeric or NA values. Combination of
#'   all available QC flags for H. Used only if \code{irga = "open"}.
#' @param irga A character string specifying the type of irga. Allowed values
#'   are \code{"en_closed"} both for closed and enclosed path systems and
#'   \code{"open"} for open path systems. Can be abbreviated.
#'
interdep <- function(data, qc_le, qc_h = NULL, irga = c("open", "en_closed")) {
  qc_le <- data[, qc_le]
  if (!is.null(qc_h)) qc_h <- data[, qc_h]
  check_input(qc_le, c("atomic", "numeric_na"))
  check_input(qc_h, "atomic")
  irga <- match.arg(irga)
  if (irga == "open") {
    check_input(qc_h, "numeric_na")
  }
  len <- length(qc_le)
  if (irga == "open" && len != length(qc_h)) {
    stop("length(qc_le) and length(qc_h) must be equal.")
  }
  # qc_h is influencing variable only for open path system
  nout <- c("qc_h_inter", "qc_le_inter", "qc_fco2_inter")
  out <- data.frame(rep(NA, len), rep(NA, len), rep(NA, len))
  names(out) <- nout
  for (i in nout) {
    attributes(out[, i]) <- list(additive = TRUE, na_as = NA)
  }
  if (length(qc_le) == 0) {
    if (irga != "open") out$qc_le_inter <- NULL
    return(out)
  }
  qc_le[is.na(qc_le)] <- 2L
  out$qc_h_inter[qc_le <  2] <- 0L
  out$qc_h_inter[qc_le >= 2] <- 1L
  if (irga == "open") {
    qc_h[is.na(qc_h)] <- 2L
    out$qc_le_inter[qc_h <  2] <- 0L
    out$qc_le_inter[qc_h >= 2] <- 1L
    out$qc_fco2_inter[qc_le <  2 & qc_h <  2] <- 0L
    out$qc_fco2_inter[qc_le >= 2 | qc_h >= 2] <- 1L
  } else {
    out$qc_le_inter <- NULL
    out$qc_fco2_inter[qc_le <  2] <- 0L
    out$qc_fco2_inter[qc_le >= 2] <- 1L
  }
  return(out)
}

## =============================================================================
#' Apply despiking to a given subset
#'
#' This is a low level function not intended to be used on its own. It is
#' utilized by \code{\link{despike}} that should be used instead.
#'
#' @param block_1 The double-differenced \code{var} time series with appropriate
#'   block size.
#' @param block_2 Serves for computing despiking threshold and can be from the
#'   same block as \code{block_1} (default) or from the whole year (more
#'   conservative when less data available).
#' @param ref_block_1 \code{var} values from appropriate block for finding
#'   false-flagged spikes by comparison with scaled median absolute deviation.
#' @param ref_block_2 \code{var} values for computing median and mad from the
#'   same block as \code{ref_block_1} (default) or whole year (more conservative
#'   when less data available).
#' @param z A numeric value. \eqn{MAD} scale factor.
#' @param c A numeric value. \code{\link{mad}} scale factor. Default is \code{3
#'   * \link{mad} constant} (\code{i.e. 3 * 1.4826 = 4.4478}).
#'
#' @return A logical vector. \code{TRUE} values flag spikes.
#'
#' @keywords internal
desp <- function(b1, b2 = b1, rb1, rb2 = rb1, z, c) {
  mb <- median(b2, na.rm = TRUE) # med block
  br <- z * median(abs(b2 - mb), na.rm = TRUE) / 0.6745 # bracket
  md <- median(rb2, na.rm = TRUE) # med ref
  mad <- mad(rb2, constant = c, na.rm = TRUE) # mad ref
  out <- list(
    spike = (
      (b1 < (mb - br) | b1 > (mb + br)) & (rb1 > (md + mad) | rb1 < (md - mad))
    ),
    spike_all = (b1 < (mb - br) | b1 > (mb + br)),
    stats = data.frame(
      med_block = mb, scaled_mad = br,
      med_ref = md, mad_ref = mad
    )
  )
  is.na(out$spike) <- is.na(b1) # cannot check for spikes if d[i] missing
  # this happens for first and last value; has to be treated because of "&"
  return(out)
}

## =============================================================================
#' Apply despiking to all data blocks
#'
#' This is a low level function not intended to be used on its own. It is
#' utilized by \code{\link{despike}} that should be used instead.
#'
#' @param sd_sub A data frame prepared by \code{\link{despike}} with expected
#'   columns index, date, timestamp, var, spike and light This is a subset of
#'   data (\code{x}) provided to \code{\link{despike}} containing only the
#'   data of good quality.
#' @param date An unsubsetted vector of class \code{"Date"} extracted from data
#'   frame \code{x} fed to \code{\link{despike}}.
#' @param n A numeric value. Number of values within 13-day blocks required to
#'   obtain robust statistics.
#' @param z A numeric value. \eqn{MAD} scale factor.
#' @param c A numeric value. \code{\link{mad}} scale factor. Default is \code{3
#'   * \link{mad} constant} (\code{i.e. 3 * 1.4826 = 4.4478}).
#' @param plot A logical value. If \code{TRUE}, list of \code{\link{ggplot}}
#'   objects visualizing the spikes is also produced.
#'
#' @return If \code{plot = FALSE}, an updated data frame \code{sd_sub} with
#'   identified spikes in column Spike and with three new columns var_minus,
#'   var_plus and diff. If \code{plot = TRUE}, a list with elements \code{SD} (a
#'   data frame identical to the one produced if \code{plot = FALSE}) and
#'   \code{plots} containing a list of \code{ggplot} objects.
#'
#' @keywords internal
desp_loop <- function(sd_sub, date, n, z, c, plot = FALSE) {
  sd_sub$var_minus <- c(NA, sd_sub$var[-nrow(sd_sub)])
  sd_sub$var_plus <- c(sd_sub$var[-1], NA)
  sd_sub$diff <- with(sd_sub, (var - var_minus) - (var_plus - var))
  if (sum(!is.na(sd_sub$diff)) <= n) {
    stop("Number of values in subset is below n.", call. = FALSE)
  }
  sd_date <- date[1]
  if (plot) {
    i <- 1L
    plots <- list()
  }
  while (sd_date <= date[length(date)]) {
    # 13 day block filter
    block <- sd_sub$date >= sd_date & sd_sub$date < sd_date + 13
    if (sum(block, na.rm = TRUE) > n) {
      desp_res <- desp(
        b1 = sd_sub$diff[block],
        rb1 = sd_sub$var[block], z = z, c = c
      )
      sd_sub$spike[block] <- desp_res$spike
    } else if (sum(block, na.rm = TRUE) == 0) {
      sd_date <- sd_date + 13
      next
    } else {
      desp_res <- desp(
        b1 = sd_sub$diff[block], b2 = sd_sub$diff,
        rb1 = sd_sub$var[block], rb2 = sd_sub$var, z = z, c = c
      )
      sd_sub$spike[block] <- desp_res$spike
    }
    if (plot) {
      x <- na.omit(sd_sub[block, c("timestamp", "diff", "var")])
      x <- reshape2::melt(x, id = "timestamp")
      ts_range <- range(sd_sub[block, "timestamp"])
      y_range_d <- with(desp_res$stats, med_block + c(-scaled_mad, scaled_mad))
      y_range_f <- with(desp_res$stats, med_ref + c(-mad_ref, mad_ref))
      d <- data.frame(
        variable = levels(x$variable),
        yintercept = c(desp_res$stats$med_block, desp_res$stats$med_ref),
        xmin = rep(ts_range[1], 2),
        xmax = rep(ts_range[2], 2),
        ymin = c(y_range_d[1], y_range_f[1]),
        ymax = c(y_range_d[2], y_range_f[2])
      )
      xs1 <- sd_sub[block, ][desp_res$spike_all, c("timestamp", "diff")]
      xs2 <- sd_sub[block, ][desp_res$spike, c("timestamp", "var")]
      xs <- na.omit(
        reshape2::melt(merge(xs1, xs2, all = TRUE), id = "timestamp")
      )
      plots[[i]] <- ggplot2::ggplot(x, ggplot2::aes(timestamp, value)) +
        ggplot2::geom_point() + ggplot2::geom_line() +
        ggplot2::facet_grid(variable ~ ., scales = "free_y") +
        ggplot2::ggtitle(paste(sd_date, "-", sd_date + 12)) +
        ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) +
        ggplot2::geom_rect(data = d, ggplot2::aes(xmin = xmin, xmax = xmax,
                                                  ymin = ymin, ymax = ymax),
                  inherit.aes = FALSE, alpha = 1/5) +
        ggplot2::geom_hline(data = d, ggplot2::aes(yintercept = yintercept)) +
        ggplot2::geom_point(data = xs, ggplot2::aes(x = timestamp, y = value,
                                                    colour = variable))
      names(plots)[i] <- paste(sd_date, "-", sd_date + 12)
      i <- i + 1L
    }
    sd_date <- sd_date + 13
  }
  if (plot) return(list(sd = sd_sub, plots = plots))
  return(sd_sub)
}

## =============================================================================
#' Low Frequency Data Despiking
#'
#' Scaled median absolute deviation from the median is applied to
#' double-differenced time series to identify outliers.
#'
#' Low Frequency Data Despiking is not an additive quality control (QC) test.
#' \code{despike} follows the QC scheme using QC flag range 0 - 2.
#' \code{varnames} attribute of returned vector should be chosen to follow the
#' 'Naming Strategy' described in \code{\link{extract_QC}}, i.e. to be
#' distinguished by suffix \code{"_spikesLF"}.
#'
#' The data frame \code{x} is expected to have certain properties. It is
#' required that it contains column named \code{"timestamp"} of class
#' \code{"POSIXt"} with regular sequence of date-time values, typically with
#' (half-)hourly frequency. Any missing values in \code{"timestamp"} are not
#' allowed. Thus, if no records exist for given date-time value, it still has to
#' be included. It also has to contain required (depends on the argument values)
#' column names. If QC flags are not available for \code{var}, \code{qc_flag}
#' still has to be included in \code{x} as a named column with all values set to
#' \code{0} (i.e. all values will be checked for outliers).
#'
#' Only non-missing \code{var} values with corresponding \code{qc_flag} values
#' below \code{2} are used to detect the outliers. Missing \code{var} values or
#' those with assigned flag \code{2} or \code{NA} are not checked and marked by
#' \code{NA} flag in the output. Thus \code{NA} values of \code{despike} should
#' be considered as not checked records and therefore interpreted as \code{0}
#' flag within the \code{0 - 2} quality control scheme.
#'
#' \code{var_thr} is intended for exclusion of data clearly outside of
#' theoretically acceptable range for the whole dataset. If \code{var_thr} is
#' specified, \code{var} values below \code{var_thr[1]} and above
#' \code{var_thr[2]} are marked as spikes (flag 2) in the output. Such values
#' are further not used for computing statistics on double-differenced time
#' series.
#'
#' \code{light} and \code{night_thr} are intended to separate data to night and
#' day subsets with different statistical properties. \code{NA}s in
#' \code{x[light]} are thus not allowed due to the subsetting. Despiking is then
#' applied to individual subsets and combined QC flags are returned.
#'
#' Despiking is done within blocks of 13 consecutive days to account for
#' seasonality of measured variable. Within each block, all records are compared
#' with its neighbours and \eqn{d[i]} scores are produced. This is achieved by
#' double-differencing: \deqn{d[i] = (var[i] - var[i-1]) - (var[i+1] - var[i])}
#' In order to obtain maximum amount of \eqn{d[i]} scores, all missing
#' \code{var} values are removed from the block before \eqn{d[i]} scores are
#' produced. \code{var} values are marked as spikes if \eqn{d[i]} is higher
#' (lower) than median of \eqn{d[i]} scores (\eqn{M[d]}) + (-) scaled median
#' absolute deviation: \deqn{d[i] > M[d] + (z * MAD / 0.6745)} \deqn{d[i] < M[d]
#' - (z * MAD / 0.6745)} MAD is defined as: \deqn{MAD = median(abs(d[i] -
#' M[d]))}
#'
#' The algorithm tends to flag also values that are neighbours of spikes. To
#' prevent false flagging, \code{\link{median}} and \code{\link{mad}} of
#' \code{var} values within given block (\eqn{M[var]} and \eqn{mad[var]},
#' respectively) is computed. Values can be marked as spikes only if
#' \deqn{var[i] > M[var] + (c * mad / 1.4826)} or \deqn{var[i] < M[var] - (c *
#' mad / 1.4826)}
#'
#' Number of available double-differenced \code{var} values (\code{nVals}) is
#' checked within each block. If equal or below \code{nVals}, \eqn{d[i]} or
#' \eqn{var[i]} values are checked against the statistics computed using entire
#' dataset to ensure robustness.
#'
#' The whole process is repeated iteratively if \code{iter > 1}. This way new
#' statistics are produced for each iteration after exclusion of already
#' detected outliers and new spikes can be identified.
#'
#' @section Plotting: Plots are produced as a list of \code{ggplot} objects.
#'   Thus they can be assigned to an object and modified as needed before actual
#'   plotting. Each plot consists of two panels. The upper one shows the
#'   double-differenced time series, the bottom one the actual \code{var}
#'   values. Grey bands mark the respective intervals in which \code{var} value
#'   cannot be considered as an outlier. The red points in upper panel show all
#'   points that would be marked as spikes if \code{c = 0}. Only the points
#'   marked by blue color (bottom panel) will be considered spikes. The spike
#'   detection tolerance (width of grey bands) can be modified by scale factors
#'   \code{z} (upper panel) and \code{c} (bottom panel).
#'
#' @section References: Mauder, M., Cuntz, M., Drue, C., Graf, A., Rebmann, C.,
#'   Schmid, H.P., Schmidt, M., Steinbrecher, R., 2013. A strategy for quality
#'   and uncertainty assessment of long-term eddy-covariance measurements.
#'   Agric. For. Meteorol. 169, 122-135. doi:10.1016/j.agrformet.2012.09.006
#'
#'   Papale, D., Reichstein, M., Canfora, E., Aubinet, M., Bernhofer, C.,
#'   Longdoz, B., Kutsch, W., Rambal, S., Valentini, R., Vesala, T., Yakir, D.,
#'   2006. Towards a more harmonized processing of eddy covariance CO2 fluxes:
#'   algorithms and uncertainty estimation. Biogeosciences Discuss. 3, 961-992.
#'   doi:10.5194/bgd-3-961-2006
#'
#' @return If \code{plot = FALSE}, an integer vector with attributes
#'   \code{"varnames"} and \code{"units"}. If \code{plot = TRUE}, a list with
#'   elements \code{SD} and \code{plots}. \code{SD} contains identical output as
#'   produced when \code{plot = FALSE}, \code{plots} contains list of
#'   \code{ggplot} objects for respective iteration, \code{light} subset and
#'   13-day period.
#'
#'   Side effect: the counts of spikes detected in each iteration are printed to
#'   console.
#'
#' @param x A data frame with column names representing required variables. See
#'   'Details' below.
#' @param var A character string. Specifies the variable name in \code{x} with
#'   values to be despiked.
#' @param qc_flag A character string. Specifies the column name in \code{x} with
#'   \code{var} related quality control flag.
#' @param var_thr A numeric vector with 2 non-missing values. Specifies fixed
#'   thresholds for \code{var} values. Values outside this range will be flagged
#'   as spikes (flag 2). If \code{var_thr = NULL}, thresholds are not applied.
#' @param iter An integer value. Defines number of despiking iterations.
#' @param plot A logical value. If \code{TRUE}, list of \code{\link{ggplot}}
#'   objects visualizing the spikes is also produced.
#' @param light A character string. Selects preferred variable for incoming
#'   light intensity. \code{"PAR"} or \code{"Rg"} is allowed. Can be
#'   abbreviated. If \code{light = NULL}, \code{var} values are not separated to
#'   night/day subsets and \code{night_thr} is not used.
#' @param night_thr A numeric value that defines the threshold between night
#'   (for \code{light} values equal or lower than \code{night_thr}) and day (for
#'   \code{light} values higher than \code{night_thr}) for incoming light.
#' @param n A numeric value. Number of values within 13-day blocks required to
#'   obtain robust statistics.
#' @param z A numeric value. \eqn{MAD} scale factor.
#' @param c A numeric value. \code{\link{mad}} scale factor. Default is \code{3
#'   * \link{mad} constant} (\code{i.e. 3 * 1.4826 = 4.4478}).
#'
#' @export
despike <- function(x, timestamp, qc_flag = NULL, var_thr = NULL,
                    only_thr = FALSE, iter = 10, plot = FALSE,
                    light = rg, night_thr = 12, n = 50, z = 5, c = 4.4478) {
  # Check inputs
  check_input(x, c("atomic", "numeric_na"))
  if (is.null(qc_flag)) qc_flag <- rep(0, length(x))
  check_input(qc_flag, c("atomic", "numeric_na"))
  if (!is.null(light)) {
    check_input(light, c("atomic", "numeric_na"))
    check_input(night_thr, c("numeric", "length_1"))
  }
  check_input(timestamp, c("posixt", "all_finite"))
  diff <- diff(as.numeric(timestamp))
  if (any(diff != mean(diff))) {
    stop("Timestamp does not form regular sequence.")
  }
  if (iter[1] < 1) stop("Set 'iter' to 1 or higher.")
  date <- lubridate::date(timestamp)
  qc_flag[is.na(qc_flag)] <- 2L # NA qc is interpreted as flag 2
  if (!is.null(light)) {
    sun <- light
    if (anyNA(sun)) {
      stop("NAs in x['light'] not allowed. Use gap-filling then proceed.")
    }
  }
  # Filter for used data (qc not flag 2 or NA and var is not NA)
  use <- qc_flag < 2 & !is.na(x)
  # Introduce Spike flag, set a fixed threshold and update filter
  out <- rep(NA, length(x)) # Flag NA: if not changed it means not tested
  cat(paste0("Despiking ", substitute(x), "\n"))
  if (!is.null(var_thr)) {
    check_input(var_thr, c("numeric", "length_2", "all_finite"))
    if (var_thr[1] > var_thr[2]) {
      stop("'var_thr[1]' cannot be higher than 'var_thr[2]'.")
    }
    out[((x < var_thr[1]) | (x > var_thr[2])) & use] <- 2L
    n_unused <- length(out[((x < var_thr[1]) | (x > var_thr[2])) & use])
    cat(paste0("Excluded due to exceeded threshold: ", n_unused, "\n"))
    if (only_thr) {
      attributes(out) <- list(
        var_thr = var_thr, only_thr = only_thr, additive = FALSE, na_as = 0
      )
      return(out) # Stop and return if only using thresholds to detect spikes
    }
    use <- use & is.na(out) # Flag NA at this point = eligible for despiking
  } else n_unused <- 0
  # Spike Detection (Papale et al. 2006)
  # Create data frame with dates, x & sun intensity values
  sd_df <- data.frame(
    index = seq_len(length(x)), date = date,
    timestamp = timestamp, var = x, spike = out
  )
  if (!is.null(light)) {
    sd_df$light <- sun
  }
  if (plot) plots <- list()
  ts <- 0
  for (i in seq_len(iter)) {
    # Filter out all low quality data and create continuous NEE time series
    # sd_df after filtering keeps only records to be tested
    # On first iteration sd_df$spike contains only NAs (vals not checked yet)
    # During following iterations sd_df$spike:
    #  - has 2 (light = NULL) or 4 (night/day) NAs (boundaries of subsets)
    #  - otherwise equals 0 (spikes so far not detected)
    # sd_df is used to merge results from sd_l after despiking
    # sd_df$spike is a subset of out (out needs to be matched by sd_df$index)
    # length(sd_df$spike) decreases with iterations
    sd_df <- sd_df[use, ]
    sd_l <- list(all = sd_df)
    if (!is.null(light)) {
      night <- sd_l$all$light <= night_thr # filter to distinguish night/day
      sd_l <- list(night = sd_l$all[night, ], day = sd_l$all[!night, ])
    }
    # SD_l data frames have 3 more columns after despiking
    if (plot) {
      temp <- lapply(sd_l, desp_loop, date, n, z, c, plot)
      sd_l <- lapply(temp, "[[", "sd") # overwritten every iteration
      plots[[i]] <- lapply(temp, "[[", "plots") # all iteration results kept
      names(plots)[i] <- paste("iter", i)
    } else {
      sd_l <- lapply(sd_l, desp_loop, date, n, z, c, plot)
    }
    sd_l <- lapply(sd_l, function(x) {
      x$spike[x$spike == TRUE] <- 2L
      return(x)
    })
    # Export results from spike detection list into the data frame
    if (!is.null(light)) {
      sd_df$spike[match(sd_l$night$index, sd_df$index)] <- sd_l$night$spike
      sd_df$spike[match(sd_l$day$index, sd_df$index)] <- sd_l$day$spike
    } else {
      sd_df$spike <- sd_l$all$spike
    }
    # Export the results into the output vector
    out[sd_df$index] <- sd_df$spike
    # Count the number of detected spikes in this iteration and report them
    ns <- sum(sd_df$spike == 2, na.rm = TRUE)
    ts <- ts + ns
    cat(paste0("iter ", i, ": ", ns, "\n"))
    if (!ns && i < iter) {
      cat("Further iterations omitted\n")
      cat(paste0("Total records flagged: ", ts + n_unused, "\n"))
      break
    }
    # Update use filter - it has to fit the already subsetted SD_df
    # It has different length for each iteration
    use <- sd_df$spike == 0 | is.na(sd_df$spike)
  }
  if (plot) return(list(sd = out, plots = plots))
  attributes(out) <- list(
    var_thr = var_thr, only_thr = only_thr, iter = iter,
    night_thr = night_thr, n = n, z = z, c = c, additive = FALSE, na_as = 0
  )
  if (!is.null(light)) attr(out, "light") <- varnames(light)
  return(out)
}

## =============================================================================
#' Apply Fetch Filter
#'
#' \code{fetch_filter} flags all halfhours that have longer fetch distance (of
#' given percentage of contribution to the flux) than the user defined boundary
#' of the region of interest (ROI).
#'
#' Fetch distance is used together with wind direction information to identify
#' the cases when fetch reached beyond ROI.
#'
#' The spatial extent of the studied ecosystem (ROI) is specified by its
#' \code{ROI_boundary} that describes the distance from eddy covariance tower to
#' the edge of the studied ecosystem. \code{ROI_boundary} has following
#' properties: \itemize{ \item the number of circular sectors is the same as the
#' number of provided distances; \item the angular resolution of the ROI
#' boundary can be computed as 360 degrees / number of angular sectors; \item
#' the ROI boundary distances are assigned to the centers of their respective
#' circular sectors with first sector centered on 0 degrees.}
#'
#' Example: \code{ROI_boundary} specified as c(150, 200, 250, 300) has following
#' properties: \itemize{ \item 4 circular sectors with 90° angular resolution;
#' \item ROI boundary is specified for the whole first sector (315°, 45°] at the
#' distance 150 m from tower (center of the sector is 0°); \item boundary of the
#' second sector (45°, 135°] is at the distance 200 m; \item third sector (135°,
#' 225°] is at the distance 250 m; \item fourth sector (225°, 315°] is at the
#' distance 300 m.}
#'
#' @return An integer vector with attributes \code{"varnames"} and
#'   \code{"units"}.
#'
#' @param x A data frame with column names representing required variables.
#' @param fetch_name A character string. Specifies the column name in \code{x}
#'   with fetch distance values.
#' @param wd_name A character string. Specifies the column name in \code{x} with
#'   wind direction values.
#' @param ROI_boundary A numeric vector. Represents the boundary of region of
#'   interest.
#'
#' @export
flag_fetch <- function(fetch, wd, roi_boundary) {
  # Check inputs
  check_input(fetch, "numeric_na")
  check_input(wd, "numeric_na")
  check_input(roi_boundary, "numeric")
  sector_span <- 360 / length(roi_boundary)
  sector <- round(wd / sector_span) + 1
  sector[sector == (length(roi_boundary) + 1)] <- 1
  out <- rep(NA, length(x))
  out[fetch <= roi_boundary[sector]] <- 0L
  out[fetch >  roi_boundary[sector]] <- 2L
  attributes(out) <- list(
    fetch = varnames(fetch), roi_boundary = roi_boundary, additive = FALSE,
    na_as = NA
  )
  out
}

## =============================================================================
#' Check and Set Quality Flag
#'
#' Generate new vector from data and quality flag column. Adapted from fSetQF in
#' the package REddyProc.
#'
#' @param data A data frame.
#' @param var A numeric vector, the variable to be filtered.
#' @param qc_var A numeric vector, the quality flag of variable to be filtered.
#' @param qc_val The numeric value of quality flag for _good_ data, other data
#'   is set to missing.
#'
#' @details The quality flag will be applied to variable - unless the quality
#'   flag variable is set to \code{NULL}.
#'
#' @return A numeric vector with _good_ data.
#'
#' @export
apply_qc <- function(x, qc_flag = NULL, qc_val = 0) {
  # Check if specified columns exist and are numeric
  check_input(x, "numeric")
  name <- varnames(x)
  if (!is.null(qc_flag)) {
    check_input(qc_flag, "numeric_na")
    # Check quality flag value
    check_input(qc_val, "numeric")
    # Only use data values when good data (quality flag is equal to flag value)
    out <- dplyr::if_else(qc_flag > qc_val, NA_real_, x)
    if (sum(!is.na(out)) == 0) {
      warning(
        name, "' has no data after applying qc flag with value ",
        qc_val, ".", call. = FALSE
      )
    }
  } else {
    # Use all data
    out <- x
    if (sum(!is.na(out)) == 0) {
      warning(name, "' has no data.", call. = FALSE)
    }
  }
  # Add units
  attr(out, "units") <- attr(x, "units")
  attr(out, "varnames") <- dplyr::if_else(
    is.null(x), name, paste0(name, "_", "qccor")
  )
  return(out)
}

## =============================================================================
#' Summarize Quality Control Results
#'
#' \code{summary_qc} is a function that summarizes quality checking results in a
#' form of table or plot.
#'
#' \code{summary_qc} loads a data frame \code{x}, extracts quality control
#' columns from it based on \code{qc_names} and creates a table (\code{plot =
#' FALSE}) or a plot (\code{plot = TRUE}) for these columns. Results are
#' displayed in percentage (\code{perc = TRUE}) or counts (\code{perc = FALSE})
#' of given flag per dataset.
#'
#' \code{cumul = TRUE} specifies that cumulative effect of flags is considered
#' for each halfhour. Note that for \code{cumul = TRUE} the results do depend on
#' the order of qc_names. The flags on each row are cumulatively summed from
#' left to right. \code{additive} is used only if cumul = TRUE, otherwise
#' skipped.
#'
#' Adapted from: openeddy - summarize_QC
#'
#' @return A table or a plot depending on the \code{plot} argument value.
#'
#' @param x A data frame with column names.
#' @param qc_names A vector of names of data frame columns to combine.
#' @param na.as A vector of numeric or \code{NA} values for the \code{qc_names}
#'   subset of \code{x} that determines how should be the missing flags
#'   interpreted. If only one value is provided, all columns are treated the
#'   same way.
#' @param cumul A logical value that determines if cumulative (\code{cumul =
#'   TRUE}) or individual (\code{cumul = FALSE}) effects of quality control
#'   flags should be shown.
#' @param additive A vector of logical values (\code{TRUE} or \code{FALSE}) for
#'   the \code{qc_names} subset of \code{x} that determines if the flags should
#'   be treated as additive (\code{additive = TRUE}) or with fixed effect
#'   (\code{additive = FALSE}). If only one value is provided, all columns are
#'   considered to be of the same type.
#' @param plot A logical value. If \code{TRUE}, the results are plotted in the
#'   active graphical device. If \code{FALSE}, they are represented as a table.
#' @param perc A logical value. If \code{TRUE}, the results are reported in
#'   percentages. If \code{FALSE}, counts are used instead.
#' @param flux A character string. Used only if \code{plot = TRUE}. Includes the
#'   flux name in the plot title to emphasize the relevance of displayed test
#'   types.
#'
#' @export
summarize_qc <- function(x, var, na.as = NA, cumul = FALSE,
                         additive = FALSE, plot = FALSE, perc = TRUE,
                         flux = NULL) {
  # Find flags that match _var_ OR _var (incl. deps)
  x_names <- colnames(x)
  deps <- get_deps()[[var]]
  regex <- stringr::str_c("_", deps, "(_|\\b)", collapse = "|")
  qc_names <- stringr::str_subset(x_names, regex)

  # Check inputs
  check_input(x, "data_frame")
  check_input(qc_names, "character")
  check_input(na.as, c("vector", "numeric_na"))
  if (length(na.as) > 1) {
    if (length(qc_names) != length(na.as)) {
      stop("'na.as' must be of same length as 'qc_names' or length 1.")
    }
  } else if (length(na.as) != length(qc_names)) {
    na.as <- rep(na.as, length(qc_names))
  }
  if (!all(qc_names %in% x_names)) {
    stop(paste(
      "Missing",
      paste0(qc_names[!(qc_names %in% x_names)], collapse = ", "), "."
    ))
  }
  df <- x[1:nrow(x), qc_names]
  if (!all(is.na(na.as))) {
    for (i in seq_along(qc_names)) {
      df[is.na(df[i]), i] <- na.as[i]
    }
  }
  if (cumul) {
    check_input(additive, c("vector", "logical", "all_finite"))
    if (length(additive) > 1) {
      if (length(qc_names) != length(additive)) {
        stop("'additive' must be of same length as 'qc_names' or length 1.")
      }
    } else if (length(additive) != length(qc_names)) {
      additive <- rep(additive, length(qc_names))
    }
    if (all(additive)) {
      df <- as.data.frame(t(apply(df, 1, cumsum)))
    } else if (!any(additive)) {
      df <- as.data.frame(t(apply(df, 1, cummax)))
    } else {
      tmp <- df
      tmp[2:ncol(df)] <- NA
      for (i in 2:ncol(df)) {
        tmp[i] <- if (additive[i]) {
          rowSums(cbind(tmp[i - 1], df[i]))
        } else {
          apply(cbind(tmp[i - 1], df[i]), 1, max)
        }
      }
      df <- tmp
    }
  }
  df_m <- reshape2::melt(
    df, id.vars = NULL, variable.name = "type", value.name = "flag"
  )
  if (perc) {
    tab <- round(table(df_m, useNA = "ifany") / (nrow(df) / 100), 1)
  } else tab <- table(df_m, useNA = "ifany")
  if (!plot) {
    return(tab)
  } else {
    tab_m <- reshape2::melt(tab, variable.name = "type")
    if (cumul) {
      high5 <- tab_m$flag > 5
      high5[is.na(high5)] <- FALSE
      tab_m[high5, "flag"] <- "6+"
    }
    tab_m$flag <- as.factor(paste("Flag", tab_m$flag, sep = "_"))
    title <- ifelse(
      cumul,
      paste0(c(flux, "Cumulative effect of QC flags"), collapse = " - "),
      paste0(c(flux, "Independent QC flags"), collapse = " - ")
    )
    y_label <- ifelse(perc, "Percentage", "Count")
    ggplot2::ggplot(tab_m) +
      ggplot2::aes(x = type, y = value, fill = flag) +
      ggplot2::geom_bar(
        stat = "identity", position = ggplot2::position_stack(reverse = TRUE)
      ) +
      ggplot2::scale_fill_hue(guide = ggplot2::guide_legend(reverse = TRUE)) +
      ggplot2::labs(title = title, y = y_label) +
      ggplot2::theme(
        plot.title = ggplot2::element_text(hjust = 0.5),
        axis.text.x = ggplot2::element_text(angle = 20, vjust = 1, hjust = 1)
      )
  }
}

# TODO create "test_qc" function, which generates a qc flag and applies it to a
# variable of choice without adding it to the qc data frame
# option "plot = TRUE" to show a time series of the variable with points colored
# according to the tested flag

## =============================================================================
#' Write File Containing Quality Control Details
#'
#' @param data
#' @param siteyear
#' @param file_path
#'
#' @return
#' @export
#'
#' @examples
document_qc <- function(data, path) {
  #session_time <- format(Sys.time(), "%Y-%m-%d")
  #file_name <- paste0(siteyear, "_qc_details_", session_time, ".txt")
  n <- length(data)
  l <- list()
  for (i in 1:n) {
    l[[names(data)[i]]] <- attributes(data[, i])
  }
  capture.output(l, file = path)

  # Show effect of flux interdependency test on QC flags:
  # Interpretation:
  # 0 shows how many flag 0 were corrected to flag 1,
  # 1 shows overall change in flag 1
  # 2 shows how many flag 1 were corrected to flag 2,
  #abs(table(qc$qc_h_agg_inter) - table(qc$qc_h_agg_prelim))
  #abs(table(qc$qc_le_agg_inter) - table(qc$qc_le_agg_prelim))
  #abs(table(qc$qc_fco2_agg_inter) - table(qc$qc_fco2_agg_prelim))
}

## =============================================================================
#' Combine Quality Checking Results
#'
#' Combine quality checking results depending whether they have a fixed or
#' cumulative effect or any combination of these effects. It is also checked how
#' should \code{NA}s be interpreted.
#'
#' The quality checking results must be provided as a data frame with columns
#' containing quality flags resulting from individual tests/filters. For all
#' flags over all rows, the maximum and cumulative sum is taken for columns with
#' fixed and additive effect, respectively.
#'
#' Typical values of argument \code{na.as} are \code{NA}, \code{0} or \code{2}.
#' \code{NA} value is only formal and does not suggest any change in
#' interpretation. Value \code{0} is used in the case that the \code{NA} value
#' of the quality test/filter is an expected result and means that the half-hour
#' was not checked by or has a good quality according to the given test/filter
#' (e.g. \code{\link{despike}}). Value \code{2} would be used only if the user
#' wants to explicitly specify that \code{NA} flag value should not be used for
#' further analyses.
#'
#' @return A vector with attributes \code{varnames} and \code{units} is
#'   produced. \code{varnames} value is set by \code{name_out} argument and
#'   \code{units} value is set to default \code{"-"}.
#'
#' @param x A data frame with column names.
#' @param qc_names A vector of names of data frame columns to combine.
#' @param additive A vector of logical values (\code{TRUE} or \code{FALSE}) for
#'   the \code{qc_names} subset of \code{x} that determines if the flags should
#'   be treated as additive (\code{additive = TRUE}) or with fixed effect
#'   (\code{additive = FALSE}). If only one value is provided, all columns are
#'   considered to be of the same type. If \code{NULL} (default), values are
#'   determined for each of specified \code{qc_names} from their attribute
#'   \code{additive}.
#' @param na_as A vector of numeric or \code{NA} values for the \code{qc_names}
#'   subset of \code{x} that determines how should be the missing flags
#'   interpreted. If only one value is provided, all columns are treated the
#'   same way. If \code{NULL} (default), values are determined for each of
#'   specified \code{qc_names} from their attribute \code{na_as}.
#'
#' @export
combine_qc <- function(x, qc_names, additive = NULL, na_as = NULL) {
  x_names <- colnames(x)
  check_input(x, "data_frame")
  check_input(qc_names, "character")
  len <- length(qc_names)
  # Set additive and na_as according to qc flag attributes
  if (is.null(additive)) {
    additive <- vector(mode = "logical", length = len)
    for (i in 1:len) additive[i] <- attr(x[c(qc_names)][, i], "additive")
  }
  if (is.null(na_as)) {
    na_as <- vector(mode = "numeric", length = len)
    for (i in 1:len) na_as[i] <- attr(x[c(qc_names)][, i], "na_as")
  }
  check_input(
    additive, c("logical", "vector", "all_finite", "lengths"), qc_names
  )
  check_input(na_as, c("vector", "numeric_na"))
  if (length(na_as) > 1) {
    if (length(qc_names) != length(na_as)) {
      stop("'na_as' must be of same length as 'qc_names' or length 1.")
    }
  } else if (length(na_as) != length(qc_names)) {
    na_as <- rep(na_as, length(qc_names))
  }
  if (!all(qc_names %in% x_names)) {
    stop(paste(
      "missing", paste0(qc_names[!(qc_names %in% x_names)], collapse = ", ")
    ))
  }
  df <- x[c(qc_names)]
  if (length(qc_names) == 0) {
    return(df)
  }
  if (!all(is.na(na_as))) {
    for (i in seq_along(qc_names)) {
      df[is.na(df[i]), i] <- na_as[i]
    }
  }
  out <- df[, FALSE]
  if (any(additive)) {
    add <- df[additive]
    add_all_na <- all_na(add, 1)
    out$add[!add_all_na] <- rowSums(add[!add_all_na, , drop = FALSE])
  }
  if (any(!additive)) {
    abs <- df[!additive]
    abs_all_na <- all_na(abs, 1)
    out$abs[!abs_all_na] <- apply(abs[!abs_all_na, , drop = FALSE], 1, max)
  }
  out_all_na <- all_na(out, 1)
  out[!out_all_na, "name_out"] <- rowSums(out[!out_all_na, , drop = FALSE])
  out[, "name_out"][out["name_out"] > 2] <- 2L
  # Combined flags are not additive themselves
  attr(out[, "name_out"], "additive") <- FALSE
  attr(out[, "name_out"], "na_as") <- NA
  attr(out[, "name_out"], "combined_flags") <- qc_names
  return(out[, "name_out"])
}

combine_qc2 <- function(..., additive = NULL, na_as = NULL) {
  # Vectorized form of combine_qc
  # Capture dots
  dots <- rlang::enexprs(...)
  dots <- list(...)
  df <- as.data.frame(dots)
  len <- length(dots)
  # Get additive and na_as attributes
  if (is.null(additive)) {
    additive <- vector(mode = "logical", length = len)
    for (i in seq_along(dots)) additive[i] <- attr(df[[i]], "additive")
  }
  if (is.null(na_as)) {
    na_as <- vector(mode = "numeric", length = len)
    for (i in seq_along(dots)) na_as[i] <- attr(df[[i]], "na_as")
  }
  # Check inputs
  check_input(additive, c("logical", "vector", "all_finite"))
  check_input(na_as, c("vector", "numeric_na"))
  if (length(na_as) > 1) {
    if (len != length(na_as)) {
      stop("'na_as' must be of same length as 'qc_names' or length 1.")
    }
  } else if (length(na_as) != len) {
    na_as <- rep(na_as, len)
  }
  if (len == 0) {
    stop("No qc flags were passed to combine.")
  }
  if (!all(is.na(na_as))) {
    for (i in seq_along(dots)) {
      df[is.na(df[i]), i] <- na_as[i]
    }
  }
  # Combine flags
  out <- df[, FALSE]
  if (any(additive)) {
    add <- df[additive]
    add_all_na <- all_na(add, 1)
    out$add[!add_all_na] <- rowSums(add[!add_all_na, , drop = FALSE])
  }
  if (any(!additive)) {
    abs <- df[!additive]
    abs_all_na <- all_na(abs, 1)
    out$abs[!abs_all_na] <- apply(abs[!abs_all_na, , drop = FALSE], 1, max)
  }
  out_all_na <- all_na(out, 1)
  out[!out_all_na, "x"] <- rowSums(out[!out_all_na, , drop = FALSE])
  out[, "x"][out["x"] > 2] <- 2L
  # Combined flags are not additive themselves
  attr(out[, "x"], "additive") <- FALSE
  attr(out[, "x"], "na_as") <- NA
  attr(out[, "x"], "combined_flags") <- names(df)
  out[, "x"]
}

## =============================================================================
#' Combine Quality Checking Results by Dependencies
#'
#' Wrapper function that simplifies combining many qc flags for flux variables.
#' Flags are assigned to variables based on dependencies as defined in
#' \link{\code{get_deps}}.
#'
#' @param data
#' @param var
#' @param suffix_out
#' @param suffix_spec
#' @param deps
#' @param do_all
#'
#' @return
#' @export
#'
#' @export
combine_qc_deps <- function(data, var = c(
  "sa", "irga75", "irga77", "fco2", "fch4", "le", "h", "tau"
), suffix_out = "agg", suffix_spec = NULL, deps = get_deps(), do_all = FALSE) {
  check_input(data, "data_frame")
  check_input(suffix_out, "character")
  var <- match.arg(var)
  deps <- deps[[var]]
  # find flags that match _var_ OR _var
  regex <- stringr::str_c("_", deps, "(_|\\b)", collapse = "|")
  qc_names <- stringr::str_subset(names(data), regex)
  if (!is.null(suffix_spec)) {
    qc_names <- stringr::str_subset(qc_names, suffix_spec)
  }
  #browser()
  qc_subset <- data[, qc_names]
  len <- length(qc_subset)
  additive <- vector(mode = "logical", length = len)
  na_as <- vector(mode = "numeric", length = len)
  for (i in 1:len) {
    additive[i] <- attr(qc_subset[, i], "additive")
    na_as[i] <- attr(qc_subset[, i], "na_as")
  }
  name_out <- paste0("qc_", var, "_", suffix_out)
  out <- data
  out[, name_out] <- combine_qc(
    x = data, qc_names = qc_names, additive = additive, na_as = na_as
  )
  # Combined flags are not additive themselves
  attr(out, "additive") <- FALSE
  attr(out, "na_as") <- NA
  out
}
grahamstewart12/tidyflux documentation built on June 4, 2019, 7:44 a.m.