## =============================================================================
#' Flag Based on Static Threshold Values
#'
#' Values of \code{x} are checked against two specified thresholds to obtain
#' their quality control (qc) flags.
#'
#' This function is called by \code{\link{extract_qc}} but can be useful on its
#' own when filtering values of variable according to the 0 - 2 qc flag scheme.
#'
#' Obtained qc flags are assigned in the range 0 - 2 according to these rules:
#'
#' For \code{flag = "higher"}
#' \itemize{
#' \item If \code{x <= thr[1]}, QC flag = 0.
#' \item If \code{x > thr[1] & x <= thr[2]}, QC flag = 1.
#' \item If \code{x > thr[2]}, QC flag = 2.
#' }
#'
#' For \code{flag = "outside"}
#' \itemize{
#' \item If \code{x >= thr[1] & x <= thr[2]}, QC flag = 0.
#' \item If \code{x < thr[1] | x > thr[2]}, QC flag = 2.
#' }
#'
#' For \code{flag = "between"}
#' \itemize{
#' \item If \code{x <= thr[1] | x >= thr[2]}, QC flag = 0.
#' \item If \code{x > thr[1] & x < thr[2]}, QC flag = 2.
#' }
#'
#' For \code{flag = "lower"}
#' \itemize{
#' \item If \code{x >= thr[1]}, QC flag = 0.
#' \item If \code{x < thr[1] & x >= thr[2]}, QC flag = 1.
#' \item If \code{x < thr[2]}, QC flag = 2.
#' }
#'
#' @details
#' This is a modified version of the function of the same name in the R package
#' "openeddy".
#'
#' @return An integer vector with the same length as \code{x}. Its
#' \code{varnames} and \code{units} attributes are set to \code{name_out} and
#' \code{"-"} values, respectively.
#'
#' @param x A numeric atomic type with \code{NULL} \code{dimensions}.
#' @param thr A numeric vector with 2 non-missing values.
#' @param flag A character string. Selects one of the available flagging
#' approaches. Can be abbreviated. See 'Details'.
#'
flag_thr <- function(x, thr,
flag = c("higher", "outside", "between", "lower")) {
# Check inputs
check_input(x, c("numeric", "no_dims"))
check_input(thr, c("numeric", "length_2", "all_finite"))
flag <- match.arg(flag)
out <- rep(NA, length(x))
if (flag == "higher") {
if (thr[1] > thr[2]) stop("'thr[1]' cannot be higher than 'thr[2]'.")
out[x <= thr[1]] <- 0L
out[x > thr[1]] <- 1L
out[x > thr[2]] <- 2L
}
if (flag == "outside") {
if (thr[1] > thr[2]) stop("'thr[1]' cannot be higher than 'thr[2]'.")
out[x >= thr[1] & x <= thr[2]] <- 0L
out[x < thr[1] | x > thr[2]] <- 2L
}
if (flag == "between") {
if (thr[1] > thr[2]) stop("'thr[1]' cannot be higher than 'thr[2]'.")
out[x <= thr[1] | x >= thr[2]] <- 0L
out[x > thr[1] & x < thr[2]] <- 2L
}
if (flag == "lower") {
if (thr[1] < thr[2]) stop("'thr[1]' cannot be lower than 'thr[2]'.")
out[x >= thr[1]] <- 0L
out[x < thr[1]] <- 1L
out[x < thr[2]] <- 2L
}
attributes(out) <- list(thr = thr, flag = flag, additive = FALSE, na_as = NA)
return(out)
}
## =============================================================================
#' Flag Runs of Equal Values
#'
#' Identify and flag values of runs with repeating values in a vector.
#'
#' \code{NA} values are omitted before evaluation of runs. Thus \code{NA}s do
#' not interrupt runs. Flagging is done according to the 0 - 2 quality control
#' flag scheme.
#'
#' @param x A numeric atomic type with NULL dimensions.
#' @param length A numeric value.
#' @param ignore
#'
#' @details
#' This is a modified version of the function of the same name in the R package
#' "openeddy". Primary modifications: added support for ignoring runs of value
#' 0.
#'
#' @return An integer vector with the same length as \code{x}. Its
#' \code{varnames} and \code{units} attributes are set to \code{name_out} and
#' \code{"-"} values, respectively.
#'
flag_runs <- function(x, length = 2, ignore = NULL) {
# matrix and array is numeric - we do not want them:
check_input(x, c("numeric", "no_dims"))
check_input(length, "numeric_val")
out <- rep(NA, length(x))
not_NA <- !is.na(x)
x <- x[not_NA]
if (!is.null(ignore)) {
x <- x[x != ignore]
}
rl <- rle(x)$lengths
keep <- rl >= length
run_start <- (cumsum(rl) + 1 - rl)[keep]
run_end <- cumsum(rl)[keep]
df <- data.frame(run_start, run_end)
runs <- unlist(apply(df, 1, function(x) x[1]:x[2]))
out[not_NA] <- 0L
out[not_NA][runs] <- 2L
attributes(out) <- list(
length = length, ignore = ignore, additive = FALSE, na_as = NA
)
out
}
## =============================================================================
#' Extract Quality Control Information from Coded Values
#'
#' This function is called by \code{\link{extract_QC}} and is not inteded to be
#' used directly. QC information for multiple variables stored in a character
#' vector of certain properties is extracted and interpreted.
#'
#' Each element of \code{x} is expected to have the same structure according to
#' the format specified by \code{units} attribute of \code{x}. \code{units}
#' format string (e.g. \code{"8u/v/w/ts/h2o/co2"}) starts with a prefix
#' (\code{"8"}) that is followed by variable names (\code{"u", "v", "w", "ts",
#' "h2o", "co2"}) that are distinguished by a separator (\code{"/"}). Elements
#' of \code{x} thus consist of either seven components (prefix and six flags for
#' each variable) or one \code{NA} value. Flags can have three values:
#' \itemize{
#' \item Flag 0: measured variable passed the test.
#' \item Flag 1: measured variable failed the test.
#' \item Flag 9: flag could not be obtained for the given variable (\code{NA}).
#' }
#'
#' \code{x} is interpreted in respect to instruments required to measure given
#' fluxes. SA is required for measurements of all fluxes and solely provides
#' data for computation of Tau and H fluxes. IRGA is additionally required for
#' measurements of LE and NEE. To confirm the correct performance of SA, all
#' variables \code{"u", "v", "w", "ts"} must have flag 0.
#' In the case of IRGA, all variables \code{"h2o", "co2"} must have flag 0.
#' Results are reported according to the QC scheme using QC flag range 0 - 2..
#'
#' @return A data frame with columns \code{"sa"} and \code{"irga75"}. Each
#' column has attributes \code{"varnames"} and \code{"units"} and length equal
#' to that of \code{x}.
#'
#' @param x An atomic type.
#' @param prefix Character string containing a \code{\link{regular expression}}
#' identifying the prefix of coded values.
#' @param split Character string containing a \code{\link{regular expression}}
#' identifying the separator of variable names in \code{units} attribute.
#'
#' @keywords internal
extract_coded <- function(data, var, qc_suffix, prefix = "[8]", split = "[/]") {
x <- data[, var]
check_input(x, "atomic")
units <- tidyflux::units(x)
vars <- unlist(strsplit(gsub(prefix, "", units), split = split))
req_vars <- c("u", "v", "w", "ts", "h2o", "co2", "ch4")
if (!all(req_vars %in% vars)) {
stop(paste(
"Coded variables in units of coded vector are missing:",
paste0(req_vars[!(req_vars %in% vars)], collapse = ", "), "."
), call. = FALSE)
}
# Helper function - controls column splitting
separate <- function(x, ...) {
strsplit(as.character(gsub(..., "", x)), NULL)
}
l <- lapply(x, separate, prefix)
# Helper function - controls NAs in splitted columns
# 9 is internal code for NA. In case of missing record inserts NAs according
# to number of variables in 'units'
handle_na <- function(x, variables) {
out <- as.numeric(unlist(x))
out[out == 9] <- NA
length(out) <- length(variables)
return(out)
}
out <- as.data.frame(t(sapply(l, handle_na, vars)))
names(out) <- vars
if ("none" %in% names(out)) out$none <- NULL
#out$sa <- apply(out[c("u", "v", "w", "ts")], 1, max)
#out$irga75 <- apply(out[c("h2o", "co2")], 1, max)
# values above 0 are interpreted as flag 2 for given variable
out <- out %>% dplyr::mutate_all(dplyr::funs(dplyr::if_else(. == 1, 2L, 0L)))
for (i in seq_len(ncol(out))) {
colnames(out)[i] <- paste0("qc_", colnames(out[i]), "_", qc_suffix)
attributes(out[, i]) <- list(additive = FALSE, na_as = NA)
}
return(out)
}
## =============================================================================
#' Extract Quality Control Information
#'
#' QC information stored in columns of data frame \code{x} is extracted and
#' interpreted in respect to instruments or individual fluxes.
#'
#' The data frame \code{x} is expected to have certain properties. It is
#' required that it contains column names according to considered QC checks. See
#' 'Arguments' above. Attribute \code{units} is required for columns
#' \code{"absolute_limits_hf" and "spikes_hf"} to extract the coded QC
#' information. See \code{\link{extract_coded}}.
#'
#' Extracted QC information can be relevant to fluxes measured by given
#' instrument(s), specific flux or is applicable to all fluxes. See 'Naming
#' Strategy' below. Results are reported according to the QC scheme using QC
#' flag range 0 - 2. In cases when extracted variable is checked against
#' thresholds (missfrac, scf, wresid), \code{\link{flag_thr}} is used to assign
#' flag values. First value of \code{missfrac_thr}, \code{scf_thr},
#' \code{w_unrot_thr} or \code{w_rot_thr} sets threshold for flag 1, second
#' value sets threshold for flag 2. If both threshold values are the same, only
#' flag 0 and 2 will be resolved (hard flag).
#'
#' Check of missing data in averaging period (missfrac) takes into account
#' number of valid records used for given averaging period. This number is
#' further reduced by the sum of count of high frequency data spikes out of
#' variables needed to compute covariance. Covariance pairs are w, u (Tau); w,
#' ts (H); w, h2o (LE) and w, co2 (NEE).
#'
#' @section Extracted QC Checks:
#' \itemize{
#' \item Check of plausibility limits (abslim). Test is not additive.
#' \item Check of high frequency data spike percentage in averaging period
#' against thresholds (spikeshf). Test is not additive.
#' \item Check of missing data in averaging period against thresholds
#' (missfrac). Test is not additive.
#' \item Check of spectral correction factor against thresholds (scf). Test is
#' not additive.
#' \item Check of mean unrotated w (double rotation) or w residual (planar fit)
#' against thresholds (wresid). Additive test.
#' }
#'
#' @section Content and format of columns:
#' \itemize{
#' \item \code{"absolute_limits_hf"}: hard flags (passed or failed the test) for
#' individual variables for absolute limits. Limits for each variable are set
#' in the post-processing software which also reports the resulting flags in a
#' coded format. See \code{\link{extract_coded}}.
#' \item \code{"spikes_hf"}: hard flags (passed or failed the test) for
#' individual variables for spike test. Threshold for maximum allowed
#' percentage of spikes within averaging period for all variables is set in
#' the post-processing software which also reports the resulting flags in a
#' coded format. See \code{\link{extract_coded}}.
#' \item \code{"file_records"}: number of valid records found in the raw file
#' \item \code{"used_records"}: number of valid records used at given averaging
#' period. This number can also contain high frequency spikes that should be
#' excluded.
#' \item \code{"u_spikes", ts_spikes", "h2o_spikes" and "co2_spikes"}: number of
#' high frequency spikes detected at given averaging period in respective
#' variable. Values can be set to 0 if \code{"used_records"} already accounts
#' for spikes.
#' \item \code{"Tau_scf", "H_scf", "LE_scf" and "co2_scf"}: spectral correction
#' factor for given flux. Values are above 1.
#' \item \code{"w_unrot", "w_rot"}: unrotated and rotated w wind component,
#' respectively (should be close to 0).
#' }
#'
#' @references
#' Foken, T., Wichura, B., 1996. Tools for quality
#' assessment of surface-based flux measurements. Agric. For. Meteorol. 78,
#' 83–105. doi:10.1016/0168-1923(95)02248-1
#'
#' Mauder, M., Cuntz, M., Drue, C., Graf, A., Rebmann, C., Schmid, H.P.,
#' Schmidt, M., Steinbrecher, R., 2013. A strategy for quality and uncertainty
#' assessment of long-term eddy-covariance measurements. Agric. For. Meteorol.
#' 169, 122-135. doi:10.1016/j.agrformet.2012.09.006
#'
#' @return A data frame. Each column has attributes \code{"varnames"} and
#' \code{"units"} .
#'
#' @param x A data frame with column names representing required variables.
#' @param abslim A logical value. Determines whether plausibility limits check
#' should be considered. If \code{abslim = TRUE}, column
#' \code{"absolute_limits_hf"} is required in \code{x}.
#' @param spikesHF A logical value. Determines whether check of spike percentage
#' in averaging period should be considered. If \code{spikesHF = TRUE}, column
#' \code{"spikes_hf"} is required in \code{x}.
#' @param missfrac A logical value. Determines whether check of missing data in
#' averaging period against thresholds should be done.
#' @param scf A logical value. Determines whether check of spectral correction
#' factor against thresholds should be done. If \code{scf = TRUE}, columns
#' \code{"tau_scf", "h_scf", "le_scf" and "co2_scf"} are required in \code{x}.
#' @param wresid A logical value. Determines whether check of mean unrotated w
#' and w residual after planar fit against thresholds should be done. If
#' \code{wresid = TRUE}, columns \code{"w_unrot"} and \code{"w_rot"} are
#' required in \code{x}.
#' @param rotation A character string. Specifies the type of coordinate rotation
#' applied. Allowed values are "double" and "planar fit". Can be abbreviated.
#' @param prefix Character string containing a \code{\link{regular expression}}
#' identifying the prefix of coded values. See \code{\link{extract_coded}}.
#' @param split Character string containing a \code{\link{regular expression}}
#' identifying the separator of variable names in \code{units} attribute. See
#' \code{\link{extract_coded}}.
#' @param missfrac_thr A numeric vector with 2 non-missing values. Represents
#' thresholds (allowed fraction of missing high frequency data within
#' averaging period) used if \code{missfrac = TRUE}.
#' @param scf_thr A numeric vector with 2 non-missing values. Represents
#' thresholds used if \code{scf = TRUE}.
#' @param w_unrot_thr A numeric vector with 2 non-missing values. Represents
#' thresholds for unrotated w used if \code{wresid = TRUE}.
#' @param w_rot_thr A numeric vector with 2 non-missing values. Represents
#' thresholds for rotated w used if \code{wresid = TRUE}.
#'
extract_qc <- function(data, abslim = TRUE, spikeshf = TRUE, missfrac = TRUE,
sk = TRUE, scf = TRUE, wresid = TRUE, signal = TRUE,
precip = TRUE, rotation = c("double", "planar"),
prefix = "[8]", split = "[/]",
missfrac_thr = c(0.1, 0.1), scf_thr = c(2, 3),
wunrot_thr = c(0.35, 0.35), wrot_thr = c(0.1, 0.15),
rssi75_thr = c(90, 70), rssi77_thr = c(15, 10),
precip_thr = c(0, 0)) {
# Basic check of input =======================================================
data_names <- colnames(data)
check_input(data, "data_frame")
req_vars <- character()
if (abslim) req_vars <- c(req_vars, "absolute_limits_hf")
if (spikeshf) req_vars <- c(req_vars, "spikes_hf")
if (missfrac) {
req_vars <- c(
req_vars, "file_records", "used_records", "u_sphf", "w_sphf",
"ts_sphf", "co2_sphf", "h2o_sphf", "ch4_sphf"
)
}
if (scf) {
req_vars <- c(req_vars, "tau_scf", "h_scf", "le_scf", "co2_scf", "ch4_scf")
}
if (wresid) req_vars <- c(req_vars, "wunrot", "wrot")
if (signal) req_vars <- c(req_vars, "rssi75", "rssi77")
if (precip) req_vars <- c(req_vars, "p")
if (length(req_vars) > 0 && !all(req_vars %in% data_names)) {
stop(paste(
"Missing",
paste0(req_vars[!(req_vars %in% data_names)], collapse = ", "), "."
), call. = FALSE)
}
units <- tidyflux::units(data, names = TRUE)
if (abslim && units["absolute_limits_hf"] == "-") {
stop(
"Missing 'absolute_limits_hf' format in its units attribute.",
call. = FALSE
)
}
if (spikeshf && units["spikes_hf"] == "-") {
stop("Missing 'spikes_hf' format in its units attribute.", call. = FALSE)
}
if (sk && units["skewness_kurtosis_hf"] == "-") {
stop(
"Missing 'skewness_kurtosis_hf' format in its units attribute.",
call. = FALSE
)
}
# Create output dataframe for saving qc flags
out <- data[, 0]
# Extract any qc flags that already exist in input dataset
qc_names <- stringr::str_subset(names(data), "qc")
qc_df <- out[, 0]
qc_df[, qc_names] <- data[, qc_names]
for (i in seq_len(ncol(qc_df))) {
attributes(qc_df[, i]) <- list(additive = FALSE, na_as = NA)
}
out <- dplyr::bind_cols(out, qc_df)
# abslim creates individual flags based on EddyPro absolute_limits_hf column =
if (abslim) {
abslim_df <- extract_coded(
data, "absolute_limits_hf", "abslimhf", prefix, split
)
out <- dplyr::bind_cols(out, abslim_df)
}
# sk creates individual flags based on EddyPro skewness_kurtosis_hf column ===
if (sk) {
sk_df <- extract_coded(data, "skewness_kurtosis_hf", "sk", prefix, split)
out <- dplyr::bind_cols(out, sk_df)
}
# spikeshf creates individual flags based on EddyPro spikes_hf column ========
if (spikeshf) {
sphf_df <- extract_coded(data, "spikes_hf", "sphf", prefix, split)
out <- dplyr::bind_cols(out, sphf_df)
# TODO: >100 spikes filtering (Mammarella2014)
}
# missfrac creates flag for records with too few high frequency readings =====
if (missfrac) {
# Flag according to argument 'missfrac_thr' (missing fraction thresholds):
# missfrac > missfrac_thr[1]: flag = 1, missfrac > missfrac_thr[2]: flag = 2
if (!all(is.na(data$file_records))) {
ur <- data$used_records
mfr <- max(data$file_records, na.rm = TRUE) # maximum file records (mfr)
# mf computes missing fraction for particular flux
mff <- function(data, ur, mfr) {
1 - (ur - apply(data, 1, sum, na.rm = TRUE)) / mfr
}
mf_df <- out[, 0]
mf_vars <- c("u", "v", "w", "ts", "h2o", "co2", "ch4")
for (i in seq_along(mf_vars)) {
sphf <- paste0(mf_vars[i], "_sphf")
mf <- mff(data[sphf], ur, mfr)
name <- paste0("qc_", mf_vars[i], "_mf")
mf_df[, name] <- flag_thr(mf, missfrac_thr)
}
out <- dplyr::bind_cols(out, mf_df)
} else {
warning("'data$file_records' is all NA, skipped missfrac.", call. = FALSE)
}
}
# scf creates flag for records with excessive spectral correction ============
if (scf) {
# Flag according to argument 'scf_thr' (spectral correction factor
# thresholds): scf > scf_thr[1]: flag = 1, scf > scf_thr[2]: flag = 2
nin <- c("tau_scf", "h_scf", "le_scf", "co2_scf", "ch4_scf")
nout <- c("qc_tau_scf", "qc_h_scf", "qc_le_scf", "qc_nee_scf", "qc_ch4_scf")
scf_df <- out[, 0]
for (i in seq_along(nout)) {
scf_df[, nout[i]] <- flag_thr(data[, nin[i]], scf_thr)
}
out <- dplyr::bind_cols(out, scf_df)
}
# wresid creates overall flag for records with probable advection ============
if (wresid) {
rotation <- match.arg(rotation)
message(paste("wresid:", rotation, "rotation: using",
ifelse(rotation == "double", "wunrot_thr", "wrot_thr")))
if (rotation == "double") {
# In case of double rotation abs(wunrot) should be < 0.35 m/s
out$qc_w_resid <- flag_thr(abs(data$wunrot), wunrot_thr)
} else {
# Flag correction according to residual absolute w after planar fit
# abs(w) > 0.10 m s-1: flag incresead by +1, abs(w) > 0.15 m s-1: flag = 2
out$qc_w_resid <- flag_thr(abs(data$wrot), wrot_thr)
}
}
# signal creates flag for records below sensor stability limits ==============
if (signal) {
out$qc_irga75_ss <- flag_thr(data$rssi75, rssi75_thr, "lower")
out$qc_irga77_ss <- flag_thr(data$rssi77, rssi77_thr, "lower")
}
# precip creates overall flag for records with rain interference =============
if (precip) {
out$qc_all_p <- flag_thr(data$p, precip_thr)
# temporary fix to flag p events AND records immediately after p events
out$qc_all_p_lead <- flag_thr(dplyr::lag(data$p), precip_thr)
attr(out$qc_all_p_lead, "note") <- "flagged if lag(qc_all_p) = 2"
}
return(out)
}
## =============================================================================
#' Apply Flux Interdependency Quality Control Scheme
#'
#' Interdependency of H, LE and NEE QC flags due to corrections/conversions.
#'
#' Flux interdependency is an additive QC flag correction. Results follow the QC
#' scheme using QC flag range 0 - 2. Returned data frame follows the 'Naming
#' Strategy' described in \code{\link{extract_QC}}. Returned QC flags have QC
#' suffix interdep.
#'
#' To convert buoyancy flux to sensible heat flux (SND or Schotanus correction),
#' reliable measurements of LE must be available. To correct LE and NEE
#' estimated by open-path IRGA (\code{IRGA = "open"}) for the effects of density
#' fluctuations due to temperature and humidity fluctuations (WPL or Webb
#' correction), reliable measurements of H must be available. To perform WPL
#' correction of NEE estimated with any kind of IRGA, reliable measurements of
#' LE must be available. Thus following set of rules apply:
#'
#' If \code{IRGA = "en_closed"} \itemize{ \item If \code{qc_LE == 2 |
#' is.na(qc_LE) == TRUE}: qc_H and qc_NEE flags are increased by 1.} If
#' \code{IRGA = "open"} \itemize{ \item if \code{qc_LE == 2 | is.na(qc_LE) ==
#' TRUE}: qc_H flags are increased by 1. \item If \code{qc_H == 2 | is.na(qc_H)
#' == TRUE}: qc_LE flags are increased by 1. \item If \code{qc_H == 2 |
#' is.na(qc_H) == TRUE | qc_LE == 2 | is.na(qc_LE) == TRUE}: qc_NEE flags are
#' increased by 1.}
#'
#' @section References: Mauder, M., Cuntz, M., Drue, C., Graf, A., Rebmann, C.,
#' Schmid, H.P., Schmidt, M., Steinbrecher, R., 2013. A strategy for quality
#' and uncertainty assessment of long-term eddy-covariance measurements.
#' Agric. For. Meteorol. 169, 122-135. doi:10.1016/j.agrformet.2012.09.006
#'
#' @return A data frame. Each column has attributes \code{"varnames"} and
#' \code{"units"}.
#'
#' @param qc_le An atomic type containing numeric or NA values. Combination of
#' all available QC flags for LE.
#' @param qc_h An atomic type containing numeric or NA values. Combination of
#' all available QC flags for H. Used only if \code{irga = "open"}.
#' @param irga A character string specifying the type of irga. Allowed values
#' are \code{"en_closed"} both for closed and enclosed path systems and
#' \code{"open"} for open path systems. Can be abbreviated.
#'
interdep <- function(data, qc_le, qc_h = NULL, irga = c("open", "en_closed")) {
qc_le <- data[, qc_le]
if (!is.null(qc_h)) qc_h <- data[, qc_h]
check_input(qc_le, c("atomic", "numeric_na"))
check_input(qc_h, "atomic")
irga <- match.arg(irga)
if (irga == "open") {
check_input(qc_h, "numeric_na")
}
len <- length(qc_le)
if (irga == "open" && len != length(qc_h)) {
stop("length(qc_le) and length(qc_h) must be equal.")
}
# qc_h is influencing variable only for open path system
nout <- c("qc_h_inter", "qc_le_inter", "qc_fco2_inter")
out <- data.frame(rep(NA, len), rep(NA, len), rep(NA, len))
names(out) <- nout
for (i in nout) {
attributes(out[, i]) <- list(additive = TRUE, na_as = NA)
}
if (length(qc_le) == 0) {
if (irga != "open") out$qc_le_inter <- NULL
return(out)
}
qc_le[is.na(qc_le)] <- 2L
out$qc_h_inter[qc_le < 2] <- 0L
out$qc_h_inter[qc_le >= 2] <- 1L
if (irga == "open") {
qc_h[is.na(qc_h)] <- 2L
out$qc_le_inter[qc_h < 2] <- 0L
out$qc_le_inter[qc_h >= 2] <- 1L
out$qc_fco2_inter[qc_le < 2 & qc_h < 2] <- 0L
out$qc_fco2_inter[qc_le >= 2 | qc_h >= 2] <- 1L
} else {
out$qc_le_inter <- NULL
out$qc_fco2_inter[qc_le < 2] <- 0L
out$qc_fco2_inter[qc_le >= 2] <- 1L
}
return(out)
}
## =============================================================================
#' Apply despiking to a given subset
#'
#' This is a low level function not intended to be used on its own. It is
#' utilized by \code{\link{despike}} that should be used instead.
#'
#' @param block_1 The double-differenced \code{var} time series with appropriate
#' block size.
#' @param block_2 Serves for computing despiking threshold and can be from the
#' same block as \code{block_1} (default) or from the whole year (more
#' conservative when less data available).
#' @param ref_block_1 \code{var} values from appropriate block for finding
#' false-flagged spikes by comparison with scaled median absolute deviation.
#' @param ref_block_2 \code{var} values for computing median and mad from the
#' same block as \code{ref_block_1} (default) or whole year (more conservative
#' when less data available).
#' @param z A numeric value. \eqn{MAD} scale factor.
#' @param c A numeric value. \code{\link{mad}} scale factor. Default is \code{3
#' * \link{mad} constant} (\code{i.e. 3 * 1.4826 = 4.4478}).
#'
#' @return A logical vector. \code{TRUE} values flag spikes.
#'
#' @keywords internal
desp <- function(b1, b2 = b1, rb1, rb2 = rb1, z, c) {
mb <- median(b2, na.rm = TRUE) # med block
br <- z * median(abs(b2 - mb), na.rm = TRUE) / 0.6745 # bracket
md <- median(rb2, na.rm = TRUE) # med ref
mad <- mad(rb2, constant = c, na.rm = TRUE) # mad ref
out <- list(
spike = (
(b1 < (mb - br) | b1 > (mb + br)) & (rb1 > (md + mad) | rb1 < (md - mad))
),
spike_all = (b1 < (mb - br) | b1 > (mb + br)),
stats = data.frame(
med_block = mb, scaled_mad = br,
med_ref = md, mad_ref = mad
)
)
is.na(out$spike) <- is.na(b1) # cannot check for spikes if d[i] missing
# this happens for first and last value; has to be treated because of "&"
return(out)
}
## =============================================================================
#' Apply despiking to all data blocks
#'
#' This is a low level function not intended to be used on its own. It is
#' utilized by \code{\link{despike}} that should be used instead.
#'
#' @param sd_sub A data frame prepared by \code{\link{despike}} with expected
#' columns index, date, timestamp, var, spike and light This is a subset of
#' data (\code{x}) provided to \code{\link{despike}} containing only the
#' data of good quality.
#' @param date An unsubsetted vector of class \code{"Date"} extracted from data
#' frame \code{x} fed to \code{\link{despike}}.
#' @param n A numeric value. Number of values within 13-day blocks required to
#' obtain robust statistics.
#' @param z A numeric value. \eqn{MAD} scale factor.
#' @param c A numeric value. \code{\link{mad}} scale factor. Default is \code{3
#' * \link{mad} constant} (\code{i.e. 3 * 1.4826 = 4.4478}).
#' @param plot A logical value. If \code{TRUE}, list of \code{\link{ggplot}}
#' objects visualizing the spikes is also produced.
#'
#' @return If \code{plot = FALSE}, an updated data frame \code{sd_sub} with
#' identified spikes in column Spike and with three new columns var_minus,
#' var_plus and diff. If \code{plot = TRUE}, a list with elements \code{SD} (a
#' data frame identical to the one produced if \code{plot = FALSE}) and
#' \code{plots} containing a list of \code{ggplot} objects.
#'
#' @keywords internal
desp_loop <- function(sd_sub, date, n, z, c, plot = FALSE) {
sd_sub$var_minus <- c(NA, sd_sub$var[-nrow(sd_sub)])
sd_sub$var_plus <- c(sd_sub$var[-1], NA)
sd_sub$diff <- with(sd_sub, (var - var_minus) - (var_plus - var))
if (sum(!is.na(sd_sub$diff)) <= n) {
stop("Number of values in subset is below n.", call. = FALSE)
}
sd_date <- date[1]
if (plot) {
i <- 1L
plots <- list()
}
while (sd_date <= date[length(date)]) {
# 13 day block filter
block <- sd_sub$date >= sd_date & sd_sub$date < sd_date + 13
if (sum(block, na.rm = TRUE) > n) {
desp_res <- desp(
b1 = sd_sub$diff[block],
rb1 = sd_sub$var[block], z = z, c = c
)
sd_sub$spike[block] <- desp_res$spike
} else if (sum(block, na.rm = TRUE) == 0) {
sd_date <- sd_date + 13
next
} else {
desp_res <- desp(
b1 = sd_sub$diff[block], b2 = sd_sub$diff,
rb1 = sd_sub$var[block], rb2 = sd_sub$var, z = z, c = c
)
sd_sub$spike[block] <- desp_res$spike
}
if (plot) {
x <- na.omit(sd_sub[block, c("timestamp", "diff", "var")])
x <- reshape2::melt(x, id = "timestamp")
ts_range <- range(sd_sub[block, "timestamp"])
y_range_d <- with(desp_res$stats, med_block + c(-scaled_mad, scaled_mad))
y_range_f <- with(desp_res$stats, med_ref + c(-mad_ref, mad_ref))
d <- data.frame(
variable = levels(x$variable),
yintercept = c(desp_res$stats$med_block, desp_res$stats$med_ref),
xmin = rep(ts_range[1], 2),
xmax = rep(ts_range[2], 2),
ymin = c(y_range_d[1], y_range_f[1]),
ymax = c(y_range_d[2], y_range_f[2])
)
xs1 <- sd_sub[block, ][desp_res$spike_all, c("timestamp", "diff")]
xs2 <- sd_sub[block, ][desp_res$spike, c("timestamp", "var")]
xs <- na.omit(
reshape2::melt(merge(xs1, xs2, all = TRUE), id = "timestamp")
)
plots[[i]] <- ggplot2::ggplot(x, ggplot2::aes(timestamp, value)) +
ggplot2::geom_point() + ggplot2::geom_line() +
ggplot2::facet_grid(variable ~ ., scales = "free_y") +
ggplot2::ggtitle(paste(sd_date, "-", sd_date + 12)) +
ggplot2::theme(plot.title = ggplot2::element_text(hjust = 0.5)) +
ggplot2::geom_rect(data = d, ggplot2::aes(xmin = xmin, xmax = xmax,
ymin = ymin, ymax = ymax),
inherit.aes = FALSE, alpha = 1/5) +
ggplot2::geom_hline(data = d, ggplot2::aes(yintercept = yintercept)) +
ggplot2::geom_point(data = xs, ggplot2::aes(x = timestamp, y = value,
colour = variable))
names(plots)[i] <- paste(sd_date, "-", sd_date + 12)
i <- i + 1L
}
sd_date <- sd_date + 13
}
if (plot) return(list(sd = sd_sub, plots = plots))
return(sd_sub)
}
## =============================================================================
#' Low Frequency Data Despiking
#'
#' Scaled median absolute deviation from the median is applied to
#' double-differenced time series to identify outliers.
#'
#' Low Frequency Data Despiking is not an additive quality control (QC) test.
#' \code{despike} follows the QC scheme using QC flag range 0 - 2.
#' \code{varnames} attribute of returned vector should be chosen to follow the
#' 'Naming Strategy' described in \code{\link{extract_QC}}, i.e. to be
#' distinguished by suffix \code{"_spikesLF"}.
#'
#' The data frame \code{x} is expected to have certain properties. It is
#' required that it contains column named \code{"timestamp"} of class
#' \code{"POSIXt"} with regular sequence of date-time values, typically with
#' (half-)hourly frequency. Any missing values in \code{"timestamp"} are not
#' allowed. Thus, if no records exist for given date-time value, it still has to
#' be included. It also has to contain required (depends on the argument values)
#' column names. If QC flags are not available for \code{var}, \code{qc_flag}
#' still has to be included in \code{x} as a named column with all values set to
#' \code{0} (i.e. all values will be checked for outliers).
#'
#' Only non-missing \code{var} values with corresponding \code{qc_flag} values
#' below \code{2} are used to detect the outliers. Missing \code{var} values or
#' those with assigned flag \code{2} or \code{NA} are not checked and marked by
#' \code{NA} flag in the output. Thus \code{NA} values of \code{despike} should
#' be considered as not checked records and therefore interpreted as \code{0}
#' flag within the \code{0 - 2} quality control scheme.
#'
#' \code{var_thr} is intended for exclusion of data clearly outside of
#' theoretically acceptable range for the whole dataset. If \code{var_thr} is
#' specified, \code{var} values below \code{var_thr[1]} and above
#' \code{var_thr[2]} are marked as spikes (flag 2) in the output. Such values
#' are further not used for computing statistics on double-differenced time
#' series.
#'
#' \code{light} and \code{night_thr} are intended to separate data to night and
#' day subsets with different statistical properties. \code{NA}s in
#' \code{x[light]} are thus not allowed due to the subsetting. Despiking is then
#' applied to individual subsets and combined QC flags are returned.
#'
#' Despiking is done within blocks of 13 consecutive days to account for
#' seasonality of measured variable. Within each block, all records are compared
#' with its neighbours and \eqn{d[i]} scores are produced. This is achieved by
#' double-differencing: \deqn{d[i] = (var[i] - var[i-1]) - (var[i+1] - var[i])}
#' In order to obtain maximum amount of \eqn{d[i]} scores, all missing
#' \code{var} values are removed from the block before \eqn{d[i]} scores are
#' produced. \code{var} values are marked as spikes if \eqn{d[i]} is higher
#' (lower) than median of \eqn{d[i]} scores (\eqn{M[d]}) + (-) scaled median
#' absolute deviation: \deqn{d[i] > M[d] + (z * MAD / 0.6745)} \deqn{d[i] < M[d]
#' - (z * MAD / 0.6745)} MAD is defined as: \deqn{MAD = median(abs(d[i] -
#' M[d]))}
#'
#' The algorithm tends to flag also values that are neighbours of spikes. To
#' prevent false flagging, \code{\link{median}} and \code{\link{mad}} of
#' \code{var} values within given block (\eqn{M[var]} and \eqn{mad[var]},
#' respectively) is computed. Values can be marked as spikes only if
#' \deqn{var[i] > M[var] + (c * mad / 1.4826)} or \deqn{var[i] < M[var] - (c *
#' mad / 1.4826)}
#'
#' Number of available double-differenced \code{var} values (\code{nVals}) is
#' checked within each block. If equal or below \code{nVals}, \eqn{d[i]} or
#' \eqn{var[i]} values are checked against the statistics computed using entire
#' dataset to ensure robustness.
#'
#' The whole process is repeated iteratively if \code{iter > 1}. This way new
#' statistics are produced for each iteration after exclusion of already
#' detected outliers and new spikes can be identified.
#'
#' @section Plotting: Plots are produced as a list of \code{ggplot} objects.
#' Thus they can be assigned to an object and modified as needed before actual
#' plotting. Each plot consists of two panels. The upper one shows the
#' double-differenced time series, the bottom one the actual \code{var}
#' values. Grey bands mark the respective intervals in which \code{var} value
#' cannot be considered as an outlier. The red points in upper panel show all
#' points that would be marked as spikes if \code{c = 0}. Only the points
#' marked by blue color (bottom panel) will be considered spikes. The spike
#' detection tolerance (width of grey bands) can be modified by scale factors
#' \code{z} (upper panel) and \code{c} (bottom panel).
#'
#' @section References: Mauder, M., Cuntz, M., Drue, C., Graf, A., Rebmann, C.,
#' Schmid, H.P., Schmidt, M., Steinbrecher, R., 2013. A strategy for quality
#' and uncertainty assessment of long-term eddy-covariance measurements.
#' Agric. For. Meteorol. 169, 122-135. doi:10.1016/j.agrformet.2012.09.006
#'
#' Papale, D., Reichstein, M., Canfora, E., Aubinet, M., Bernhofer, C.,
#' Longdoz, B., Kutsch, W., Rambal, S., Valentini, R., Vesala, T., Yakir, D.,
#' 2006. Towards a more harmonized processing of eddy covariance CO2 fluxes:
#' algorithms and uncertainty estimation. Biogeosciences Discuss. 3, 961-992.
#' doi:10.5194/bgd-3-961-2006
#'
#' @return If \code{plot = FALSE}, an integer vector with attributes
#' \code{"varnames"} and \code{"units"}. If \code{plot = TRUE}, a list with
#' elements \code{SD} and \code{plots}. \code{SD} contains identical output as
#' produced when \code{plot = FALSE}, \code{plots} contains list of
#' \code{ggplot} objects for respective iteration, \code{light} subset and
#' 13-day period.
#'
#' Side effect: the counts of spikes detected in each iteration are printed to
#' console.
#'
#' @param x A data frame with column names representing required variables. See
#' 'Details' below.
#' @param var A character string. Specifies the variable name in \code{x} with
#' values to be despiked.
#' @param qc_flag A character string. Specifies the column name in \code{x} with
#' \code{var} related quality control flag.
#' @param var_thr A numeric vector with 2 non-missing values. Specifies fixed
#' thresholds for \code{var} values. Values outside this range will be flagged
#' as spikes (flag 2). If \code{var_thr = NULL}, thresholds are not applied.
#' @param iter An integer value. Defines number of despiking iterations.
#' @param plot A logical value. If \code{TRUE}, list of \code{\link{ggplot}}
#' objects visualizing the spikes is also produced.
#' @param light A character string. Selects preferred variable for incoming
#' light intensity. \code{"PAR"} or \code{"Rg"} is allowed. Can be
#' abbreviated. If \code{light = NULL}, \code{var} values are not separated to
#' night/day subsets and \code{night_thr} is not used.
#' @param night_thr A numeric value that defines the threshold between night
#' (for \code{light} values equal or lower than \code{night_thr}) and day (for
#' \code{light} values higher than \code{night_thr}) for incoming light.
#' @param n A numeric value. Number of values within 13-day blocks required to
#' obtain robust statistics.
#' @param z A numeric value. \eqn{MAD} scale factor.
#' @param c A numeric value. \code{\link{mad}} scale factor. Default is \code{3
#' * \link{mad} constant} (\code{i.e. 3 * 1.4826 = 4.4478}).
#'
#' @export
despike <- function(x, timestamp, qc_flag = NULL, var_thr = NULL,
only_thr = FALSE, iter = 10, plot = FALSE,
light = rg, night_thr = 12, n = 50, z = 5, c = 4.4478) {
# Check inputs
check_input(x, c("atomic", "numeric_na"))
if (is.null(qc_flag)) qc_flag <- rep(0, length(x))
check_input(qc_flag, c("atomic", "numeric_na"))
if (!is.null(light)) {
check_input(light, c("atomic", "numeric_na"))
check_input(night_thr, c("numeric", "length_1"))
}
check_input(timestamp, c("posixt", "all_finite"))
diff <- diff(as.numeric(timestamp))
if (any(diff != mean(diff))) {
stop("Timestamp does not form regular sequence.")
}
if (iter[1] < 1) stop("Set 'iter' to 1 or higher.")
date <- lubridate::date(timestamp)
qc_flag[is.na(qc_flag)] <- 2L # NA qc is interpreted as flag 2
if (!is.null(light)) {
sun <- light
if (anyNA(sun)) {
stop("NAs in x['light'] not allowed. Use gap-filling then proceed.")
}
}
# Filter for used data (qc not flag 2 or NA and var is not NA)
use <- qc_flag < 2 & !is.na(x)
# Introduce Spike flag, set a fixed threshold and update filter
out <- rep(NA, length(x)) # Flag NA: if not changed it means not tested
cat(paste0("Despiking ", substitute(x), "\n"))
if (!is.null(var_thr)) {
check_input(var_thr, c("numeric", "length_2", "all_finite"))
if (var_thr[1] > var_thr[2]) {
stop("'var_thr[1]' cannot be higher than 'var_thr[2]'.")
}
out[((x < var_thr[1]) | (x > var_thr[2])) & use] <- 2L
n_unused <- length(out[((x < var_thr[1]) | (x > var_thr[2])) & use])
cat(paste0("Excluded due to exceeded threshold: ", n_unused, "\n"))
if (only_thr) {
attributes(out) <- list(
var_thr = var_thr, only_thr = only_thr, additive = FALSE, na_as = 0
)
return(out) # Stop and return if only using thresholds to detect spikes
}
use <- use & is.na(out) # Flag NA at this point = eligible for despiking
} else n_unused <- 0
# Spike Detection (Papale et al. 2006)
# Create data frame with dates, x & sun intensity values
sd_df <- data.frame(
index = seq_len(length(x)), date = date,
timestamp = timestamp, var = x, spike = out
)
if (!is.null(light)) {
sd_df$light <- sun
}
if (plot) plots <- list()
ts <- 0
for (i in seq_len(iter)) {
# Filter out all low quality data and create continuous NEE time series
# sd_df after filtering keeps only records to be tested
# On first iteration sd_df$spike contains only NAs (vals not checked yet)
# During following iterations sd_df$spike:
# - has 2 (light = NULL) or 4 (night/day) NAs (boundaries of subsets)
# - otherwise equals 0 (spikes so far not detected)
# sd_df is used to merge results from sd_l after despiking
# sd_df$spike is a subset of out (out needs to be matched by sd_df$index)
# length(sd_df$spike) decreases with iterations
sd_df <- sd_df[use, ]
sd_l <- list(all = sd_df)
if (!is.null(light)) {
night <- sd_l$all$light <= night_thr # filter to distinguish night/day
sd_l <- list(night = sd_l$all[night, ], day = sd_l$all[!night, ])
}
# SD_l data frames have 3 more columns after despiking
if (plot) {
temp <- lapply(sd_l, desp_loop, date, n, z, c, plot)
sd_l <- lapply(temp, "[[", "sd") # overwritten every iteration
plots[[i]] <- lapply(temp, "[[", "plots") # all iteration results kept
names(plots)[i] <- paste("iter", i)
} else {
sd_l <- lapply(sd_l, desp_loop, date, n, z, c, plot)
}
sd_l <- lapply(sd_l, function(x) {
x$spike[x$spike == TRUE] <- 2L
return(x)
})
# Export results from spike detection list into the data frame
if (!is.null(light)) {
sd_df$spike[match(sd_l$night$index, sd_df$index)] <- sd_l$night$spike
sd_df$spike[match(sd_l$day$index, sd_df$index)] <- sd_l$day$spike
} else {
sd_df$spike <- sd_l$all$spike
}
# Export the results into the output vector
out[sd_df$index] <- sd_df$spike
# Count the number of detected spikes in this iteration and report them
ns <- sum(sd_df$spike == 2, na.rm = TRUE)
ts <- ts + ns
cat(paste0("iter ", i, ": ", ns, "\n"))
if (!ns && i < iter) {
cat("Further iterations omitted\n")
cat(paste0("Total records flagged: ", ts + n_unused, "\n"))
break
}
# Update use filter - it has to fit the already subsetted SD_df
# It has different length for each iteration
use <- sd_df$spike == 0 | is.na(sd_df$spike)
}
if (plot) return(list(sd = out, plots = plots))
attributes(out) <- list(
var_thr = var_thr, only_thr = only_thr, iter = iter,
night_thr = night_thr, n = n, z = z, c = c, additive = FALSE, na_as = 0
)
if (!is.null(light)) attr(out, "light") <- varnames(light)
return(out)
}
## =============================================================================
#' Apply Fetch Filter
#'
#' \code{fetch_filter} flags all halfhours that have longer fetch distance (of
#' given percentage of contribution to the flux) than the user defined boundary
#' of the region of interest (ROI).
#'
#' Fetch distance is used together with wind direction information to identify
#' the cases when fetch reached beyond ROI.
#'
#' The spatial extent of the studied ecosystem (ROI) is specified by its
#' \code{ROI_boundary} that describes the distance from eddy covariance tower to
#' the edge of the studied ecosystem. \code{ROI_boundary} has following
#' properties: \itemize{ \item the number of circular sectors is the same as the
#' number of provided distances; \item the angular resolution of the ROI
#' boundary can be computed as 360 degrees / number of angular sectors; \item
#' the ROI boundary distances are assigned to the centers of their respective
#' circular sectors with first sector centered on 0 degrees.}
#'
#' Example: \code{ROI_boundary} specified as c(150, 200, 250, 300) has following
#' properties: \itemize{ \item 4 circular sectors with 90° angular resolution;
#' \item ROI boundary is specified for the whole first sector (315°, 45°] at the
#' distance 150 m from tower (center of the sector is 0°); \item boundary of the
#' second sector (45°, 135°] is at the distance 200 m; \item third sector (135°,
#' 225°] is at the distance 250 m; \item fourth sector (225°, 315°] is at the
#' distance 300 m.}
#'
#' @return An integer vector with attributes \code{"varnames"} and
#' \code{"units"}.
#'
#' @param x A data frame with column names representing required variables.
#' @param fetch_name A character string. Specifies the column name in \code{x}
#' with fetch distance values.
#' @param wd_name A character string. Specifies the column name in \code{x} with
#' wind direction values.
#' @param ROI_boundary A numeric vector. Represents the boundary of region of
#' interest.
#'
#' @export
flag_fetch <- function(fetch, wd, roi_boundary) {
# Check inputs
check_input(fetch, "numeric_na")
check_input(wd, "numeric_na")
check_input(roi_boundary, "numeric")
sector_span <- 360 / length(roi_boundary)
sector <- round(wd / sector_span) + 1
sector[sector == (length(roi_boundary) + 1)] <- 1
out <- rep(NA, length(x))
out[fetch <= roi_boundary[sector]] <- 0L
out[fetch > roi_boundary[sector]] <- 2L
attributes(out) <- list(
fetch = varnames(fetch), roi_boundary = roi_boundary, additive = FALSE,
na_as = NA
)
out
}
## =============================================================================
#' Check and Set Quality Flag
#'
#' Generate new vector from data and quality flag column. Adapted from fSetQF in
#' the package REddyProc.
#'
#' @param data A data frame.
#' @param var A numeric vector, the variable to be filtered.
#' @param qc_var A numeric vector, the quality flag of variable to be filtered.
#' @param qc_val The numeric value of quality flag for _good_ data, other data
#' is set to missing.
#'
#' @details The quality flag will be applied to variable - unless the quality
#' flag variable is set to \code{NULL}.
#'
#' @return A numeric vector with _good_ data.
#'
#' @export
apply_qc <- function(x, qc_flag = NULL, qc_val = 0) {
# Check if specified columns exist and are numeric
check_input(x, "numeric")
name <- varnames(x)
if (!is.null(qc_flag)) {
check_input(qc_flag, "numeric_na")
# Check quality flag value
check_input(qc_val, "numeric")
# Only use data values when good data (quality flag is equal to flag value)
out <- dplyr::if_else(qc_flag > qc_val, NA_real_, x)
if (sum(!is.na(out)) == 0) {
warning(
name, "' has no data after applying qc flag with value ",
qc_val, ".", call. = FALSE
)
}
} else {
# Use all data
out <- x
if (sum(!is.na(out)) == 0) {
warning(name, "' has no data.", call. = FALSE)
}
}
# Add units
attr(out, "units") <- attr(x, "units")
attr(out, "varnames") <- dplyr::if_else(
is.null(x), name, paste0(name, "_", "qccor")
)
return(out)
}
## =============================================================================
#' Summarize Quality Control Results
#'
#' \code{summary_qc} is a function that summarizes quality checking results in a
#' form of table or plot.
#'
#' \code{summary_qc} loads a data frame \code{x}, extracts quality control
#' columns from it based on \code{qc_names} and creates a table (\code{plot =
#' FALSE}) or a plot (\code{plot = TRUE}) for these columns. Results are
#' displayed in percentage (\code{perc = TRUE}) or counts (\code{perc = FALSE})
#' of given flag per dataset.
#'
#' \code{cumul = TRUE} specifies that cumulative effect of flags is considered
#' for each halfhour. Note that for \code{cumul = TRUE} the results do depend on
#' the order of qc_names. The flags on each row are cumulatively summed from
#' left to right. \code{additive} is used only if cumul = TRUE, otherwise
#' skipped.
#'
#' Adapted from: openeddy - summarize_QC
#'
#' @return A table or a plot depending on the \code{plot} argument value.
#'
#' @param x A data frame with column names.
#' @param qc_names A vector of names of data frame columns to combine.
#' @param na.as A vector of numeric or \code{NA} values for the \code{qc_names}
#' subset of \code{x} that determines how should be the missing flags
#' interpreted. If only one value is provided, all columns are treated the
#' same way.
#' @param cumul A logical value that determines if cumulative (\code{cumul =
#' TRUE}) or individual (\code{cumul = FALSE}) effects of quality control
#' flags should be shown.
#' @param additive A vector of logical values (\code{TRUE} or \code{FALSE}) for
#' the \code{qc_names} subset of \code{x} that determines if the flags should
#' be treated as additive (\code{additive = TRUE}) or with fixed effect
#' (\code{additive = FALSE}). If only one value is provided, all columns are
#' considered to be of the same type.
#' @param plot A logical value. If \code{TRUE}, the results are plotted in the
#' active graphical device. If \code{FALSE}, they are represented as a table.
#' @param perc A logical value. If \code{TRUE}, the results are reported in
#' percentages. If \code{FALSE}, counts are used instead.
#' @param flux A character string. Used only if \code{plot = TRUE}. Includes the
#' flux name in the plot title to emphasize the relevance of displayed test
#' types.
#'
#' @export
summarize_qc <- function(x, var, na.as = NA, cumul = FALSE,
additive = FALSE, plot = FALSE, perc = TRUE,
flux = NULL) {
# Find flags that match _var_ OR _var (incl. deps)
x_names <- colnames(x)
deps <- get_deps()[[var]]
regex <- stringr::str_c("_", deps, "(_|\\b)", collapse = "|")
qc_names <- stringr::str_subset(x_names, regex)
# Check inputs
check_input(x, "data_frame")
check_input(qc_names, "character")
check_input(na.as, c("vector", "numeric_na"))
if (length(na.as) > 1) {
if (length(qc_names) != length(na.as)) {
stop("'na.as' must be of same length as 'qc_names' or length 1.")
}
} else if (length(na.as) != length(qc_names)) {
na.as <- rep(na.as, length(qc_names))
}
if (!all(qc_names %in% x_names)) {
stop(paste(
"Missing",
paste0(qc_names[!(qc_names %in% x_names)], collapse = ", "), "."
))
}
df <- x[1:nrow(x), qc_names]
if (!all(is.na(na.as))) {
for (i in seq_along(qc_names)) {
df[is.na(df[i]), i] <- na.as[i]
}
}
if (cumul) {
check_input(additive, c("vector", "logical", "all_finite"))
if (length(additive) > 1) {
if (length(qc_names) != length(additive)) {
stop("'additive' must be of same length as 'qc_names' or length 1.")
}
} else if (length(additive) != length(qc_names)) {
additive <- rep(additive, length(qc_names))
}
if (all(additive)) {
df <- as.data.frame(t(apply(df, 1, cumsum)))
} else if (!any(additive)) {
df <- as.data.frame(t(apply(df, 1, cummax)))
} else {
tmp <- df
tmp[2:ncol(df)] <- NA
for (i in 2:ncol(df)) {
tmp[i] <- if (additive[i]) {
rowSums(cbind(tmp[i - 1], df[i]))
} else {
apply(cbind(tmp[i - 1], df[i]), 1, max)
}
}
df <- tmp
}
}
df_m <- reshape2::melt(
df, id.vars = NULL, variable.name = "type", value.name = "flag"
)
if (perc) {
tab <- round(table(df_m, useNA = "ifany") / (nrow(df) / 100), 1)
} else tab <- table(df_m, useNA = "ifany")
if (!plot) {
return(tab)
} else {
tab_m <- reshape2::melt(tab, variable.name = "type")
if (cumul) {
high5 <- tab_m$flag > 5
high5[is.na(high5)] <- FALSE
tab_m[high5, "flag"] <- "6+"
}
tab_m$flag <- as.factor(paste("Flag", tab_m$flag, sep = "_"))
title <- ifelse(
cumul,
paste0(c(flux, "Cumulative effect of QC flags"), collapse = " - "),
paste0(c(flux, "Independent QC flags"), collapse = " - ")
)
y_label <- ifelse(perc, "Percentage", "Count")
ggplot2::ggplot(tab_m) +
ggplot2::aes(x = type, y = value, fill = flag) +
ggplot2::geom_bar(
stat = "identity", position = ggplot2::position_stack(reverse = TRUE)
) +
ggplot2::scale_fill_hue(guide = ggplot2::guide_legend(reverse = TRUE)) +
ggplot2::labs(title = title, y = y_label) +
ggplot2::theme(
plot.title = ggplot2::element_text(hjust = 0.5),
axis.text.x = ggplot2::element_text(angle = 20, vjust = 1, hjust = 1)
)
}
}
# TODO create "test_qc" function, which generates a qc flag and applies it to a
# variable of choice without adding it to the qc data frame
# option "plot = TRUE" to show a time series of the variable with points colored
# according to the tested flag
## =============================================================================
#' Write File Containing Quality Control Details
#'
#' @param data
#' @param siteyear
#' @param file_path
#'
#' @return
#' @export
#'
#' @examples
document_qc <- function(data, path) {
#session_time <- format(Sys.time(), "%Y-%m-%d")
#file_name <- paste0(siteyear, "_qc_details_", session_time, ".txt")
n <- length(data)
l <- list()
for (i in 1:n) {
l[[names(data)[i]]] <- attributes(data[, i])
}
capture.output(l, file = path)
# Show effect of flux interdependency test on QC flags:
# Interpretation:
# 0 shows how many flag 0 were corrected to flag 1,
# 1 shows overall change in flag 1
# 2 shows how many flag 1 were corrected to flag 2,
#abs(table(qc$qc_h_agg_inter) - table(qc$qc_h_agg_prelim))
#abs(table(qc$qc_le_agg_inter) - table(qc$qc_le_agg_prelim))
#abs(table(qc$qc_fco2_agg_inter) - table(qc$qc_fco2_agg_prelim))
}
## =============================================================================
#' Combine Quality Checking Results
#'
#' Combine quality checking results depending whether they have a fixed or
#' cumulative effect or any combination of these effects. It is also checked how
#' should \code{NA}s be interpreted.
#'
#' The quality checking results must be provided as a data frame with columns
#' containing quality flags resulting from individual tests/filters. For all
#' flags over all rows, the maximum and cumulative sum is taken for columns with
#' fixed and additive effect, respectively.
#'
#' Typical values of argument \code{na.as} are \code{NA}, \code{0} or \code{2}.
#' \code{NA} value is only formal and does not suggest any change in
#' interpretation. Value \code{0} is used in the case that the \code{NA} value
#' of the quality test/filter is an expected result and means that the half-hour
#' was not checked by or has a good quality according to the given test/filter
#' (e.g. \code{\link{despike}}). Value \code{2} would be used only if the user
#' wants to explicitly specify that \code{NA} flag value should not be used for
#' further analyses.
#'
#' @return A vector with attributes \code{varnames} and \code{units} is
#' produced. \code{varnames} value is set by \code{name_out} argument and
#' \code{units} value is set to default \code{"-"}.
#'
#' @param x A data frame with column names.
#' @param qc_names A vector of names of data frame columns to combine.
#' @param additive A vector of logical values (\code{TRUE} or \code{FALSE}) for
#' the \code{qc_names} subset of \code{x} that determines if the flags should
#' be treated as additive (\code{additive = TRUE}) or with fixed effect
#' (\code{additive = FALSE}). If only one value is provided, all columns are
#' considered to be of the same type. If \code{NULL} (default), values are
#' determined for each of specified \code{qc_names} from their attribute
#' \code{additive}.
#' @param na_as A vector of numeric or \code{NA} values for the \code{qc_names}
#' subset of \code{x} that determines how should be the missing flags
#' interpreted. If only one value is provided, all columns are treated the
#' same way. If \code{NULL} (default), values are determined for each of
#' specified \code{qc_names} from their attribute \code{na_as}.
#'
#' @export
combine_qc <- function(x, qc_names, additive = NULL, na_as = NULL) {
x_names <- colnames(x)
check_input(x, "data_frame")
check_input(qc_names, "character")
len <- length(qc_names)
# Set additive and na_as according to qc flag attributes
if (is.null(additive)) {
additive <- vector(mode = "logical", length = len)
for (i in 1:len) additive[i] <- attr(x[c(qc_names)][, i], "additive")
}
if (is.null(na_as)) {
na_as <- vector(mode = "numeric", length = len)
for (i in 1:len) na_as[i] <- attr(x[c(qc_names)][, i], "na_as")
}
check_input(
additive, c("logical", "vector", "all_finite", "lengths"), qc_names
)
check_input(na_as, c("vector", "numeric_na"))
if (length(na_as) > 1) {
if (length(qc_names) != length(na_as)) {
stop("'na_as' must be of same length as 'qc_names' or length 1.")
}
} else if (length(na_as) != length(qc_names)) {
na_as <- rep(na_as, length(qc_names))
}
if (!all(qc_names %in% x_names)) {
stop(paste(
"missing", paste0(qc_names[!(qc_names %in% x_names)], collapse = ", ")
))
}
df <- x[c(qc_names)]
if (length(qc_names) == 0) {
return(df)
}
if (!all(is.na(na_as))) {
for (i in seq_along(qc_names)) {
df[is.na(df[i]), i] <- na_as[i]
}
}
out <- df[, FALSE]
if (any(additive)) {
add <- df[additive]
add_all_na <- all_na(add, 1)
out$add[!add_all_na] <- rowSums(add[!add_all_na, , drop = FALSE])
}
if (any(!additive)) {
abs <- df[!additive]
abs_all_na <- all_na(abs, 1)
out$abs[!abs_all_na] <- apply(abs[!abs_all_na, , drop = FALSE], 1, max)
}
out_all_na <- all_na(out, 1)
out[!out_all_na, "name_out"] <- rowSums(out[!out_all_na, , drop = FALSE])
out[, "name_out"][out["name_out"] > 2] <- 2L
# Combined flags are not additive themselves
attr(out[, "name_out"], "additive") <- FALSE
attr(out[, "name_out"], "na_as") <- NA
attr(out[, "name_out"], "combined_flags") <- qc_names
return(out[, "name_out"])
}
combine_qc2 <- function(..., additive = NULL, na_as = NULL) {
# Vectorized form of combine_qc
# Capture dots
dots <- rlang::enexprs(...)
dots <- list(...)
df <- as.data.frame(dots)
len <- length(dots)
# Get additive and na_as attributes
if (is.null(additive)) {
additive <- vector(mode = "logical", length = len)
for (i in seq_along(dots)) additive[i] <- attr(df[[i]], "additive")
}
if (is.null(na_as)) {
na_as <- vector(mode = "numeric", length = len)
for (i in seq_along(dots)) na_as[i] <- attr(df[[i]], "na_as")
}
# Check inputs
check_input(additive, c("logical", "vector", "all_finite"))
check_input(na_as, c("vector", "numeric_na"))
if (length(na_as) > 1) {
if (len != length(na_as)) {
stop("'na_as' must be of same length as 'qc_names' or length 1.")
}
} else if (length(na_as) != len) {
na_as <- rep(na_as, len)
}
if (len == 0) {
stop("No qc flags were passed to combine.")
}
if (!all(is.na(na_as))) {
for (i in seq_along(dots)) {
df[is.na(df[i]), i] <- na_as[i]
}
}
# Combine flags
out <- df[, FALSE]
if (any(additive)) {
add <- df[additive]
add_all_na <- all_na(add, 1)
out$add[!add_all_na] <- rowSums(add[!add_all_na, , drop = FALSE])
}
if (any(!additive)) {
abs <- df[!additive]
abs_all_na <- all_na(abs, 1)
out$abs[!abs_all_na] <- apply(abs[!abs_all_na, , drop = FALSE], 1, max)
}
out_all_na <- all_na(out, 1)
out[!out_all_na, "x"] <- rowSums(out[!out_all_na, , drop = FALSE])
out[, "x"][out["x"] > 2] <- 2L
# Combined flags are not additive themselves
attr(out[, "x"], "additive") <- FALSE
attr(out[, "x"], "na_as") <- NA
attr(out[, "x"], "combined_flags") <- names(df)
out[, "x"]
}
## =============================================================================
#' Combine Quality Checking Results by Dependencies
#'
#' Wrapper function that simplifies combining many qc flags for flux variables.
#' Flags are assigned to variables based on dependencies as defined in
#' \link{\code{get_deps}}.
#'
#' @param data
#' @param var
#' @param suffix_out
#' @param suffix_spec
#' @param deps
#' @param do_all
#'
#' @return
#' @export
#'
#' @export
combine_qc_deps <- function(data, var = c(
"sa", "irga75", "irga77", "fco2", "fch4", "le", "h", "tau"
), suffix_out = "agg", suffix_spec = NULL, deps = get_deps(), do_all = FALSE) {
check_input(data, "data_frame")
check_input(suffix_out, "character")
var <- match.arg(var)
deps <- deps[[var]]
# find flags that match _var_ OR _var
regex <- stringr::str_c("_", deps, "(_|\\b)", collapse = "|")
qc_names <- stringr::str_subset(names(data), regex)
if (!is.null(suffix_spec)) {
qc_names <- stringr::str_subset(qc_names, suffix_spec)
}
#browser()
qc_subset <- data[, qc_names]
len <- length(qc_subset)
additive <- vector(mode = "logical", length = len)
na_as <- vector(mode = "numeric", length = len)
for (i in 1:len) {
additive[i] <- attr(qc_subset[, i], "additive")
na_as[i] <- attr(qc_subset[, i], "na_as")
}
name_out <- paste0("qc_", var, "_", suffix_out)
out <- data
out[, name_out] <- combine_qc(
x = data, qc_names = qc_names, additive = additive, na_as = na_as
)
# Combined flags are not additive themselves
attr(out, "additive") <- FALSE
attr(out, "na_as") <- NA
out
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.