tidyflux:

## =============================================================================
## Check Functions
##
## Functions for checking compatability with other functions, providing minor
## adjustments to ease processing with other functions, and producing
## descriptive messages to diagnose and prevent internal errors.
## =============================================================================


## =============================================================================
#' Check a Data Frame Prior to Analysis
#'
#' Original name: sEddyProc_initialize
#'
#' @param site_name A string with the site identifier.
#' @param data A data frame with at least three month of (half-)hourly
#'   site-level data.
#' @param vars An atomic vector of strings with selected column names. Tip: only
#'   select columns that are used in processing. Less columns = faster
#'   processing.
#' @param timestamp A string indicating the column name with POSIX timestamp.
#' @param dts An integer indicating the number of daily time steps (24 or 48).
#' @param char_cols Names of columns that should not be checked for numeric
#'   type, e.g. season column.
#' @param lat An integer indicating the site latitude in decimal degrees (-90 to
#'   +90).
#' @param long An integer indicating the site longitude in decimal degrees (-90
#'   to +90).
#' @param tz An integer indicating the site time zone in offset from UTC, e.g.
#'   -5 for U.S. Eastern time.
#'
#' @details
#' The timestamp must be provided in POSIX format, see also
#' \code{\link{convert_time}}. For required properties of the time series, see
#' \code{\link{check_times}}. Internally the half-hour timestamp is shifted to
#' the middle of the measurement period (minus 15 minutes or 30 minutes).
#'
#' All other columns may only contain numeric data. Please use NA as a gap flag
#' for missing data or low quality data not to be used in the processing. The
#' columns are also checked for plausibility with warnings if outside range.
#'
#' There are several attributes set within the data frame:
#' * site_name String for the site ID.
#' * total_records Number of data rows.
#' * daily_time_steps Number of daily time steps (24 or 48).
#' * start_year
#' * start_date
#' * end_year
#' * end_date
#' * latitude
#' * longitude
#' * time_zone
#'
#' @export
check_data <- function(data, site_name, vars, timestamp = "timestamp", dts = 48,
                       char_cols = character(0), lat = NA, long = NA, tz = NA) {
  # Check entries
  if (!check_value(site_name, "str") || is.na(site_name)) {
    stop("site_name must be a character string.", call. = FALSE)
  }
  if (!is.na(lat) & (lat < -90 | lat > 90)) {
    stop("lat must be in interval -90 to +90.", call. = FALSE)
  }
  if (!is.na(long) & (long < -180 | long > 180)) {
    stop("long must be in interval -180 to +180.", call. = FALSE)
  }
  if (!is.na(tz) & (tz < -12 | tz > +12 | tz != as.integer(tz))) {
    stop("tz must be an integer in interval -12 to +12.", call. = FALSE)
  }
  check_names(data, c(timestamp, vars))
  # Check timestamp
  check_times(data[, timestamp], dts = dts)
  # Half-period time offset in seconds
  time <- data[, timestamp] - (0.5 * 24 / dts * 60 * 60)
  check_numeric(data, setdiff(vars, char_cols))
  check_plausible(data, vars)
  attr(data, "site_name") <- site_name
  data <- cbind(timestamp = time, data[, vars, drop = FALSE])
  # sTEMP <<- data.frame(timestamp = time)
  # Initialize site data information from POSIX time stamp
  attr(data, "total_records") <- length(data$timestamp)
  attr(data, "daily_time_steps") <- dts
  attr(data, "start_year") <- format(data$timestamp[1], "%Y")
  attr(data, "start_date") <- format(data$timestamp[1], "%Y-%m-%d")
  attr(data, "end_year") <- format(data$timestamp[length(data$timestamp)], "%Y")
  attr(data, "end_date") <- format(data$timestamp[length(data$timestamp)],
                                   "%Y-%m-%d")
  attr(data, "latitude") <- lat
  attr(data, "longitude") <- long
  attr(data, "time_zone") <- tz
  # Initialize class fields
  message(
    "Data successfully checked for site: ", site_name, ". \n",
    "Metadata stored in data frame attributes."
  )
  return(data)
}

## =============================================================================
#' Check Function Inputs (Internal)
#'
#'
#' @param x
#' @param type
#' @param ...
#'
#' @details
#' Possible values for \code{type} (i.e. checking methods):
#' \code{c("all_finite", "atomic", "data_frame", "character", "length_1",
#' "length_2", "lengths", "logical", "no_dims", "numeric", "numeric_na",
#' "numeric_val", "posixt", "vector")}
#'
#' @return
#' @export
#'
#' @examples
#' @keywords internal
check_input <- function(x, type, y = NULL, action = c("stop", "warning"), ...) {
  # TODO rewrite so that conditionals produce the messages only and stop or
  # warning is called after everything is checked
  action <- match.arg(action)
  if ("all_finite" %in% type) {
    if (anyNA(x)) {
      stop(substitute(x), " must contain only finite values.", call. = FALSE)
    }
  }
  if ("atomic" %in% type) {
    if (!is.atomic(x)) {
      stop(substitute(x), " must be of type 'atomic'.", call. = FALSE)
    }
  }
  if ("data_frame" %in% type) {
    if (!is.data.frame(x) || is.null(colnames(x))) {
      stop(
        substitute(x), " must be of class 'data.frame' with colnames.",
        call. = FALSE
      )
    }
  }
  if ("character" %in% type) {
    if (!is.character(x)) {
      stop(substitute(x), " must be of class 'character'.", call. = FALSE)
    }
  }
  if ("length_1" %in% type) {
    if (length(x) != 1) {
      stop(substitute(x), " must be of length 1.", call. = FALSE)
    }
  }
  if ("length_2" %in% type) {
    if (length(x) != 2) {
      stop(substitute(x), " must be of length 2.", call. = FALSE)
    }
  }
  if ("lengths" %in% type) {
    if (length(x) != length(y) & length(x) > 1) {
      stop(
        substitute(x), " must have the same length as ", substitute(y),
        " or a length of 1.", call. = FALSE
      )
    }
  }
  if ("logical" %in% type) {
    if (!is.logical(x)) {
      stop(substitute(x), " must be of type 'logical'.", call. = FALSE)
    }
  }
  if ("no_dims" %in% type) {
    if (!is.null(dim(x))) {
      stop("'dim(", substitute(x), ")' must be NULL.", call. = FALSE)
    }
  }
  if ("numeric" %in% type) {
    if (!is.numeric(x)) {
      stop(substitute(x), " must be of class 'numeric'.", call. = FALSE)
    }
  }
  if ("numeric_na" %in% type) {
    if (!is.null(x) && !is.numeric(x) && !all(is.na(x))) {
      stop(substitute(x), " must contain numeric or NA values.", call. = FALSE)
    }
  }
  if ("numeric_val" %in% type) {
    if (!is.numeric(x) || length(x) != 1 || is.na(x)) {
      stop(
        substitute(x), " must be a non-missing numeric value.", call. = FALSE
      )
    }
  }
  if ("posixt" %in% type) {
    if (!inherits(x, "POSIXt")) {
      stop(substitute(x), " must be of class 'POSIXt'.", call. = FALSE)
    }
  }
  if ("vector" %in% type) {
    if (length(x) == 0) {
      stop(
        substitute(x), " must be a vector of non-zero length.", call. = FALSE
      )
    }
  }
}

check_units <- function(x, type) {
  units <- tidyflux::units(x)
  if (units != type) {
    warning(substitute(x), " units should be in ", type, ".", call. = FALSE)
  }
}

## =============================================================================
#' Test Variables for Equal Length
#'
#' @param var_list List of variables for which the length has to be compared.
#'
#' @note This function only plays a role if no input data.frame or matrix are
#'   provided. In this case it ensures that provided vectors have the same
#'   length to avoid trouble later up the function call.
#'
#' @return
#'
#' @keywords internal
check_length <- function(var_list) {
  if (is.list(unlist(var_list, recursive = FALSE))) {
    var_list <- unlist(var_list, recursive = FALSE)
  }
  length.vars <- sapply(var_list, length)
  length.vars <- length.vars[length.vars > 0]
  if (length(unique(length.vars)) >= 2) {
    if (sort(unique(length.vars))[1] != 1 | length(unique(length.vars)) > 2) {
      stop(
        "All input variables must have the same length or a length of 1.",
        call. = FALSE
      )
    }
  }
  return(var_list)
}

## =============================================================================
#' Check Plausibility of Common Eddy Variables
#'
#' Check plausibility of common eddy variables.
#' Adapted from: REddyProc - fCheckColPlausibility.
#'
#' @param data A data frame.
#' @param vars Variable (column) names.
#'
#' @details
#' The following abbreviated variables are checked for plausibility:
#' * Rg Global radiation (W m-2).
#' * pot_rad Potential global radiation (W m-2).
#' * PPFD Photosynthetic active radiation (umol m-2 s-1).
#' * PAR Photosynthetic active radiation, (umol m-2 s-1).
#' * Tair Air temperature (degrees Celsius).
#' * Tsoil Soil temperature (degrees Celsius).
#' * VPD Vapour pressure deficit (hPa)
#' * RH Relative humidity (%)
#' * NEE Net Ecosystem Exchange (umol CO2 m-2 s-1 or g C m-2 day-1)
#' * ustar Friction velocity (m s-1)
#' * E_0 (deg K)
#' * air_pressure (Pa)
#' * Rn Net radiation (W m-2)
#' * P Precipitation (m)
#' * G Soil heat flux (W m-2)
#' * SWC Soil water content (mg mg-1)
#'
#' @return Warnings are produced if a variable is outside plausible range.
#'
check_plausible <- function(data, vars, return_pos = FALSE) {
  # Check column names
  check_names(data, vars)
  # Separated checks for upper and lower limit to have separate warnings
  # Create index of plausability limits
  index <- list(
    "Rg" = c(0, 1200), "pot_rad" = c(0, 3000), "PPFD" = c(0, 2200),
    "PAR" = c(0, 2500), "Tair" = c(-50, 50), "Tsoil" = c(-20, 50),
    "VPD" = c(0, 50), "RH" = c(0, 100), "NEE" = c(-50, 100),
    "ustar" = c(-1, 50), "E_0" = c(0, 600), "air_pressure" = c(70000, 120000),
    "Rn" = c(-200, 1000), "P" = c(0, 0.05), "G" = c(-110, 220),
    "SWC" = c(0, 0.7)
  )
  vars <- vars[which(vars %in% names(index))]
  out_list <- list()
  for (i in seq_along(vars)) {
    var <- vars[i]
    range <- index[[var]]
    if (is.null(range)) next
    low <- check_range(data, var, c("<", range[1]), return_pos = return_pos)
    high <- check_range(data, var, c(">", range[2]), return_pos = return_pos)
    if (return_pos) {
      out <- unique(c(which(low == TRUE), which(high == TRUE)))
      out_list[[var]] <- out
    }
  }
  if (return_pos) {
    return(out_list)
  } else {
    message("All variables are within plausible range.")
  }
}

## =============================================================================
#' Check Half-hourly Time Series Data
#'
#' Check half-hourly time series data conformity with a one-year regular
#' sequence.
#' Adapted from: REddyProc - fCheckHHTimeSeries.
#'
#' @param timestamp A time vector in POSIX format.
#' @param dts The number of daily time steps (24 or 48).
#'
#' @details
#' The number of steps per day can be 24 (hourly) or 48 (half-hourly).
#' The timestamp must be provided in POSIX time format, with equidistant
#' half-hours and stamped on the half hour. Many fluxr procedures require at
#' least three months of data. Full days of data are preferred: the total amount
#' of data rows should be a multiple of the daily time step, and in accordance
#' with FLUXNET standards, the dataset is spanning from the end of the first
#' (half-)hour (0:30 or 1:00, respectively) and to midnight (0:00).
#'
#' @return Produces warnings and stops on errors.
#'
#' @export
check_times <- function(timestamp, dts = 48) {
  if (dts != 24 && dts != 48) {
    stop(
      "Daily time step must be hourly (24) or half-hourly (48). The following
      value is not valid: ", dts, ".",
      call. = FALSE)
  }
  if (!inherits(timestamp, "POSIXt")) {
    stop("Timestamp data not in POSIX format.", call. = FALSE)
  }
  len <- length(timestamp)
  dist <- 24 / dts * 3600
  dist1 <- as.numeric(timestamp[2:len])
  dist2 <- as.numeric(timestamp[1:len - 1])
  not_dist <- (dist1 - dist2) != dist
  not_dists <- sum(not_dist)
  if (not_dist > 0) {
    stop(
      "Timestamp must contain equidistant half-hours: \n",
      "Check records ", paste(which(not_dist), collapse = ", "), ".",
      call. = FALSE
    )
  }
  not_hh <- sum(as.numeric(timestamp) %% dist != 0)
  if (not_hh > 0) {
    stop(
      "Timestamp must be at half-hours: \n",
      "Check records ", which(as.numeric(timestamp) %% dist != 0), ".",
      call. = FALSE
    )
  }
  if (len < (90 * dts)) {
    stop(
      "Time series must contain at least 90 days of data: \n",
      "Data provided is missing ", 90 - len / dts, "days.",
      call. = FALSE
    )
  }
  resid <- len %% dts
  if (resid != 0) {
    warning(
      "Data should be provided in multiple of daily time step: \n",
      "One day only has ", resid, " records.",
      call. = FALSE
    )
  }
  hour1 <- as.POSIXlt(timestamp[1])$hour
  min1 <- as.POSIXlt(timestamp)$min
  hm_format <- format(timestamp[1], "%H:%M")
  if (dts == 48 && !(hour1 == 0 && min1 == 30)) {
    warning(
      "Timestamp of first record not at end of first half-hour: ", hm_format,
      " instead of 00:30.",
      call. = FALSE
    )
  }
  if (dts == 24 && !(hour1 == 1 && min1 == 00)) {
    warning(
      "Timestamp of first record not at end of first hour: ", hm_format,
      " instead of 01:00.",
      call. = FALSE
    )
  }
  last <- as.POSIXlt(timestamp[len])
  if (!(last$hour == 0 && last$min == 0)) {
    warning("Last timestamp is not midnight: 0:00.", call. = FALSE)
  }
}

## =============================================================================
#' Check if Variable is Numeric or Character String
#'
#' Check if variable is numeric or a non-empty character string.
#' Adapted from: REddyProc - fCheckValString & fCheckValNum.
#'
#' @param x A value to be checked (can also be NA of any type).
#' @param val The type of variable to check against \code{x}. Can be numeric
#'   "num" or string "str".
#'
#' @return Boolean value if TRUE or FALSE.
#'
check_value <- function(x, val = c("num", "str")) {
  val <- match.arg(val)
  if (val == "num") {
    if (length(x) == 0) {
      out <- FALSE
    } else if (!is.na(x) && !is.numeric(x)) {
      out <- FALSE
    } else {
      out <- TRUE
    }
  } else if (val == "str") {
    if (length(x) == 0) {
      out <- FALSE
    } else if (!is.na(x) && (!is.character(x) || !nzchar(x))) {
      out <- FALSE
    } else {
      out <- TRUE
    }
  }
  return(out)
}

## =============================================================================
#' Check Variable Against Specified Range
#'
#' Check if specified variable is outside of provided boundaries.
#' Original name: fCheckOutsideRange.
#'
#' @param data A data frame.
#' @param var A variable (column) name.
#' @param cond A logical condition for outside values (see details).
#' @param return_pos Should a vector cases outside the range also be returned?
#'
#' @details
#' Example of condition structure: c(' <= ', 0) or c(' <= ', 0, '|', '>', 20)
#' Allowed relational operators: < <= == >= > !=
#' Allowed logical operators: & |
#'
#' @return Warnings are produced if a variable is outside specified range. If
#'   return_pos = TRUE, a vector containing the positions of cases that are
#'   outside the range is also returned.
#'
check_range <- function(data, var, cond, return_pos = FALSE) {
  check_names(data, var)
  check_numeric(data, var)
  x <- data[, var]
  # Check condition
  conds <- c("<", " <= ", " == ", " >= ", ">", " != ")
  cond_text <-
    if (length(cond) == 2 && cond[1] %in% conds && nzchar(cond[2])) {
      # One condition
      paste("x ", cond[1], " ", cond[2], " & !is.na(x)", sep = "")
    } else if (length(cond) == 5 && all(cond[c(1, 4)] %in% conds) &&
      all(nzchar(cond[2]), nzchar(cond[5])) &&
      (cond[3] %in% c("|", "&"))) {
      # Two conditions
      paste("(x ", cond[1], " ", cond[2], "  ", cond[3], " x ", cond[4],
        " ", cond[5], ") & !is.na(x)",
        sep = ""
      )
    } else {
      stop(
        "Incorrect condition definition: ", paste(cond, collapse = " "),
        call. = FALSE
      )
    }
  # Warning message
  out_b <- eval(parse(text = cond_text))
  out_n <- sum(out_b)
  if (out_n > 0) {
    warning(
      "Variable outside range in ", out_n,
      " cases. Invalid values with ", var, " ",
      paste(cond, collapse = " "), ": ",
      paste(format(x[out_b][1:(min(out_n, 50))], digits = 2),
        collapse = ", "
      ), " ...",
      call. = FALSE
    )
  }
  if (return_pos) {
    return(out_b)
  } else return(invisible(NULL))
}

## =============================================================================
#' Check if Column Exists in Data Frame
#'
#' Check if specified columns exist in data frame. Original name: fCheckColNames
#'
#' @param data A data frame.
#' @param colnames A vector of variables to be checked.
#'
#' @return Stops on errors.
#'
check_names <- function(data, colnames = "none") {
  # Exclude dummy "none"
  none <- colnames %in% "none"
  # Check if specified columns exist in data frame
  names <- colnames[!none] %in% colnames(data)
  if (!all(names)) {
    colnames <- paste(colnames[!none][!names], collapse = ", ", sep = "")
    stop(
      "Missing specified columns in dataset:",
      colnames,
      call. = FALSE
    )
  }
}

## =============================================================================
#' Check if Column is Numeric
#'
#' Check if specified columns are numeric. Original name: fCheckColNum.
#'
#' @param data A data frame.
#' @param colnames A vector of variables to be checked, with "none" as dummy.
#'
#' @return Stops on errors.
#'
check_numeric <- function(data, colnames = "none") {
  # Exclude dummy "none"
  nonecols <- colnames %in% "none"
  # Check if specified columns are numeric
  numcols <- sapply(data[, colnames[!nonecols]], is.numeric)
  if (!all(numcols) ) {
    colnames <- paste(colnames[!nonecols][!numcols], collapse = ", ")
    stop("The following columns in dataset are non-numeric: ", colnames)
  }
}

## =============================================================================
#' Check whether string, list or vector is empty
#'
#' This function checks whether a string or character vector (of length 1), a
#' list or any vector (numeric, atomic) is empty or not.
#' From sjmisc.
#'
#' @param x String, character vector, list, data.frame or numeric vector or
#'   factor.
#' @param first.only Logical, if \code{FALSE} and \code{x} is a character
#'   vector, each element of \code{x} will be checked if empty. If \code{TRUE},
#'   only the first element of \code{x} will be checked.
#' @return Logical, \code{TRUE} if \code{x} is a character vector or string and
#'   is empty, \code{TRUE} if \code{x} is a vector or list and of length 0,
#'   \code{FALSE} otherwise.
#'
#' @note \code{NULL}- or \code{NA}-values are also considered as "empty" (see
#'   'Examples') and will return \code{TRUE}.
#'
#' @importFrom purrr compact
#' @export
is_empty <- function(x, first.only = TRUE) {
  # do we have a valid vector?
  if (!is.null(x)) {
    # if it's a character, check if we have only one element in that vector
    if (is.character(x)) {
      # characters may also be of length 0
      if (length(x) == 0) return(TRUE)
      # else, check all elements of x
      zero_len <- nchar(x) == 0
      # return result for multiple elements of character vector
      if (first.only) {
        zero_len <- .is_true(zero_len[1])
        if (length(x) > 0) x <- x[!is.na(x)][1]
      } else {
        return(unname(zero_len))
      }
      # we have a non-character vector here. check for length
    } else if (is.list(x)) {
      x <- purrr::compact(x)
      zero_len <- length(x) == 0
    } else {
      zero_len <- length(x) == 0
    }
  }
  any(is.null(x) || zero_len || all(is.na(x)))
}

#' from sjmisc
#' @importFrom dplyr quos select
get_dot_data <- function(x, qs) {
  if (is_empty(qs))
    x
  else
    suppressMessages(dplyr::select(x, !!!qs))
}