## =============================================================================
## Check Functions
##
## Functions for checking compatability with other functions, providing minor
## adjustments to ease processing with other functions, and producing
## descriptive messages to diagnose and prevent internal errors.
## =============================================================================
## =============================================================================
#' Check a Data Frame Prior to Analysis
#'
#' Original name: sEddyProc_initialize
#'
#' @param site_name A string with the site identifier.
#' @param data A data frame with at least three month of (half-)hourly
#' site-level data.
#' @param vars An atomic vector of strings with selected column names. Tip: only
#' select columns that are used in processing. Less columns = faster
#' processing.
#' @param timestamp A string indicating the column name with POSIX timestamp.
#' @param dts An integer indicating the number of daily time steps (24 or 48).
#' @param char_cols Names of columns that should not be checked for numeric
#' type, e.g. season column.
#' @param lat An integer indicating the site latitude in decimal degrees (-90 to
#' +90).
#' @param long An integer indicating the site longitude in decimal degrees (-90
#' to +90).
#' @param tz An integer indicating the site time zone in offset from UTC, e.g.
#' -5 for U.S. Eastern time.
#'
#' @details
#' The timestamp must be provided in POSIX format, see also
#' \code{\link{convert_time}}. For required properties of the time series, see
#' \code{\link{check_times}}. Internally the half-hour timestamp is shifted to
#' the middle of the measurement period (minus 15 minutes or 30 minutes).
#'
#' All other columns may only contain numeric data. Please use NA as a gap flag
#' for missing data or low quality data not to be used in the processing. The
#' columns are also checked for plausibility with warnings if outside range.
#'
#' There are several attributes set within the data frame:
#' * site_name String for the site ID.
#' * total_records Number of data rows.
#' * daily_time_steps Number of daily time steps (24 or 48).
#' * start_year
#' * start_date
#' * end_year
#' * end_date
#' * latitude
#' * longitude
#' * time_zone
#'
#' @export
check_data <- function(data, site_name, vars, timestamp = "timestamp", dts = 48,
char_cols = character(0), lat = NA, long = NA, tz = NA) {
# Check entries
if (!check_value(site_name, "str") || is.na(site_name)) {
stop("site_name must be a character string.", call. = FALSE)
}
if (!is.na(lat) & (lat < -90 | lat > 90)) {
stop("lat must be in interval -90 to +90.", call. = FALSE)
}
if (!is.na(long) & (long < -180 | long > 180)) {
stop("long must be in interval -180 to +180.", call. = FALSE)
}
if (!is.na(tz) & (tz < -12 | tz > +12 | tz != as.integer(tz))) {
stop("tz must be an integer in interval -12 to +12.", call. = FALSE)
}
check_names(data, c(timestamp, vars))
# Check timestamp
check_times(data[, timestamp], dts = dts)
# Half-period time offset in seconds
time <- data[, timestamp] - (0.5 * 24 / dts * 60 * 60)
check_numeric(data, setdiff(vars, char_cols))
check_plausible(data, vars)
attr(data, "site_name") <- site_name
data <- cbind(timestamp = time, data[, vars, drop = FALSE])
# sTEMP <<- data.frame(timestamp = time)
# Initialize site data information from POSIX time stamp
attr(data, "total_records") <- length(data$timestamp)
attr(data, "daily_time_steps") <- dts
attr(data, "start_year") <- format(data$timestamp[1], "%Y")
attr(data, "start_date") <- format(data$timestamp[1], "%Y-%m-%d")
attr(data, "end_year") <- format(data$timestamp[length(data$timestamp)], "%Y")
attr(data, "end_date") <- format(data$timestamp[length(data$timestamp)],
"%Y-%m-%d")
attr(data, "latitude") <- lat
attr(data, "longitude") <- long
attr(data, "time_zone") <- tz
# Initialize class fields
message(
"Data successfully checked for site: ", site_name, ". \n",
"Metadata stored in data frame attributes."
)
return(data)
}
## =============================================================================
#' Check Function Inputs (Internal)
#'
#'
#' @param x
#' @param type
#' @param ...
#'
#' @details
#' Possible values for \code{type} (i.e. checking methods):
#' \code{c("all_finite", "atomic", "data_frame", "character", "length_1",
#' "length_2", "lengths", "logical", "no_dims", "numeric", "numeric_na",
#' "numeric_val", "posixt", "vector")}
#'
#' @return
#' @export
#'
#' @examples
#' @keywords internal
check_input <- function(x, type, y = NULL, action = c("stop", "warning"), ...) {
# TODO rewrite so that conditionals produce the messages only and stop or
# warning is called after everything is checked
action <- match.arg(action)
if ("all_finite" %in% type) {
if (anyNA(x)) {
stop(substitute(x), " must contain only finite values.", call. = FALSE)
}
}
if ("atomic" %in% type) {
if (!is.atomic(x)) {
stop(substitute(x), " must be of type 'atomic'.", call. = FALSE)
}
}
if ("data_frame" %in% type) {
if (!is.data.frame(x) || is.null(colnames(x))) {
stop(
substitute(x), " must be of class 'data.frame' with colnames.",
call. = FALSE
)
}
}
if ("character" %in% type) {
if (!is.character(x)) {
stop(substitute(x), " must be of class 'character'.", call. = FALSE)
}
}
if ("length_1" %in% type) {
if (length(x) != 1) {
stop(substitute(x), " must be of length 1.", call. = FALSE)
}
}
if ("length_2" %in% type) {
if (length(x) != 2) {
stop(substitute(x), " must be of length 2.", call. = FALSE)
}
}
if ("lengths" %in% type) {
if (length(x) != length(y) & length(x) > 1) {
stop(
substitute(x), " must have the same length as ", substitute(y),
" or a length of 1.", call. = FALSE
)
}
}
if ("logical" %in% type) {
if (!is.logical(x)) {
stop(substitute(x), " must be of type 'logical'.", call. = FALSE)
}
}
if ("no_dims" %in% type) {
if (!is.null(dim(x))) {
stop("'dim(", substitute(x), ")' must be NULL.", call. = FALSE)
}
}
if ("numeric" %in% type) {
if (!is.numeric(x)) {
stop(substitute(x), " must be of class 'numeric'.", call. = FALSE)
}
}
if ("numeric_na" %in% type) {
if (!is.null(x) && !is.numeric(x) && !all(is.na(x))) {
stop(substitute(x), " must contain numeric or NA values.", call. = FALSE)
}
}
if ("numeric_val" %in% type) {
if (!is.numeric(x) || length(x) != 1 || is.na(x)) {
stop(
substitute(x), " must be a non-missing numeric value.", call. = FALSE
)
}
}
if ("posixt" %in% type) {
if (!inherits(x, "POSIXt")) {
stop(substitute(x), " must be of class 'POSIXt'.", call. = FALSE)
}
}
if ("vector" %in% type) {
if (length(x) == 0) {
stop(
substitute(x), " must be a vector of non-zero length.", call. = FALSE
)
}
}
}
check_units <- function(x, type) {
units <- tidyflux::units(x)
if (units != type) {
warning(substitute(x), " units should be in ", type, ".", call. = FALSE)
}
}
## =============================================================================
#' Test Variables for Equal Length
#'
#' @param var_list List of variables for which the length has to be compared.
#'
#' @note This function only plays a role if no input data.frame or matrix are
#' provided. In this case it ensures that provided vectors have the same
#' length to avoid trouble later up the function call.
#'
#' @return
#'
#' @keywords internal
check_length <- function(var_list) {
if (is.list(unlist(var_list, recursive = FALSE))) {
var_list <- unlist(var_list, recursive = FALSE)
}
length.vars <- sapply(var_list, length)
length.vars <- length.vars[length.vars > 0]
if (length(unique(length.vars)) >= 2) {
if (sort(unique(length.vars))[1] != 1 | length(unique(length.vars)) > 2) {
stop(
"All input variables must have the same length or a length of 1.",
call. = FALSE
)
}
}
return(var_list)
}
## =============================================================================
#' Check Plausibility of Common Eddy Variables
#'
#' Check plausibility of common eddy variables.
#' Adapted from: REddyProc - fCheckColPlausibility.
#'
#' @param data A data frame.
#' @param vars Variable (column) names.
#'
#' @details
#' The following abbreviated variables are checked for plausibility:
#' * Rg Global radiation (W m-2).
#' * pot_rad Potential global radiation (W m-2).
#' * PPFD Photosynthetic active radiation (umol m-2 s-1).
#' * PAR Photosynthetic active radiation, (umol m-2 s-1).
#' * Tair Air temperature (degrees Celsius).
#' * Tsoil Soil temperature (degrees Celsius).
#' * VPD Vapour pressure deficit (hPa)
#' * RH Relative humidity (%)
#' * NEE Net Ecosystem Exchange (umol CO2 m-2 s-1 or g C m-2 day-1)
#' * ustar Friction velocity (m s-1)
#' * E_0 (deg K)
#' * air_pressure (Pa)
#' * Rn Net radiation (W m-2)
#' * P Precipitation (m)
#' * G Soil heat flux (W m-2)
#' * SWC Soil water content (mg mg-1)
#'
#' @return Warnings are produced if a variable is outside plausible range.
#'
check_plausible <- function(data, vars, return_pos = FALSE) {
# Check column names
check_names(data, vars)
# Separated checks for upper and lower limit to have separate warnings
# Create index of plausability limits
index <- list(
"Rg" = c(0, 1200), "pot_rad" = c(0, 3000), "PPFD" = c(0, 2200),
"PAR" = c(0, 2500), "Tair" = c(-50, 50), "Tsoil" = c(-20, 50),
"VPD" = c(0, 50), "RH" = c(0, 100), "NEE" = c(-50, 100),
"ustar" = c(-1, 50), "E_0" = c(0, 600), "air_pressure" = c(70000, 120000),
"Rn" = c(-200, 1000), "P" = c(0, 0.05), "G" = c(-110, 220),
"SWC" = c(0, 0.7)
)
vars <- vars[which(vars %in% names(index))]
out_list <- list()
for (i in seq_along(vars)) {
var <- vars[i]
range <- index[[var]]
if (is.null(range)) next
low <- check_range(data, var, c("<", range[1]), return_pos = return_pos)
high <- check_range(data, var, c(">", range[2]), return_pos = return_pos)
if (return_pos) {
out <- unique(c(which(low == TRUE), which(high == TRUE)))
out_list[[var]] <- out
}
}
if (return_pos) {
return(out_list)
} else {
message("All variables are within plausible range.")
}
}
## =============================================================================
#' Check Half-hourly Time Series Data
#'
#' Check half-hourly time series data conformity with a one-year regular
#' sequence.
#' Adapted from: REddyProc - fCheckHHTimeSeries.
#'
#' @param timestamp A time vector in POSIX format.
#' @param dts The number of daily time steps (24 or 48).
#'
#' @details
#' The number of steps per day can be 24 (hourly) or 48 (half-hourly).
#' The timestamp must be provided in POSIX time format, with equidistant
#' half-hours and stamped on the half hour. Many fluxr procedures require at
#' least three months of data. Full days of data are preferred: the total amount
#' of data rows should be a multiple of the daily time step, and in accordance
#' with FLUXNET standards, the dataset is spanning from the end of the first
#' (half-)hour (0:30 or 1:00, respectively) and to midnight (0:00).
#'
#' @return Produces warnings and stops on errors.
#'
#' @export
check_times <- function(timestamp, dts = 48) {
if (dts != 24 && dts != 48) {
stop(
"Daily time step must be hourly (24) or half-hourly (48). The following
value is not valid: ", dts, ".",
call. = FALSE)
}
if (!inherits(timestamp, "POSIXt")) {
stop("Timestamp data not in POSIX format.", call. = FALSE)
}
len <- length(timestamp)
dist <- 24 / dts * 3600
dist1 <- as.numeric(timestamp[2:len])
dist2 <- as.numeric(timestamp[1:len - 1])
not_dist <- (dist1 - dist2) != dist
not_dists <- sum(not_dist)
if (not_dist > 0) {
stop(
"Timestamp must contain equidistant half-hours: \n",
"Check records ", paste(which(not_dist), collapse = ", "), ".",
call. = FALSE
)
}
not_hh <- sum(as.numeric(timestamp) %% dist != 0)
if (not_hh > 0) {
stop(
"Timestamp must be at half-hours: \n",
"Check records ", which(as.numeric(timestamp) %% dist != 0), ".",
call. = FALSE
)
}
if (len < (90 * dts)) {
stop(
"Time series must contain at least 90 days of data: \n",
"Data provided is missing ", 90 - len / dts, "days.",
call. = FALSE
)
}
resid <- len %% dts
if (resid != 0) {
warning(
"Data should be provided in multiple of daily time step: \n",
"One day only has ", resid, " records.",
call. = FALSE
)
}
hour1 <- as.POSIXlt(timestamp[1])$hour
min1 <- as.POSIXlt(timestamp)$min
hm_format <- format(timestamp[1], "%H:%M")
if (dts == 48 && !(hour1 == 0 && min1 == 30)) {
warning(
"Timestamp of first record not at end of first half-hour: ", hm_format,
" instead of 00:30.",
call. = FALSE
)
}
if (dts == 24 && !(hour1 == 1 && min1 == 00)) {
warning(
"Timestamp of first record not at end of first hour: ", hm_format,
" instead of 01:00.",
call. = FALSE
)
}
last <- as.POSIXlt(timestamp[len])
if (!(last$hour == 0 && last$min == 0)) {
warning("Last timestamp is not midnight: 0:00.", call. = FALSE)
}
}
## =============================================================================
#' Check if Variable is Numeric or Character String
#'
#' Check if variable is numeric or a non-empty character string.
#' Adapted from: REddyProc - fCheckValString & fCheckValNum.
#'
#' @param x A value to be checked (can also be NA of any type).
#' @param val The type of variable to check against \code{x}. Can be numeric
#' "num" or string "str".
#'
#' @return Boolean value if TRUE or FALSE.
#'
check_value <- function(x, val = c("num", "str")) {
val <- match.arg(val)
if (val == "num") {
if (length(x) == 0) {
out <- FALSE
} else if (!is.na(x) && !is.numeric(x)) {
out <- FALSE
} else {
out <- TRUE
}
} else if (val == "str") {
if (length(x) == 0) {
out <- FALSE
} else if (!is.na(x) && (!is.character(x) || !nzchar(x))) {
out <- FALSE
} else {
out <- TRUE
}
}
return(out)
}
## =============================================================================
#' Check Variable Against Specified Range
#'
#' Check if specified variable is outside of provided boundaries.
#' Original name: fCheckOutsideRange.
#'
#' @param data A data frame.
#' @param var A variable (column) name.
#' @param cond A logical condition for outside values (see details).
#' @param return_pos Should a vector cases outside the range also be returned?
#'
#' @details
#' Example of condition structure: c(' <= ', 0) or c(' <= ', 0, '|', '>', 20)
#' Allowed relational operators: < <= == >= > !=
#' Allowed logical operators: & |
#'
#' @return Warnings are produced if a variable is outside specified range. If
#' return_pos = TRUE, a vector containing the positions of cases that are
#' outside the range is also returned.
#'
check_range <- function(data, var, cond, return_pos = FALSE) {
check_names(data, var)
check_numeric(data, var)
x <- data[, var]
# Check condition
conds <- c("<", " <= ", " == ", " >= ", ">", " != ")
cond_text <-
if (length(cond) == 2 && cond[1] %in% conds && nzchar(cond[2])) {
# One condition
paste("x ", cond[1], " ", cond[2], " & !is.na(x)", sep = "")
} else if (length(cond) == 5 && all(cond[c(1, 4)] %in% conds) &&
all(nzchar(cond[2]), nzchar(cond[5])) &&
(cond[3] %in% c("|", "&"))) {
# Two conditions
paste("(x ", cond[1], " ", cond[2], " ", cond[3], " x ", cond[4],
" ", cond[5], ") & !is.na(x)",
sep = ""
)
} else {
stop(
"Incorrect condition definition: ", paste(cond, collapse = " "),
call. = FALSE
)
}
# Warning message
out_b <- eval(parse(text = cond_text))
out_n <- sum(out_b)
if (out_n > 0) {
warning(
"Variable outside range in ", out_n,
" cases. Invalid values with ", var, " ",
paste(cond, collapse = " "), ": ",
paste(format(x[out_b][1:(min(out_n, 50))], digits = 2),
collapse = ", "
), " ...",
call. = FALSE
)
}
if (return_pos) {
return(out_b)
} else return(invisible(NULL))
}
## =============================================================================
#' Check if Column Exists in Data Frame
#'
#' Check if specified columns exist in data frame. Original name: fCheckColNames
#'
#' @param data A data frame.
#' @param colnames A vector of variables to be checked.
#'
#' @return Stops on errors.
#'
check_names <- function(data, colnames = "none") {
# Exclude dummy "none"
none <- colnames %in% "none"
# Check if specified columns exist in data frame
names <- colnames[!none] %in% colnames(data)
if (!all(names)) {
colnames <- paste(colnames[!none][!names], collapse = ", ", sep = "")
stop(
"Missing specified columns in dataset:",
colnames,
call. = FALSE
)
}
}
## =============================================================================
#' Check if Column is Numeric
#'
#' Check if specified columns are numeric. Original name: fCheckColNum.
#'
#' @param data A data frame.
#' @param colnames A vector of variables to be checked, with "none" as dummy.
#'
#' @return Stops on errors.
#'
check_numeric <- function(data, colnames = "none") {
# Exclude dummy "none"
nonecols <- colnames %in% "none"
# Check if specified columns are numeric
numcols <- sapply(data[, colnames[!nonecols]], is.numeric)
if (!all(numcols) ) {
colnames <- paste(colnames[!nonecols][!numcols], collapse = ", ")
stop("The following columns in dataset are non-numeric: ", colnames)
}
}
## =============================================================================
#' Check whether string, list or vector is empty
#'
#' This function checks whether a string or character vector (of length 1), a
#' list or any vector (numeric, atomic) is empty or not.
#' From sjmisc.
#'
#' @param x String, character vector, list, data.frame or numeric vector or
#' factor.
#' @param first.only Logical, if \code{FALSE} and \code{x} is a character
#' vector, each element of \code{x} will be checked if empty. If \code{TRUE},
#' only the first element of \code{x} will be checked.
#' @return Logical, \code{TRUE} if \code{x} is a character vector or string and
#' is empty, \code{TRUE} if \code{x} is a vector or list and of length 0,
#' \code{FALSE} otherwise.
#'
#' @note \code{NULL}- or \code{NA}-values are also considered as "empty" (see
#' 'Examples') and will return \code{TRUE}.
#'
#' @importFrom purrr compact
#' @export
is_empty <- function(x, first.only = TRUE) {
# do we have a valid vector?
if (!is.null(x)) {
# if it's a character, check if we have only one element in that vector
if (is.character(x)) {
# characters may also be of length 0
if (length(x) == 0) return(TRUE)
# else, check all elements of x
zero_len <- nchar(x) == 0
# return result for multiple elements of character vector
if (first.only) {
zero_len <- .is_true(zero_len[1])
if (length(x) > 0) x <- x[!is.na(x)][1]
} else {
return(unname(zero_len))
}
# we have a non-character vector here. check for length
} else if (is.list(x)) {
x <- purrr::compact(x)
zero_len <- length(x) == 0
} else {
zero_len <- length(x) == 0
}
}
any(is.null(x) || zero_len || all(is.na(x)))
}
#' from sjmisc
#' @importFrom dplyr quos select
get_dot_data <- function(x, qs) {
if (is_empty(qs))
x
else
suppressMessages(dplyr::select(x, !!!qs))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.