R/data_management.R

## =============================================================================
#' Set Up Folder Structure
#'
#' Folder structure recommended for eddy covariance data processing and
#' archivation.
#'
#' The purpose is to standardize the locations for metadata and post-processing
#' inputs required to run the proposed workflow as well as to store data and
#' metadata in levels corresponding to processing stage. The folder structure is
#' not required to succesfully apply the workflow but simplifies its use.
#' Adapted from: openeddy - structure_dirs
#'
#' Data processing stages:
#' \itemize{
#' \item Level 0: Raw files with measured high frequency eddy covariance data
#'   and relevant metadata or instrument setup files.
#' \item Level 1: Processing software setup and output files and a logbook.
#' \item Level 2: Quality checking results and documentation, definition of
#'   ecosystem boundary, storage flux processing and files used as inputs for
#'   Level 3 data.
#' \item Level 3: gap-filling output and its documentation, summary of the
#'   computed fluxes and meteorological data including their aggregation.
#' }
#'
#' @param root A character string defining the root of created folder structure.
#' @param create_dirs A logical value. Indicates whether directories should be
#'  created.
#' @param ... Further arguments to be passed to \code{dir.create} function.
#'
#' @return A named list with paths to folder structure directories.
#'  Corresponding directories are created as a function side effect if
#'  \code{create_dirs = TRUE} (default).
#'
#' @export
structure_dirs <- function(root = ".", create_dirs = FALSE, ...) {
  # With dir.create(recursive = TRUE, ...) all paths not needed to create dirs
  # but needed in order to make the dir accessible with path in the list
  l <- list(
    logbook = "/logbook/log.txt",
    level0 = "/level_0/",
    eddypro_setup = "/level_0/eddypro_setup/",
    site_setup = "/level_0/site_setup/",
    raw_data = "/level_0/raw_data/",
    level1 = "/level_1/",
    post_proc = "/level_1/post_processing/",
    eddypro_output = "/level_1/post_processing/eddypro_output_data/",
    combined_output = "/level_1/post_processing/combined_output_data/",
    prelim_data = "/level_1/post_processing/prelim_data/",
    prelim_reports = "/level_1/prelim_reports/",
    qc_input = "/level_1/qc_input_data/",
    footprint = "/level_1/footprint/",
    fp_halfhour = "/level_1/footprint/halfhourly/",
    fp_grid = "/level_1/footprint/grid/",
    fp_average = "/level_1/footprint/average/",
    level2 = "/level_2/",
    quality_control = "/level_2/quality_control/",
    precheck = "/level_2/quality_control/precheck/",
    qc_output = "/level_2/quality_control/qc_output_data/",
    qc_flags = "/level_2/quality_control/qc_flags/",
    qc_reports = "/level_2/quality_control/qc_reports/",
    fetch = "/level_2/fetch_filter/",
    storage = "/level_2/storage_flux/",
    gf_input = "/level_2/gf_input_data/",
    level3 = "/level_3/",
    gap_fill = "/level_3/gap_fill/",
    gf_output = "/level_3/gap_fill/gf_output_data",
    gf_plots = "/level_3/gap_fill/plots/",
    gf_reports = "/level_3/gap_fill/gf_reports/",
    ustar_filter = "/level_3/ustar_filter/",
    summary = "/level_3/summary/",
    summary_plots = "/level_3/summary/plots/")
  l <- lapply(l, function(x) paste0(root, x))
  if (create_dirs) invisible(lapply(l, dir.create, recursive = TRUE, ...))
  l
}

## =============================================================================
#' Initialize processing logbook
#'
init_log <- function(path, site, year) {
  file_path <- paste0(path, "/", site, "_", year, "_processing_log.txt")
  if (file.exists(file_path)) {
    stop(paste(
      "A logbook already exists in this location.",
      "To create a new logbook, the current logbook must be manually deleted."
    ), call. = FALSE)
  }
  init <- c(
    paste("Logbook for", site, year, "dataset."),
    paste("Created:", Sys.time()),
    ""
  )
  writeLines(init, file_name, sep = "\n")
}

## =============================================================================
#' Write to processing logbook
#'
#' @param script
#' @param output
#' @param docu
#' @param notes
#' @param path
write_log <- function(script, output, docu, notes, path = paths$logbook) {
  entry <- c()
  if (!missing(script)) {
    entry <- c(
      entry, paste0("Ran script: ", basename(script)),
      "Full script path:", script
    )
  }
  if (!missing(output)) {
    # Add numbers to strings
    for (i in 1:length(output)) {
      output[i] <- paste0("[", i, "] ", output[i])
    }
    entry <- c(entry,  "Output files created:", output)
  }
  if (!missing(docu)) {
    # Add numbers to strings
    for (i in 1:length(docu)) {
      docu[i] <- paste0("[", i, "] ", docu[i])
    }
    entry <- c(entry, "With documentation:", docu)
  }
  if (!missing(notes)) {
    entry <- c(entry, "Notes:", notes)
  }
  # Get logbook
  log <- readLines(path)
  # Amend logbook with new entry, save back in original location
  writeLines(c(log, "", as.character(Sys.time()), entry), path, sep = "\n")
  cat("Logged entry for ", as.character(Sys.time()), ".", sep = "")
}

## =============================================================================
#' (Internal) Find Rows or Columns with Only NAs Over Array Margins
#'
#' \code{all_na} returns a logical vector or array or list indicating whether
#' there are only \code{NA} values in selected margins and therefore e.g.
#' statistics like \code{max} or \code{min} do not produce useful results.
#'
#' @param x An array, including a matrix.
#' @param margin A vector giving the subscripts which the function will be
#'   applied over. E.g., for a matrix \code{1} indicates rows, \code{2}
#'   indicates columns, \code{c(1, 2)} indicates rows and columns. Where
#'   \code{x} has named dimnames, it can be a character vector selecting
#'  dimension names.
#'
#' @family NA handlers
#'
#' @seealso \code{\link{NA}} for general information about NAs and
#'   \code{\link{apply}} for \code{apply} description.
#'
#' @examples
#' xx <- matrix(1:20, nrow = 4)
#' xx[2, ] <- NA
#' all_na(xx, 2)  # All columns have at least one non-missing value
#' all_na(xx, 1)  # Second row has all values missing
#' ## Skip the all_na row in apply()
#' apply(xx[!all_na(xx, 1), ], 1, max, na.rm = TRUE)
all_na <- function(x, margin) {
  apply(x, margin, function(x) all(is.na(x)))
}

## =============================================================================
#' Object Attributes Varnames and Units
#'
#' \code{varnames} and \code{units} are useful attributes that can store
#' original variable names (\code{varnames}) and units of measurement
#' (\code{units}) of each column in a data frame or of an atomic type. These
#' attributes can be extracted or assigned by following functions.
#'
#' Functions check whether the extracted or assigned attributes contain elements
#' with \code{NULL}, \code{NA}, \code{""} values or if length of each element is
#' higher than \code{1}. In these cases, such elements are substituted with
#' \code{"-"}.
#'
#' @return
#' For \code{varnames} and \code{units}, a character vector.
#' For \code{varnames<-} and \code{units<-}, the updated object \code{x}.
#'
#' @param x A data frame or an atomic type.
#' @param names A logical value. Applies only in case of data frames. If
#'   \code{TRUE}, attributes are extracted with corresponding column names.
#' @param value An atomic type that represents \code{varnames} or \code{units}.
#'   The length must be \code{1} if \code{x} is an atomic type or equal to
#'   number of columns in \code{x} if \code{x} is a data frame.
#'
varnames <- function(x, names = FALSE) {
  if (is.data.frame(x)) {
    varnames <- lapply(x, attr, "varnames")
    varnames <- lapply(varnames, function(x) if (
      is.null(x) || x %in% c("", NA) || (length(x) != 1))
      "-" else as.character(x))
    varnames <- unlist(varnames, use.names = names)
    return(varnames)
  } else if (is.atomic(x)) {
    varnames <- attr(x, "varnames")
    varnames <- dplyr::if_else(
      is.null(varnames) || varnames %in% c("", NA) || (length(varnames != 1)),
      "-",
      as.character(varnames)
    )
    return(varnames)
  } else stop("'x' must be a data frame or an atomic type.")
}

#' @rdname varnames
`varnames<-` <- function(x, value) {
  check_input(value, "atomic")
  if (is.data.frame(x)) {
    len <- ncol(x)
    if (len != length(value)) {
      stop("Length of 'value' not equal to number of columns in 'x'.")
    }
    value <- lapply(value, function(x) if (
      is.null(x) || x %in% c("", NA) || (length(x) != 1))
      "-" else as.character(x))
    value <- unlist(value, use.names = FALSE)
    for (i in seq_len(len)) {
      attr(x[, i], "varnames") <- value[i]
    }
    return(x)
  } else if (is.atomic(x)) {
    if (length(value) != 1) {
      stop("length of 'value' must be 1 for atomic type 'x'")
    }
    value <- dplyr::if_else(
      is.null(value) || value %in% c("", NA),
      "-",
      as.character(value)
    )
    attr(x, "varnames") <- value
    return(x)
  } else stop("'x' must be a data frame or an atomic type.")
}

#' @rdname varnames
units <- function(x, names = FALSE) {
  if (is.data.frame(x)) {
    units <- lapply(x, attr, "units")
    units <- lapply(units, function(x) if (
      is.null(x) || x %in% c("", NA) || (length(x) != 1))
      "-" else as.character(x)
    )
    units <- unlist(units, use.names = names)
    return(units)
  } else if (is.atomic(x)) {
    units <- attr(x, "units")
    units <- dplyr::if_else(
      is.null(units) || units %in% c("", NA) || (length(units) != 1),
      "-",
      as.character(units)
    )
    return(units)
  } else stop("'x' must be a data frame or an atomic type.")
}

#' @rdname varnames
`units<-` <- function(x, value) {
  check_input(value, "atomic")
  if (is(x, "difftime")) {
    base::units(x) <- value
    return(x)
  }
  if (is.data.frame(x)) {
    len <- ncol(x)
    if (len != length(value)) {
      stop("Length of 'value' not equal to number of columns in 'x'")
    }
    value <- lapply(value, function(x) if (
      is.null(x) || x %in% c("", NA) || (length(x) != 1))
      "-" else as.character(x))
    value <- unlist(value, use.names = FALSE)
    for (i in seq_len(len)) {
      attr(x[, i], "units") <- value[i]
    }
    return(x)
  } else if (is.atomic(x)) {
    if (length(value) != 1) {
      stop("Length of 'value' must be 1 for atomic type 'x'.")
    }
    value <- dplyr::if_else(
      is.null(value) || value %in% c("", NA),
      "-",
      as.character(value)
    )
    attr(x, "units") <- value
    return(x)
  } else stop("'x' must be a data frame or an atomic type.")
}

## =============================================================================
#' Import Data with Units
#'
#' Reads tabular data from a file and represents them as data frame. Attributes
#' \code{varnames} (representing variable names) and \code{units} (representing
#' units of measurement or space efficient metadata) are assigned to each
#' column.
#'
#' \code{read_data} extends the possibilities of \code{\link{read.table}} so it
#' can also read units of measurement. However, it uses default arguments of
#' \code{\link{read.csv}} to accomodate loading of data for the most common
#' input type. \code{read_data} also sets useful defaults common for eddy
#' covariance (\emph{eddy}) data. Missing values are often reported as
#' \code{"-9999.0"} or \code{"-9999"} by post-processing software, therefore
#' \code{na.strings = c("NA", "-9999.0", "-9999")} is used as default.
#'
#' Attribute \code{varnames} contains original variable name without automated
#' corrections/simplifications. This provides control over conversion of
#' original column names and keeps variable names of vectors when they are
#' separated from the original data frame.
#'
#' Units are expected to be one line below the header in the input file. Instead
#' of units of measurement, it is possible to include any space efficient
#' metadata that is relevant to the respective variables. E.g. format of
#' timestamp or structure of coded variable. One line below units and further in
#' the input file is the region with data. Any missing values or blank fields
#' (converted to empty strings) in the line interpreted as units will be
#' substituted by \code{units_fill} string instead.
#'
#' The automated check for \code{"-10000"} values in the data region is provided
#' by \code{check_input = TRUE} (default) and produces error message if the
#' value is found. The \code{"-10000"} values can be introduced to the dataset
#' by rounding \code{"-9999"} values due to the incorrect file conversion or
#' data manipulation. Using  \code{check_input = FALSE} will skip the check
#' (this could improve performance for large input files).
#'
#' Adapted from: openeddy - read_eddy
#'
#' @return A data frame is produced with additional attributes \code{varnames}
#'   and \code{units} assigned to each respective column.
#'
#' @param file The file name with input data to be read. It can be a file name
#'   inside the current working directory, \emph{relative} or \emph{absolute}
#'   path or \code{\link{connection}}. See \code{\link{read.table}} for more
#'   detailed description. Connections to anonymous file or clipboard are not
#'   allowed. To read from clipboard use \code{"clipboard"} string instead of
#'   connection.
#' @param header A logical value indicating whether the names of variables are
#'   included as the first line of the input file. If \code{FALSE}, column names
#'   and variable names of attribute \code{varnames} will be automatically
#'   generated.
#' @param units A logical value indicating whether the units for respective
#'   variables are included one line above the data region in the input file. If
#'   \code{FALSE}, the \code{units} attribute of each column will be set to
#'   \code{units_fill} string representing missing values.
#' @param sep A character that separates the fields of input. Default separator
#'   for CSV files is \code{","}. See \code{\link{read.table}} for other
#'   options.
#' @param dec
#' @param quote A character string that contains the quoting characters.
#' @param units_fill A character string that represents missing value of
#'   \code{units} attribute.
#' @param nrows An integer specifying the maximum number of rows to read in.
#'   Negative and other invalid values are ignored.
#' @param skip An integer. The number of lines to skip in the input file before
#'   reading data.
#' @param na.strings A character vector of strings representing \code{NA} values
#'   in the input file. Blank fields are also considered to be missing values in
#'   logical, integer, numeric and complex fields.
#' @param fill
#' @param check_input A logical value that determines if values in the input
#'   will be checked for erroneous \code{"-10000"} value. If \code{TRUE}
#'   (default), any encountered \code{"-10000"} value in the data will trigger
#'   an error message.
#' @param correct A logical value that determines if units and varnames should
#'   undergo standard formatting corrections. Defaults to \code{TRUE} and prints
#'   a message if names have been properly corrected.
#' @param comment.char A character that is interpreted as comment or empty
#'   string to turn off this behaviour.
#' @param col_classes
#' @param tz
#' @param ... Further arguments passed to \code{read.table}
#'
#' @export
read_data <- function(file, header = TRUE, units = TRUE, sep = ",", dec = ".",
                      quote = "\"", units_fill = "-", nrows = -1, skip = 0,
                      na.strings = c("NA", "-9999.0", "-9999"), fill = TRUE,
                      check_input = TRUE, correct = TRUE, comment.char = "",
                      col_classes = NA, tz = metadata$tz_name, ...) {
  if (!missing(file) && inherits(file, "connection")) {
    if (summary(file)$description == "") {
      stop("Connection to anonymous file is not allowed.")
    }
    if (summary(file)$description == "clipboard") {
      stop("Connection to clipboard is not allowed; use 'clipboard' string.")
    }
  }
  read_header <- header
  units_fill <- as.character(units_fill)
  check_input(units_fill, "length_1")
  var_units <- read.table(
    file,
    header = read_header, sep = sep,
    quote = quote, dec = dec, na.strings = na.strings,
    colClasses = "character", nrows = 1, skip = skip,
    fill = fill, comment.char = comment.char, ...
  )
  if (header) {
    orig_varnames <- read.table(
      file,
      header = FALSE, sep = sep, quote = quote, dec = dec,
      na.strings = na.strings, colClasses = "character", nrows = 1,
      skip = skip, fill = fill, comment.char = comment.char, ...
    )
  }
  if (correct) {
    var_units <- var_units %>% correct("units")
    unit_names <- orig_varnames %>% unlist(use.names = FALSE) %>% correct()
    names(var_units) <- unit_names %>% make.unique(sep = "_")
    message(
      "Units and varnames re-formatted using default corrections. ",
      "See documentation of 'correct()' for details."
    )
  }
  if (header && units) {
    read_header <- FALSE
    skip <- skip + 2
  } else if (header | (!header && !units)) {
    var_units[] <- units_fill
  } else {
    skip <- 1 + skip
  }
  data <- read.table(
    file,
    header = read_header, sep = sep, quote = quote,
    dec = dec, na.strings = na.strings,
    colClasses = col_classes, nrows = nrows, skip = skip,
    fill = fill, comment.char = comment.char, ...
  )
  if (header && units) colnames(data) <- names(var_units)
  # Create timestamp from date and time columns if none exists
  orders <- c("Ymd HM", "mdy HM", "Ymd HMS", "mdy HMS")
  if (!"timestamp" %in% names(data)) {
    data <- data %>%
      dplyr::mutate(
        timestamp = lubridate::parse_date_time(paste(date, time), orders, tz)
      ) %>%
      dplyr::select(timestamp, dplyr::everything())
    orig_varnames <- c("timestamp", orig_varnames)
    var_units <- c("-", var_units)
  } else {
    data <- data %>%
      dplyr::mutate(
        timestamp = lubridate::parse_date_time(timestamp, orders, tz)
      ) %>%
      dplyr::select(timestamp, dplyr::everything())
  }
  if (units) var_units[var_units %in% c("", NA)] <- units_fill
  #browser()
  for (i in seq_len(ncol(data))) {
    #varnames(data[, i]) <- if (header) orig_varnames[[i]] else colnames(data)[i]
    #units(data[, i]) <- var_units[i]
    varnames(data[i]) <- if (header) orig_varnames[[i]] else colnames(data)[i]
    units(data[i]) <- var_units[[i]]
  }
  # Recode dates and times from timestamp
  data <- data %>%
    dplyr::mutate(
      date = lubridate::date(timestamp),
      time = format(timestamp, "%H:%M")
    )
  if (check_input) {
    check <- as.vector(data == -10000)
    if (any(check[!is.na(check)])) {
      message("Missing data incorrectly replaced by -10000 in the input.")
    }
  }
  if (!is(data$timestamp, "POSIXct") | anyNA(data$timestamp)) {
    warning(
      "Timestamp could not parse to POSIXct. ",
      "Check the format and convert without re-reading using ",
      "'strptime_eddy()'."
    )
  }
  return(data)
}

## =============================================================================
#' Combine Multiple EddyPro Output Files into One Dataset
#'
#' Takes files created by different EddyPro processing sessions and merges into
#' one dataset without dropping any columns.
#'
#' @return A data frame with columns bound by name each with the attributes
#'   'units' and 'varnames'.
#'
#' @param ... Zero or more objects of class data.frame that have been read using
#'   \code{\link{read_data}}.
#'
#' @export
combine_data <- function(..., fill = "-") {
  inputs <- rlang::list2(...)
  # Store full units and varnames attribute since they will be dropped
  units <- list()
  varnames <- list()
  for (i in seq_along(inputs)) {
    units[[i]] <- tidyflux::units(inputs[[i]], names = TRUE)
    varnames[[i]] <- varnames(inputs[[i]], names = TRUE)
  }
  names(units) <- rlang::ensyms(...)
  units_full <- units[[1]]
  names(varnames) <- rlang::ensyms(...)
  varnames_full <- varnames[[1]]
  for (i in 2:length(units)) {
    units_full <- suppressWarnings(dplyr::bind_rows(units_full, units[[i]]))
    varnames_full <- suppressWarnings(
      dplyr::bind_rows(varnames_full, varnames[[i]])
    )
  }
  # Replace NA units/varnames with "-"
  units_full <- units_full %>% replace(is.na(.), fill)
  varnames_full <- varnames_full %>% replace(is.na(.), fill)
  # Test for equality among input data frames in units/varnames of each variable
  for (i in seq_along(units_full)) {
    if (!length(unique(units_full[, i])) == 1) {
      warning(
        "Units of variable ", names(units_full[i]),
        " are not identical across all input data frames."
      )
    }
    if (!length(unique(varnames_full[, i])) == 1) {
      warning(
        "Varnames of variable ", names(varnames_full[i]),
        " are not identical across all input data frames."
      )
    }
  }
  units_full <- units_full %>% dplyr::slice(1) %>% unlist()
  varnames_full <- varnames_full %>% dplyr::slice(1) %>% unlist()
  #browser()
  # Bind all data frames together
  out <- suppressWarnings(dplyr::bind_rows(...))
  # Define helper function for getting a count of duplicate timestamps
  dupes <- function(data) {
    data %>%
      dplyr::add_count(timestamp) %>%
      dplyr::ungroup() %>%
      dplyr::summarize(n = sum(n > 1, na.rm = TRUE)) %>%
      dplyr::pull(n)
  }
  # If there are duplicate timestamps, choose which ones to keep
  if (dupes(out) > 0) {
    # First remove records that explicitly do not contain data
    if ("file_records" %in% names(out)) {
      out <- out %>%
        dplyr::filter(!is.na(file_records))
    } else if ("vin_1" %in% names(out)) {
      out <- out %>%
        dplyr::filter(!is.na(vin_1))
    }
    # For any remaining duplicates, keep the one with the most non-NA data
    if (dupes(out) > 0) {
      out <- out %>%
        dplyr::mutate(n_na = rowSums(is.na(.))) %>%
        dplyr::arrange(n_na) %>%
        dplyr::group_by(timestamp) %>%
        dplyr::slice(1) %>%
        dplyr::select(-n_na) %>%
        dplyr::ungroup()
      if (dupes(out) > 0) {
        # Ensure no duplicates are left
        dupes_left <- out %>%
          dplyr::add_count(timestamp) %>%
          dplyr::filter(n > 1) %>%
          dplyr::distinct(timestamp) %>%
          dplyr::pull(timestamp) %>%
          dplyr::ungroup()
        out <- out %>%
          dplyr::distinct(timestamp, .keep_all = TRUE)
        # Produce warning since this arbitrarily chooses which records to keep
        warning(
          "Duplicate records with timestamp ", dupes_left,
          "could not be removed using logical methods. ",
          "Kept records were chosen arbitrarily."
        )
      }
    }
  }
  # Assign units and varnames back to the combined data frame
  for (i in seq_len(ncol(out))) {
    varnames(out[[i]]) <- varnames_full[[i]]
    units(out[[i]]) <- units_full[[i]]
  }
  #tidyflux::units(full) <- units_full
  #varnames(full) <- varnames_full
  # Check if timestamp forms a regular sequence
  #check_times(full$timestamp)
  out
}

## =============================================================================
#' Export Eddy Covariance Data
#'
#' Facilitates printing object \code{x} also with its units of measurement (or
#' space efficient metadata) to a file or \code{\link{connection}}.
#'
#' \code{write_data} extends the possibilities of \code{write.table} so the
#' units of measurement can also be written. However, it uses default arguments
#' of \code{write.csv} to provide flexibility for the user and to accomodate for
#' the most common case. The character string \code{"-9999"} is typically used
#' to represent missing values in eddy covariance data.
#'
#' Storing \code{varnames} and \code{units} attributes is practical mostly
#' within data frames and vectors. Attribute \code{varnames} extracted from each
#' data frame column represents names of respective variables and its main
#' purpose is to keep variable names of isolated vectors. Attribute \code{units}
#' extracted from each column represents units of measurement (or space
#' efficient metadata) of respective variables that are written one line above
#' data region. If the \code{varnames} or \code{units} attribute of given column
#' is \code{NULL}, of length not equal to 1, or contains missing value or empty
#' string, it is not considered meaningful. In that case the default column name
#' produced by \code{\link{as.data.frame}} is used instead (considered only if
#' \code{x} is supplied as vector) and unit of measurement is substituted with
#' \code{units_fill} string. \code{units_fill} can be an empty string.
#'
#' Units of measurement are considered to be part of the output header and
#' therefore \code{col.names} and \code{quote} arguments have the effect on the
#' way they are written.
#'
#' Adapted from: openeddy - write_eddy
#'
#' @param x A data frame to be written with optional attributes \code{units} and
#'   \code{varnames} assigned to each column. If not a data frame it is
#'   converted by \code{\link{as.data.frame}}.
#' @param file A character string naming a file to write or a
#'   \code{\link{connection}} that is open for writing. \code{""} results in
#'   writing to the console.
#' @param append A logical value, used only if \code{file} is not a
#'   \code{connection}. If \code{TRUE}, output is written below the content of
#'   the file. If \code{FALSE}, the content of the file is overwritten.
#' @param quote A logical value (\code{TRUE} or \code{FALSE}) or a numeric
#'   vector. If \code{TRUE}, columns of class character or factor will be
#'   surrounded by double quotes. If a numeric vector, its elements should mark
#'   the indices of character or factor columns to quote. In both cases, row and
#'   column names and units are quoted if present. If \code{FALSE}, no quoting
#'   is performed.
#' @param sep A character used as the field separator of each row.
#' @param units_fill A character string representing missing values of
#'   \code{units} attribute in the output.
#' @param na A character string representing missing data values in the output.
#' @param col.names Either a logical value (\code{TRUE}, \code{FALSE} or
#'   \code{NA}) or a character vector. If \code{TRUE}, column names of \code{x}
#'   will be included in the output. If a character vector, its elements will be
#'   used as column names. If \code{x} is supplied as vector, an attempt is made
#'   to extract meaningful variable name from its attribute \code{varnames}.
#' @param ... Further arguments passed to \code{\link{write.table}}.
#'
#' @export
write_data <- function(x, file = "", append = FALSE, quote = TRUE, sep = ",",
                       include_units = TRUE, fill = "-", na = "-9999",
                       col.names = TRUE, use_varnames = FALSE, ...) {
  check_input(x, "data_frame")
  if (isTRUE(col.names) | is.na(col.names) | is.character(col.names)) {
    if (include_units) {
      fill <- as.character(fill)
      check_input(fill, "length_1")
      units <- lapply(x, attr, "units")
      units <- lapply(units, function(x) if (
        is.null(x) || x %in% c("", NA) || (length(x) != 1)) {
        fill
      } else {
        as.character(x)
      })
      header <- as.data.frame(units, stringsAsFactors = FALSE)
    } else header <- x
    if (use_varnames) {
      if (any(varnames(x) %in% c("-", "--"))) {
        nonames <- which(varnames(x) %in% c("-", "--"))
        varnames(x)[nonames] <- names(x)[nonames]
      }
      names(header) <- varnames(x)
    }
    q_header <- if_else(isTRUE(quote) | is.numeric(quote), TRUE, quote)
    if (append) warning("Appending column names to file.")
    write.table(
      header, file, append = append, quote = q_header, sep = sep, na = na,
      row.names = FALSE, col.names = col.names, qmethod = "double", ...
    )
    append <- TRUE
  }
  if (include_units) {
    write.table(
      x, file, append = append, quote = quote, sep = sep, na = na,
      row.names = FALSE, col.names = FALSE, qmethod = "double", ...
    )
  }
}

## =============================================================================
#' Apply Storage Flux Correction
#'
#' Correction of matter or energy flux (\code{flux}) with storage computed using
#' discrete (one point) approach (\code{st}) or profile measurement of CO2
#' concentration (\code{stp}).
#'
#' If both storage estimates are available, \code{stp} takes priority. If both
#' \code{st} and \code{stp} estimates are \code{NA}, original flux value is
#' kept. \code{flux}, \code{st} and \code{stp} (if not NULL) must have the same
#' length.
#'
#' Adapted from: openeddy - add_st
#'
#' @return A vector with attributes \code{varnames} and \code{units} is
#'   produced. \code{varnames} value is set by \code{name_out} argument.
#'   \code{units} value is extracted from \code{flux} vector by
#'   \code{\link{units}} or set to default \code{"-"}.
#'
#' @param flux A numeric vector with flux values.
#' @param st A numeric vector with storage computed using discrete (one point)
#'   approach.
#' @param stp A numeric vector with storage computed using profile measurement
#'   of gas concentration.
#'
#' @export
add_storage <- function(x, st, stp = NULL) {
  check_input(x, "numeric")
  check_input(st, "numeric")
  if (length(x) != length(st)) {
    stop("'x' and 'st' must be of the same length.")
  }
  if (!is.null(stp)) {
    if (length(x) != length(stp)) {
      stop("'stp' must be of the same length as 'x'.", call. = FALSE)
    }
    check_input(stp, "numeric")
  }
  out <- x
  out[!is.na(st)] <- x[!is.na(st)] + st[!is.na(st)]
  if (!is.null(stp)) {
    out[!is.na(stp)] <- x[!is.na(stp)] + stp[!is.na(stp)]
  }
  varnames(out) <- paste0(
    "add_storage(", attr(x, "varnames"), ", ", attr(st, "varnames"), ")"
  )
  tidyflux::units(out) <- tidyflux::units(x)
  return(out)
}

## =============================================================================
#' Expand Vector to Full Year
#'
#' Generates a vector with half-hourly time steps of full year, stamped in the
#' middle of time unit.
#' Adapted from: REddyProc - fExpandToFullYear
#'
#' @param x A data vector to be expanded.
#' @param timestamp The data timestamp in POSIX time format.
#' @param year Year (e.g. to plot).
#' @param DTS The number of daily time steps (i.e. 24 or 48).
#' @param tz Time zone used, advised to keep default.
#'
#' @details
#' Function to expand vectors to full year, e.g. to plot in correct time format.
#'
#' @return
#' The expanded time and data vector as a data frame.
#'
expand_year <- function(x, timestamp, year, dts, tz = get_tz(timestamp)) {
  year_exp <- as.numeric(format(timestamp, "%Y"))
  # Check if year within time span of data set
  if (sum(year == year_exp) == 0) {
    stop("Year ", year, " not within time span of this dataset.")
  }
  full_year <- create_timesteps(year, dts, tz = tz)
  time_year <- timestamp[(year == year_exp)]
  data_year <- x[(year == year_exp)]
  if (sum(!is.na(data_year)) == 0) {
    out <- data.frame(dplyr::bind_cols(
      timestamp = full_year,
      data = rep(NA, length(full_year))
    ))
    warning(
      "Variable '", attr(x, "varnames"), "' contains no data for year ",
      year, "."
    )
  } else if (length(time_year != length(full_year))) {
    out <- dplyr::inner_join(
      dplyr::bind_cols(timestamp = full_year),
      dplyr::bind_cols(timestamp = time_year, data = data_year),
      by = "timestamp", all = TRUE, sort = TRUE
    )
  } else {
    out <- data.frame(dplyr::bind_cols(timestamp = time_year, data = x))
  }
  out$timestamp <- .POSIXct(out$timestamp, tz = tz)
  attr(out$data, "varnames") <- attr(x, "varnames")
  attr(out$data, "units") <- attr(x, "units")
  return(out)
}

## =============================================================================
#' Generate Year-long Half-hourly Time Step Vector
#'
#' Generate vector with half-hourly time steps of full year, stamped in the
#' center of time unit.
#' Adapted from: REddyProc - fFullYearTimeSteps
#'
#' @param data A data frame to be converted.
#' @param DTS The number of daily time steps (24 or 48).
#' @param tz The time zone used to store data (advised to keep GMT to avoid
#'   daytime shifting issues).
#'
#' @return
#' A vector with time steps of full year in POSIX format.
#'
create_timesteps <- function(year, dts = 48, tz = "GMT", shift_by = 720 / dts) {
  if (!dts %in% c(24, 48)) {
    stop("Only implemented for 24 or 48 daily time steps, not ", dts, ".")
  }
  format <- "%Y-%m-%d-%H-%M"
  start <- paste(year, 1, 1, 0, shift_by, sep = "-")
  end <- paste(year + 1, 1, 1, 0, 30 - shift_by, sep = "-")
  # timestamp vector with half-hourly timestamps
  out <- seq(
    strptime(start, format, tz),
    strptime(end, format, tz),
    (24 / dts * 60 * 60)
  )
  return(out)
}

## =============================================================================
#' Set File Name with Path
#'
#' Set file name with path and check if directory and/or file exists.
#' Adapted from: REddyProc - fSetFile
#'
#' @param file_name File name as a string.
#' @param dir Directory as a string.
#' @param IO Input / output flag, TRUE for input, FALSE for output.
#'
#' @return
#' The name of the file with its complete path.
#'
set_file <- function(file_name, dir, IO) {
  str <- check_value(dir, "str")
  # Check if directory exists
  if (IO && str && (file.access(dir, mode = 4) != 0)) {
    stop("Directory does not exist: ", dir)
  }
  # Make directory if mode is output
  if (!IO && str && (file.access(dir, mode = 0) != 0)) {
    dir.create(dir)
    message("Directory created: ", dir)
    if (file.access(dir, mode = 2) != 0) {
      stop("Directory could not be created: ", dir)
    }
  }
  # Set file name accordingly
  file <- if (str) file.path(dir, file_name) else file_name
  # If input file, check if file exists
  if (IO && (file.access(file, mode = 4) != 0)) {
    stop("File does not exist or has no read permission: ", file)
  }
  return(file)
}

## =============================================================================
#' Create dataset with daily means/sums
#'
#' @param data
#' @param timestamp
#' @param mean_vars
#' @param sum_vars
#' @param omit_vars
#'
#' @return
#' @export
#'
#' @examples
summarize_daily <- function(data, timestamp = timestamp, mean_vars = NULL,
                            sum_vars = NULL, omit_vars = NULL) {
  timestamp <- rlang::enquo(timestamp)
  if (!missing(mean_vars)) mean_vars <- rlang::enquo(mean_vars)
  if (!missing(sum_vars)) sum_vars <- rlang::enquo(sum_vars)
  if (!missing(omit_vars)) omit_vars <- rlang::enquo(omit_vars)
  all_units <- tidyflux::units(data, "names")
  data <- data %>%
    dplyr::mutate(
      !!rlang::as_name(timestamp) := !!timestamp - 900,
      date = lubridate::date(!!timestamp)
    ) %>%
    dplyr::select(-!!timestamp) %>%
    {if (!is.null(omit_vars)) {
      dplyr::select(., -!!omit_vars)
    } else .}
  means <- data %>%
    {if (!is.null(mean_vars)) {
      dplyr::select(., date, !!mean_vars)
    } else .} %>%
    {if (!is.null(sum_vars)) {
      dplyr::select(., -!!sum_vars)
    } else .} %>%
    dplyr::group_by(date) %>%
    dplyr::summarize_all(list(mean), na.rm = TRUE)
  sums <- data %>%
    {if (!is.null(sum_vars)) {
      dplyr::select(., date, !!sum_vars)
    } else .} %>%
    dplyr::group_by(date) %>%
    dplyr::summarize_all(list(sum), na.rm = TRUE)
  #tidyflux::units(sums) <- sums_units
  #varnames(sums) <- paste0("sum(", names(sums), ")")
  dplyr::full_join(means, sums)
}
grahamstewart12/tidyflux documentation built on June 4, 2019, 7:44 a.m.