sapflux: Analyze Sapflux Traces

#' Prints summary data and plots small segments of sapflux or VWC data.
#'
#' @description
#' This function rapidly reads files and provides information that assists
#' in sensor troubleshooting. Data that is out-of-range for what is reasonable
#' for the measurement is removed, and tallies are made of all columns in the
#' data that have no viable data at all. Plots are printed for further visual
#' examination - some non-viable data will be left within the range of normal
#' data, even if sensors are totally broken.
#'
#' @param file            Filename, as a character.
#' @param filetype        Type of file imported. Can be "csv, "tab-delim",
#'                        or "campbell".
#' @param datatype        Sapflux or VWC data?
#' @param time.format     Format of the timestamp column (column 1!)
#' @param timeframe       How much of the most recent data to assess? In days.
#'
#' @details
#'
#' Ask B. McNellis for further details on this.
#'
#' @export
#' @examples
#' test <- CheckSensors(file = "PPINE_03_PSI_650.dat", datatype = "VWC")
CheckSensors <- function(file, filetype = "campbell", datatype = "sapflux",
                         time.format = "%Y-%m-%d %H:%M:%S",
                         timeframe = 14) {
  # Input validity checks
  cat("\n")
  cat("Input file:", file, "\n")
  cat("Data type:", datatype, "\n")
  stopifnot(
    length(file) == 1,
    class(file) == "character",
    filetype %in% c("campbell", "csv", "tab.delim"),
    class(time.format) == "character",
    length(time.format) == 1,
    datatype %in% c("sapflux", "VWC"),
    class(timeframe) == "numeric",
    length(timeframe) == 1
    )
  # Read in file, based on filetype. Copy/pasted from ImportRawFlux.R
  i <- 1
  file <- list(file)
  if (filetype[i] == "campbell") {
    row.skip <- 1
    file.sep <- ","
    file.header <- FALSE
    file.row.names <- 2
    header <- read.table(
      file = file[[i]], header = FALSE, sep = ",",
      nrow = 4, fill = TRUE, quote = "", colClasses = "character"
    )
    header <- apply(header, 2, function(x) {
      gsub(pattern = '"', replacement = "", x = x)
    })
  }
  if (filetype[i] == "csv") {
    row.skip <- 0
    file.sep <- ","
    file.header <- TRUE
    file.row.names <- NULL
  }
  if (filetype[i] == "tab.delim") {
    row.skip <- 0
    file.sep <- "\t"
    file.header <- TRUE
    file.row.names <- FALSE
  }
  data.check <- read.table(
    file[[i]], sep = file.sep,
    header = file.header, skip = row.skip,
    #row.names = file.row.names,
    na.strings = c(NA, NaN, "", " ", "  ", "   "),
    stringsAsFactors = FALSE
  )
  # Modified the row skipping for the campbell files in order to grab the
  # column names from the header. It's 4 for the sapflux function, 1 here...
  if (filetype == "campbell") {
    cols <- data.check[1, ]
    data.check <- data.check[-c(1:3), -2]
    cols <- cols[, 3:length(cols)]
    stopifnot(length(cols) == (ncol(data.check) - 1))
  }
  if (class(data.check[, 1]) != "character") {
    stop("First column needs to be timestamp")
  }
  if (table(sapply(data.check, class))["character"] > 1) {
    sub <- lapply(data.check[, 2:ncol(data.check)], as.numeric)
    sub <- lapply(sub, function(x) {
      ifelse(is.nan(x), NA, x)
    })
    data.check <- data.frame(data.check[, 1], sub, stringsAsFactors = FALSE)
    # Keep the stop in case the above process breaks for some reason.
    #stop(cat("Too many character columns in", file))
  }
  # Change time vector class
  time <- strptime(data.check[, 1], format = time.format)
  time <- as.POSIXct(time)
  if (length(time) < 1000) {
    indx <- length(time) - 1
  } else {
    indx <- 1000
  }
  intv.range <- vector(mode = "numeric", length = indx)
  for (i in 1:indx) {
    intv.i <- difftime(time[i + 1], time[i], units = "secs")
    intv.i <- as.numeric(intv.i)
    intv.range[i] <- intv.i
  }
  intv <- min(intv.range)
  total.intv <- difftime(time[length(time)], time[1])
  file.length <- round(as.numeric(total.intv), digits = 2)
  cat("Total file length:", file.length, units(total.intv), "\n")
  if (file.length < 10) {
    return(message("Not enough data to report on!"))
  }
  nrec <- (86400 * timeframe) / intv
  if (nrec > length(time)) {
    nrec <- length(time) - 1
  }
  time <- time[(length(time) - nrec):length(time)]
  data.check <- data.check[, 2:ncol(data.check)]
  data.check <- data.check[(nrow(data.check) - nrec):nrow(data.check), ]
  data.check <- abs(as.matrix(data.check))
  # Kill all the out-of-bounds data, depending on datatype
  if (datatype == "sapflux") {
    message("Data bounds on sapflux assumed to be in volts, not degrees C.")
    upper <- 2
    lower <- 0
    ylab <- "Voltages (V)"
  }
  if (datatype == "VWC") {
    message("VWC checking requires 'VWC' in data column names.")
    vwcs <- grep("VWC", cols)
    cat("Dropped", ncol(data.check) - length(vwcs), "columns.\n")
    data.check <- data.check[, vwcs]
    upper <- 1
    lower <- 0
    ylab <- "VWC, l H20/l soil"
  }
  data.check <- ifelse(data.check > upper, NA, data.check)
  data.check <- ifelse(data.check < lower, NA, data.check)
  empty <- apply(data.check, 2, function(x) {
    table(is.na(x))["TRUE"] == nrow(data.check)
  })
  empty <- which(empty == "TRUE")
  viabl <- ncol(data.check) - length(empty)
  cat("Viable columns:", viabl, "\n")
  if (viabl < 1) {
    return(message("No data to report on for this file!"))
  }
  cat("Columns", empty, "have no viable data.\n")
  not.empty <- apply(data.check, 2, function(x) {
    round(table(is.na(x))["FALSE"] / length(x), digits = 2)
  })
  cat("Fraction of each column with data:\n", as.numeric(not.empty), "\n")
  for (i in 1:ncol(data.check)) {
    empty <- table(is.na(data.check[, i]))
    if (!is.na(empty["TRUE"])) {
      if (as.numeric(empty["TRUE"]) == nrow(data.check)) {
        next
      }
    }
    plot(time, data.check[, i],
         ylab = ylab, xlab = "Time",
         main = paste(file, "\nColumn:", i))
  }
  return(data.frame(time, data.check))
}