sapflux: Analyze Sapflux Traces

#' @title Flux data import function. Also has options to pre-process data inputs.
#'
#' @description
#'
#' Imports flux data from multiple files, processes according to optional input
#' parameters, and ouputs an R object according to additional optional
#' parameters.
#'
#' See 'details' for detailed notes on parameter use, supported file and
#' datatypes, and function operation.
#'
#' @param files            Character vector of filenames.
#' @param time_format      Format of timestamp. See 'details'.
#' @param file_extensions  Format of imported data file. See 'details'.
#' @param data_structures  Data format. See 'details'.
#' @param minimum_interval Minimum file length, in minutes, to import.
#' @param verbose          Prints extra debugging info.
#'
#' @details
#'
#' \code{files}:
#'
#' A list of filenames, in the current working directory. Timestamp vector
#' must be present in each file, and must be the only non-numeric
#' column.
#'
#' The default, 'all', will search the current working directory
#' for all files that have supported file extensions
#' (.txt, .csv, and .dat) and import all appropriate files. While useful
#' for bulk file input, it's recommended to list the files out
#' individually if there are not too many to help sanitize data inputs.
#'
#' \code{datatypes}:
#'
#' Type of sapflux data, one per file. If length 1, is recycled
#' for each file in the input list. Otherwise, its length must
#' match the 'files' length exactly, and it is assumed that datatypes
#' are ordered to match the input file list as it would appear to
#' list.files().
#'
#' \code{time_format}:
#' Same as 'datatypes', but for timestamp format to be fed to
#' 'strptime()' and 'as.POSIXct()'.
#'
#' \code{file_extensions}:
#' Works the same as time_format, but for imported file extensions.
#'
#' \code{dat_structures}:
#' Same as time_format and datatypes, but for more specific datatypes. Not
#' all 'dat_structures' values work for all 'file_extensions', if the order
#' is off or you've provided a bad input it will return an error. Value
#' doesn't matter for .csv files.
#'
#' Currently accepted data structures: "campbell" and "tab_delimited".
#'
#' Additional notes:
#'
#' Campbell CR1000 files have additional metadata associated with them
#' along with the data of interest. Usually, this metadata isn't
#' especially relevant. FluxImport will discard additional
#' metadata and return only a matrix of data values and associated
#' timestamps.
#'
#' @export
#' @family preprocess
#'
#' @examples
#' # If logger applied a multiplier for you, use 'datatype = 'temperatures''
#' ImportRawFlux(files = c(myfil1.dat, myfile2.dat),
#'            dat_structures = "campbell", datatypes = "temperatures")
ImportRawFlux <- function(
  files = "all",
  datatypes = "voltages",
  time_formats = "%Y-%m-%d %H:%M:%S",
  time_columns = 1,
  data_structures = "CampbellSci",
  minimum_interval = 120,
  verbose = FALSE
) {
  # Variable aliases:
  dt <- datatypes
  tf <- time_formats
  tc <- time_columns
  ds <- data_structures
  mi <- minimum_interval
  vb <- verbose
  # Input checks
  stopifnot(
    class(files) == 'character' && is.vector(files),
    class(tf) == 'character' && length(tf) == 1,
    class(ds) == 'character',
    class(vb) == 'logical' && length(vb) == 1
  )
  # Find files to work over:
  message('Checking for file list in current working directory...')
  if (length(files) == 1 && files == 'all') {
    message('Importing all files in current working directory...')
    # The regular expression keeps breaking. One of these definitions for
    # aft is working, depending on the day.
    #aft <- "\\.txt$ | \\.dat$ | \\.csv$"
    aft <- list('.txt', '.csv', '.dat')
    files <- list.files()[unlist(lapply(aft, function(x) grep(x, list.files())))]
  }
  if (length(files) < 1) stop('Could not find files.')
  # Replicate input variable length if == 1, then check for correct length.
  for (i in c('dt', 'tf', 'tc', 'ds')) { # Vars are shorthand, check aliases
    if (length(get(i)) == 1) {
      assign(i, rep(get(i), length(files)))
      stopifnot(length(get(i)) == length(files))
    }
  }
  # Import files:
  raw_data <- list()
  for (j in 1:length(files)) {
    # Read in data:
    fl <- files[[j]]
    message(paste('Read:', fl))
    argums <- c(list(file = fl), SetFileInfo(ds[j]))
    header <- SetHeaderInfo(fl, ds[j])(fl)
    jdat <- do.call(read.table, argums)
    if (vb) {
      cat('j_data head:\n')
      print(head(jdat))
    }
    # Check for time column:
    # 'read.table' can't handle full columns of NaN, will import as character.
    # This
    if (class(jdat[, tc[j]]) != 'character') stop('Cant find time column.')
    if (table(sapply(jdat, class))['character'] > 1) {
      jdat[, -tc[j]] <- lapply(jdat[, -tc[j]], as.numeric)
      jdat[, -tc[j]] <- lapply(jdat[, -tc[j]], function(x) ifelse(is.nan(x), NA, x))
      # Sometimes this breaks/doesn't work not sure why, stop() here is QC.
      if (table(sapply(jdat, class))['character'] > 1) {
        stop(cat('Too many character columns in', fl))
      }
    }
    # Create a timestamp vector. ####
    # tryCatch() is used here to halt on warnings thrown by
    # striptime(), avoiding cryptic or silent data import errors.
    jt <- .POSIXct(list())
    tryCatch(expr = {
      jt <- as.POSIXct(strptime(jdat[, tc[j]], format = tf[j]))
    }, warning = function(cond) {
      fail <- cond
      invisible()
    }, error = function(cond) {
      fail <- cond
      invisible()
    }, finally = {
      if (exists('fail')) {
        print(fail)
        stop('Failed to format time correctly, see warning')
      } else {
        # Check time interval
        ji <- abs(as.numeric(difftime(
          time1 = jt[1], time2 = jt[length(jt)], units = 'mins'
        )))
        if (vb) cat('time interval:\n', ji, jt[1], jt[length(jt)], '\n')
        if (ji < mi) {
          cat(fl, 'is insufficient length, skipped')
          next
        }
        jdat[, tc[j]] <- jt
      }
    })
    # Move time column to column 1:
    jdat <- data.frame(jdat[, tc[j]], jdat[, -tc[j]])
    colnames(jdat)[1] <- 'time'
    jdat <- jdat[order(jdat[, 1]), ]
    raw_data <- c(raw_data, list(jdat))
  }
  # Wrap-up and return. ####
  flux <- new(Class = 'flux')
  flux@source_files <- files
  flux@datatype <- dt
  flux@raw_data <- raw_data
  flux@time_id <- tc
  message('Done importing!')
  log_message <- paste('Imported', length(files), 'files.')
  flux@log <- c(flux@log, log_message)
  return(flux)
}