R/BindRawFlux.R

#' @title Binding function for post-import raw flux data
#'
#' @description
#'
#' Sapflow data needs much pre-processing before primary analysis
#' and scaling - this function binds raw data files together.
#'
#' @param flux           Input data of class "flux"
#' @param drop_tolerance Tolerance, as %, for dropping data that
#'                       has no associated time vector.
#'
#' @return Returns the modified 'flux' object back.
#'
#' @export
#' @family preprocess
#' @examples
#' flux <- FluxProcess(flux = mydata)
BindRawFlux <- function(flux, drop_tolerance = 1) {
  # Input validation & defaults ####
  if (length(flux@metadata) < 1) stop('Need metadata for binding.')
  validObject(flux)
  params <- LoadDefaults(flux = flux)
  sf <- flux@source_files
  meta <- flux@metadata
  meta <- meta[which(meta$INCLUDE == 'TRUE'), ]
  if (length(flux@data) > 0) message('Overwriting data slot.')
  # Creating alises
  tl <- drop_tolerance / 100
  dt <- flux@datatype
  # Check datatypes - must all be the same BEFORE merging!
  if (length(unique(dt)) != 1) stop('Datatypes must be identical.')
  dt <- dt[1]
  dout <- data.frame()
  tags <- unique(meta$BIND)
  cf <- 0
  for (h in 1:length(tags)) {
    # Find the filename(s) associated with the data that will be bound.
    if (sum(!is.na(tags)) < 1) {
      fl <- sf
    } else {
      fl <- meta$FILE[which(meta$BIND == tags[h])]
      #fl <- sf[grep(pattern = pat, x = sf)]
    }
    fl <- sf[which(sf %in% fl)]
    if (length(fl) < 1) {
      warning(cat('No files included for bind tag ', tags[h], '\n'))
      next
    }
    # Advance the file list length counter.
    cf <- cf + length(fl)
    # Find the raw data associated with the current file name.
    hdat <- flux@raw_data[which(sf == fl)]
    # Create the time merge vector, in POSIXct:
    tmat <- matrix(data = NA, ncol = 1, nrow = 0)
    tmat <- data.frame(tmat, stringsAsFactors = F)
    # Find common time vector for all raw files in the bind pool:
    for (i in hdat) {
      ti <- i[, 1]
      tmat <- merge(x = tmat, y = ti, by = 1, all = T, sort = T)
    }
    # Clean up the common time vector and convert to POSIXct:
    tmat <- na.omit(unlist(tmat, use.names = F))
    tmat <- data.frame(as.POSIXct(tmat, origin = '1970-01-01 00:00:00'))
    # Preallocate binding variables:
    message(paste('Binding data... ', tags[h]))
    hout <- data.frame()
    # Set up NA counter:
    np <- numeric()
    tp <- numeric()
    # Bind data / tally NA counters
    # Will this loop work with columns of different names?
    for (i in hdat) {
      np <- append(np, sum(is.na(i)) - length(i))
      tp <- append(tp, length(i))
      browser()
      hout <- plyr::rbind.fill(hout, i)
    }
    # Sum NA tallys
    np <- sum(np, na.rm = T)
    tp <- sum(tp, na.rm = T)
    # Duplicated rows wreck things later. This isn't an ideal solution:
    dup <- hout$time[which(duplicated(hout$time) == T)]
    if (length(dup) > 0) hout <- hout[-which(hout$time %in% dup), ]
    if (sum(is.na(hout$time)) > 0) stop('NAs in time vec, needs debug')
    #
    tmat <- merge(x = tmat, y = hout, by = 1, all = T, sort = T)
    colnames(tmat)[1] <- 'time'
    # Fix column names. Won't work if the filenames from the fl vector
    # have column names that aren't identical.
    cn <- meta[which(meta['FILE'] == fl[1]), 'COLUMN_NAME']
    for (i in 2:length(fl)) {
      cn_t <- meta['COLUMN_NAME'][which(meta['FILE'] == fl[i])]
      if (cn_t != cn) stop('Column names on tagged files need to be identical')
    }
    ### USE METADATA TO CHECK COLUMN NAMES AGAINST TREATMENTS!
    ### This will remove the need for the 'signals'
    browser()
    colnames(tmat) <- cn
    if (ncol(dout) < 1) {
      dout <- data.frame(tmat[, 1])
      dout <- merge(x = dout, y = tmat, by = 1, all = T, sort = T)
      colnames(dout) <- cn
    } else {
      dout <- merge(x = dout, y = tmat, by = 1, all = T, sort = T)
    }
  }
  if (length(cf) < 1) stop('Failed to bind any files.')
  message('Wrapup...')
  fd <- abs(cf - length(sf))
  if (fd > 0) warning(cat(fd, 'files counted twice during merging.\n'))
  if (fd < 0) warning(cat(fd, 'files excluded by given tags.\n'))
  # Pull the NAs out from time vector and drop associated data
  tk <- nrow(dout)
  dr <- which(is.na(dout[, 1]))
  if (length(dr) > 0) dout <- dout[-which(is.na(dout[, 1])), ]
  if (nrow(dout) < (1 - tl) * tk) stop('Lost >1% of data from NAs in time vec.')
  time <- .POSIXct(integer())
  time <- dout[, 1]
  dout <- as.matrix(dout[, -1])
  data_tags <- as.list(colnames(dout))
  # Logging, clean-up and return ####
  no <- length(dout) - sum(is.na(dout))
  nc <- round(100 - (no / np * 100), 2)
  tc <- round(100 - (nrow(dout) / tp * 100), 2)
  flux@time <- time
  flux@data <- dout
  flux@data_tags <- data_tags
  if (nc > 0) {
    log_message <- paste(
      'BindRawFlux cut', np - no, 'points, or', nc, '% of total.')
    flux@log <- c(flux@log, log_message)
  }
  return(flux)
}
bmcnellis/sapflux documentation built on May 12, 2019, 10:27 p.m.