R/AutoDropOutliers.R

#' @title Checks for outliers in flux data.
#'
#' @description
#'
#' Pre-processing sapflux data can be time consuming - while the whole process
#' hasn't been automated, some outliers are easy enough to pull automatically.
#'
#' @param flux         A 'flux' class object.
#' @param byIQR        Logical - use interquartile range to identify outliers?
#' @param drop_time    Drop empty time rows? TRUE or FALSE
#' @param max_time_gap Maximum time length in minute to interpolate.
#' @param min_data     Minimum amount of data a column needs to be included,
#'                     expressed as a percentage of total data.
#'
#' @details
#'
#' In order, AutoDropOutliers checks the following:
#'
#'  1) Are values above an 'absolute_maximum' defined in package metadata?
#'
#'  2) Are all values in a data column NA?
#'
#'  3) Do values occur before the probe install date, or after it's
#'     removal date?
#'
#'  4) Are there missing values in < 30 minute gaps? If so, fill them using
#'     \code{zoo::na.approx}.
#'
#' The byIQR method is an early implementation - it still has trouble
#' picking up on all but the most extreme outliers.
#'
#' @family preprocess
#' @export
#' @examples
#' # As of 0.1.9 - just use byIQR = FALSE, it needs improvement
#' flux_data <- AutoDropOutliers(flux = flux.data, byIQR = FALSE)
#' # Check the log to see how many data points got dropped:
#' print(flux_data@log)
AutoDropOutliers <- function(flux, byIQR = FALSE, drop_time = FALSE,
                             max_time_gap = 30, min_data = 5) {
  validObject(flux)
  params <- LoadDefaults(flux = flux)
  # Pull slots
  flux_data <- slot(object = flux, name = "data")
  data_tags <- slot(object = flux, name = "data_tags")
  time      <- slot(object = flux, name = "time")
  metadata  <- slot(object = flux, name = "metadata")
  datatype  <- slot(object = flux, name = "datatype")
  # Run param snips
  NA_prev <- sum(is.na(flux_data))
  message("Quick snip processing...")
  if (datatype == "voltages") {
    flux_data <- apply(flux_data, 2, abs)
  }
  mx <- as.numeric(params['maximum'])
  mn <- as.numeric(params['minimum'])
  flux_data[flux_data > mx] <- NA
  flux_data[flux_data < mn] <- NA
  # Whole-column drops:
  drops <- numeric()
  if (length(metadata) > 0) {
    drops <- c(drops, which(colnames(flux_data) %in% metadata$port_tag == FALSE))
  }
  if (min_data > 0) {
    message("Dropped columns with < ", min_data, "% viable data.", sep = "")
    drops <- c(drops, which(apply(
      flux_data, 2, function(x) sum(!is.na(x)) < (min_data / 100 * nrow(flux_data))
    )))
  }
  #drops <- c(drops, which(apply(flux_data, 2, function(x) all(is.na(x)))))
  NA_drop <- 0
  if (length(drops) > 1) {
    NA_drop <- sum(is.na(flux_data[, drops]))
    flux_data <- flux_data[, -drops]
    data_tags <- data_tags[-drops]
    #data_tags <- lapply(data_tags, function(x) x[-drops])
  }
  if (length(metadata) > 0) {
    # Fix metadata and get relevant time indicies
    meta_drops <- which(!(metadata$port_tag %in% unlist(data_tags)))
    if (length(meta_drops) > 0) {
      metadata <- metadata[-meta_drops, ]
    }
    repeat {
      start_date <- metadata$date_install
      start_date[which(is.na(start_date))] <- time[1]
      start_date[which(start_date < time[1])] <- time[1]
      end_date   <- metadata$date_removed
      end_date[which(is.na(end_date))] <- time[length(time)]
      end_date[which(end_date > time[length(time)])] <- time[length(time)]
      if (any(start_date > end_date)) {
        rowkills <- which(start_date > time[length(time)])
        rowkills <- append(rowkills, which(end_date < time[1]))
        if (length(rowkills) > 0) {
          metadata <- metadata[-rowkills, ]
        }
      } else {
        break
      }
    }
    stopifnot(start_date < end_date)
    # Subset by active date for each port
    ncols <- ncol(flux_data)
    tags <- unlist(data_tags)
    for (i in 1:length(tags)) {
      # Fix empty slots
      # Find relevant parameters
      start      <- start_date[which(metadata$port_tag == tags[i])]
      end        <- end_date[which(metadata$port_tag == tags[i])]
      data_sub   <- data.frame(time, flux_data[, i])
      data_sub   <- data_sub[which(time == start):which(time == end), ]
      # Subset the data
      suppressMessages(
        data_sub <- plyr::join(x = data.frame(time), y = data_sub)[, 2]
        )
      # I think this should be data_sub[, 2] ????
      flux_data[, i] <- data_sub
    }
  }
  # Approximate small gaps (< 30 minutes)
  if (max_time_gap > 0) {
    message(paste("Interpolated gaps of <", max_time_gap, "minutes."))
    maxgap <- difftime(time1 = time[2], time2 = time[1], units = "mins")
    maxgap <- floor(max_time_gap / as.numeric(maxgap))
    for (i in 1:ncol(flux_data)) {
      flux_data[, i] <- zoo::na.approx(flux_data[, i], maxgap = maxgap, na.rm = F)
    }
  } else {
    message("Did not interpolate data gaps.")
  }
  # Cut off empty rows from beginning/end of flux_data
  # Having the 'if' will speed up the function if nothing else.
  if (all(is.na(flux_data[1, ])) | all(is.na(flux_data[nrow(flux_data), ]))) {
    # If by version 1.0 everything works, kill the extra commented lines.
    #nix <- vector(mode = "logical", length = nrow(flux_data))
    nix <- apply(flux_data, 1, function(x) {
      all(is.na(x))
    })
    trim_beg <- which(nix == FALSE)[1]
    #if (all(is.na(flux_data[1, ]))) {
    #  trim_beg <- which(nix == FALSE)[1]
    #} else {
    #  trim_beg <- 1
    #}
    trim_end <- which(nix == FALSE)[length(which(nix == FALSE))]
    #if (all(is.na(flux_data[nrow(flux_data), ]))) {
    #  trim_end <- which(nix == FALSE)[length(which(nix == FALSE))]
    #} else {
    #  trim_end <- nrow(flux_data)
    #}
    flux_data <- flux_data[trim_beg:trim_end, ]
    time <- time[trim_beg:trim_end]
  }
  # Drops the intercalary missing rows
  if (drop_time == TRUE) {
    # Check 'time' for empties and trim the dataset/time vector to match
    # BEM: This could make the datasets a lot smaller, which is nice, but
    #      it may have unpredictable behavior later on. We'll see when it
    #      comes time to do the flux zeroing.
    time_drop <- apply(flux_data, 1, function(x) {
      all(is.na(x))
    })
    if (sum(time_drop) > 0) {
      time <- time[-which(time_drop == TRUE)]
      flux_data <- flux_data[-which(time_drop == TRUE), ]
    }
  }
  # byIQR method ####
  if (byIQR == TRUE) {
    stop("This method is deprecated/doesn't work.")
    message("Dropping outliers, IQR method...")
    #flux_data.out <- matrix(flux_data = NA, nrow = nrow(flux_data),
    #ncol = ncol(flux_data))
    for (i in 1:ncol(flux_data)) {
      range.sample <- rnorm(n = 100,
                            mean = mean(flux_data[, i], na.rm = TRUE),
                            sd = sd(flux_data[, i], na.rm = TRUE)
      )
      flux_data.out[, i] <- ifelse(
        is.na(flux_data[, i]), sample(range.sample, 1) , flux_data[, i]
      )
      flux_data.out[, i] <- as.ts(flux_data.out[, i])
      index <- 1:length(flux_data.out[, i])
      resids <- residuals(loess(flux_data.out[, i] ~ index))
      quants <- quantile(resids, prob=c(0.25, 0.75))
      iqr <- diff(quants)
      limits <- quants + 1.5 * iqr * c(-1, 1)
      flux_data.out[, i] <- abs(pmin((resids - limits[1]) / iqr, 0) +
                             pmax((resids - limits[2]) / iqr, 0))
      cat(round(i / ncol(flux_data) * 100, digits = 0), "% complete\n")
    }
    flux_data <- ifelse(flux_data.out < 10, yes = flux_data, no = NA)
  }
  # Clean-up and return! ####
  #NA_post <- length(flux_data) - table(is.na(flux_data))[["FALSE"]]
  NA_diff <- round((sum(is.na(flux_data)) + NA_drop) - NA_prev, 2)
  log_message <- paste("Dropped", NA_diff, "data points during automatic",
                       "outlier processing.")
  if (length(drops) > 0) {
    log_message <- paste(log_message, paste(
        "Of these,", NA_drop,
        "were in columns that were eventually totally removed - ",
        round((NA_drop / NA_diff) * 100, 1),
        "% of total."
        ), collapse = " ")
  }
  slot(flux, "log") <- c(slot(flux, "log"), log_message)
  slot(flux, "time") <- time
  slot(flux, "data") <- flux_data
  slot(flux, "metadata") <- metadata
  slot(flux, "data_tags") <- data_tags
  validObject(flux)
  return(flux)
}
bmcnellis/sapflux documentation built on May 12, 2019, 10:27 p.m.