#' @title Checks for outliers in flux data.
#'
#' @description
#'
#' Pre-processing sapflux data can be time consuming - while the whole process
#' hasn't been automated, some outliers are easy enough to pull automatically.
#'
#' @param flux A 'flux' class object.
#' @param byIQR Logical - use interquartile range to identify outliers?
#' @param drop_time Drop empty time rows? TRUE or FALSE
#' @param max_time_gap Maximum time length in minute to interpolate.
#' @param min_data Minimum amount of data a column needs to be included,
#' expressed as a percentage of total data.
#'
#' @details
#'
#' In order, AutoDropOutliers checks the following:
#'
#' 1) Are values above an 'absolute_maximum' defined in package metadata?
#'
#' 2) Are all values in a data column NA?
#'
#' 3) Do values occur before the probe install date, or after it's
#' removal date?
#'
#' 4) Are there missing values in < 30 minute gaps? If so, fill them using
#' \code{zoo::na.approx}.
#'
#' The byIQR method is an early implementation - it still has trouble
#' picking up on all but the most extreme outliers.
#'
#' @family preprocess
#' @export
#' @examples
#' # As of 0.1.9 - just use byIQR = FALSE, it needs improvement
#' flux_data <- AutoDropOutliers(flux = flux.data, byIQR = FALSE)
#' # Check the log to see how many data points got dropped:
#' print(flux_data@log)
AutoDropOutliers <- function(flux, byIQR = FALSE, drop_time = FALSE,
max_time_gap = 30, min_data = 5) {
validObject(flux)
params <- LoadDefaults(flux = flux)
# Pull slots
flux_data <- slot(object = flux, name = "data")
data_tags <- slot(object = flux, name = "data_tags")
time <- slot(object = flux, name = "time")
metadata <- slot(object = flux, name = "metadata")
datatype <- slot(object = flux, name = "datatype")
# Run param snips
NA_prev <- sum(is.na(flux_data))
message("Quick snip processing...")
if (datatype == "voltages") {
flux_data <- apply(flux_data, 2, abs)
}
mx <- as.numeric(params['maximum'])
mn <- as.numeric(params['minimum'])
flux_data[flux_data > mx] <- NA
flux_data[flux_data < mn] <- NA
# Whole-column drops:
drops <- numeric()
if (length(metadata) > 0) {
drops <- c(drops, which(colnames(flux_data) %in% metadata$port_tag == FALSE))
}
if (min_data > 0) {
message("Dropped columns with < ", min_data, "% viable data.", sep = "")
drops <- c(drops, which(apply(
flux_data, 2, function(x) sum(!is.na(x)) < (min_data / 100 * nrow(flux_data))
)))
}
#drops <- c(drops, which(apply(flux_data, 2, function(x) all(is.na(x)))))
NA_drop <- 0
if (length(drops) > 1) {
NA_drop <- sum(is.na(flux_data[, drops]))
flux_data <- flux_data[, -drops]
data_tags <- data_tags[-drops]
#data_tags <- lapply(data_tags, function(x) x[-drops])
}
if (length(metadata) > 0) {
# Fix metadata and get relevant time indicies
meta_drops <- which(!(metadata$port_tag %in% unlist(data_tags)))
if (length(meta_drops) > 0) {
metadata <- metadata[-meta_drops, ]
}
repeat {
start_date <- metadata$date_install
start_date[which(is.na(start_date))] <- time[1]
start_date[which(start_date < time[1])] <- time[1]
end_date <- metadata$date_removed
end_date[which(is.na(end_date))] <- time[length(time)]
end_date[which(end_date > time[length(time)])] <- time[length(time)]
if (any(start_date > end_date)) {
rowkills <- which(start_date > time[length(time)])
rowkills <- append(rowkills, which(end_date < time[1]))
if (length(rowkills) > 0) {
metadata <- metadata[-rowkills, ]
}
} else {
break
}
}
stopifnot(start_date < end_date)
# Subset by active date for each port
ncols <- ncol(flux_data)
tags <- unlist(data_tags)
for (i in 1:length(tags)) {
# Fix empty slots
# Find relevant parameters
start <- start_date[which(metadata$port_tag == tags[i])]
end <- end_date[which(metadata$port_tag == tags[i])]
data_sub <- data.frame(time, flux_data[, i])
data_sub <- data_sub[which(time == start):which(time == end), ]
# Subset the data
suppressMessages(
data_sub <- plyr::join(x = data.frame(time), y = data_sub)[, 2]
)
# I think this should be data_sub[, 2] ????
flux_data[, i] <- data_sub
}
}
# Approximate small gaps (< 30 minutes)
if (max_time_gap > 0) {
message(paste("Interpolated gaps of <", max_time_gap, "minutes."))
maxgap <- difftime(time1 = time[2], time2 = time[1], units = "mins")
maxgap <- floor(max_time_gap / as.numeric(maxgap))
for (i in 1:ncol(flux_data)) {
flux_data[, i] <- zoo::na.approx(flux_data[, i], maxgap = maxgap, na.rm = F)
}
} else {
message("Did not interpolate data gaps.")
}
# Cut off empty rows from beginning/end of flux_data
# Having the 'if' will speed up the function if nothing else.
if (all(is.na(flux_data[1, ])) | all(is.na(flux_data[nrow(flux_data), ]))) {
# If by version 1.0 everything works, kill the extra commented lines.
#nix <- vector(mode = "logical", length = nrow(flux_data))
nix <- apply(flux_data, 1, function(x) {
all(is.na(x))
})
trim_beg <- which(nix == FALSE)[1]
#if (all(is.na(flux_data[1, ]))) {
# trim_beg <- which(nix == FALSE)[1]
#} else {
# trim_beg <- 1
#}
trim_end <- which(nix == FALSE)[length(which(nix == FALSE))]
#if (all(is.na(flux_data[nrow(flux_data), ]))) {
# trim_end <- which(nix == FALSE)[length(which(nix == FALSE))]
#} else {
# trim_end <- nrow(flux_data)
#}
flux_data <- flux_data[trim_beg:trim_end, ]
time <- time[trim_beg:trim_end]
}
# Drops the intercalary missing rows
if (drop_time == TRUE) {
# Check 'time' for empties and trim the dataset/time vector to match
# BEM: This could make the datasets a lot smaller, which is nice, but
# it may have unpredictable behavior later on. We'll see when it
# comes time to do the flux zeroing.
time_drop <- apply(flux_data, 1, function(x) {
all(is.na(x))
})
if (sum(time_drop) > 0) {
time <- time[-which(time_drop == TRUE)]
flux_data <- flux_data[-which(time_drop == TRUE), ]
}
}
# byIQR method ####
if (byIQR == TRUE) {
stop("This method is deprecated/doesn't work.")
message("Dropping outliers, IQR method...")
#flux_data.out <- matrix(flux_data = NA, nrow = nrow(flux_data),
#ncol = ncol(flux_data))
for (i in 1:ncol(flux_data)) {
range.sample <- rnorm(n = 100,
mean = mean(flux_data[, i], na.rm = TRUE),
sd = sd(flux_data[, i], na.rm = TRUE)
)
flux_data.out[, i] <- ifelse(
is.na(flux_data[, i]), sample(range.sample, 1) , flux_data[, i]
)
flux_data.out[, i] <- as.ts(flux_data.out[, i])
index <- 1:length(flux_data.out[, i])
resids <- residuals(loess(flux_data.out[, i] ~ index))
quants <- quantile(resids, prob=c(0.25, 0.75))
iqr <- diff(quants)
limits <- quants + 1.5 * iqr * c(-1, 1)
flux_data.out[, i] <- abs(pmin((resids - limits[1]) / iqr, 0) +
pmax((resids - limits[2]) / iqr, 0))
cat(round(i / ncol(flux_data) * 100, digits = 0), "% complete\n")
}
flux_data <- ifelse(flux_data.out < 10, yes = flux_data, no = NA)
}
# Clean-up and return! ####
#NA_post <- length(flux_data) - table(is.na(flux_data))[["FALSE"]]
NA_diff <- round((sum(is.na(flux_data)) + NA_drop) - NA_prev, 2)
log_message <- paste("Dropped", NA_diff, "data points during automatic",
"outlier processing.")
if (length(drops) > 0) {
log_message <- paste(log_message, paste(
"Of these,", NA_drop,
"were in columns that were eventually totally removed - ",
round((NA_drop / NA_diff) * 100, 1),
"% of total."
), collapse = " ")
}
slot(flux, "log") <- c(slot(flux, "log"), log_message)
slot(flux, "time") <- time
slot(flux, "data") <- flux_data
slot(flux, "metadata") <- metadata
slot(flux, "data_tags") <- data_tags
validObject(flux)
return(flux)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.