HTSoutliers: Hrydological and Time Series Outlier Detection Methods

Documented in loess_model

#' Fits loess curve to each station in a data set
#'
#' Takes a data set with \code{ID}, \code{DATE} and \code{SNWD} (use
#' \code{\link{create_model_variables}}), and adds a column with a loess curve
#' upper estimate, and a column with 1 for observations which are more than 3.5
#' SD above the loess estimate.
#'
#' The data set can have an arbitrary number of stations, and a loess curve will
#' be fit for each station. For each station, each observation's \code{DATE} is
#' converted to a day of the year. So '1999-1-1' will be \code{DOY}=0,
#' '2004-1-1' will also be \code{DOY}=0. This effectively collapses each
#' stations data into one year where each day will have every years observation.
#' \code{msir::loess} is then fit with the given \code{span}, \code{nsigma}, and
#' \code{family} parameters. Some stations don't have enough data, or loess
#' simply won't converge, in these cases the default value returned for
#' \code{loess_outlier} is 1, and for \code{loess_upper} \code{NA} is returned.
#'
#'
#' @param data A data set with at least ID, DATE, and SNWD (use
#'   \code{\link{create_model_variables}}).
#' @param span Default is .1, passed to \code{msir::loess_sd}.
#' @param nsigma Default is 3.5, is the number of standard deviations to use
#'   when finding the upper bound from the loess curve.
#' @param family Default symmetric, passed to \code{msir::loess_sd}.
#' @param progress True if you want a progress bar.
#' @param ... extra parameters are passed to msir::loess_sd.
#'
#' @return A copy of the data set with \code{loess_upper} (for plotting) and
#'   \code{loess_outlier} (1 for outlier, 0 for not) added.
#' @export
#'
#' @importFrom dplyr .data
#'
#' @examples
#' \dontrun{
#' # Some test station id's
#'
#' test_ids <- c("US1COBO0074", "US1COBO0024",
#'               "US1COAR0114", "US1COLR0758",
#'               "USC00050437", "USC00053038",
#'               "US1COAD0098", "USW00023066",
#'               "USC00051179", "US1COLR0187",
#'               "US1CODG0079", "US1CODG0033",
#'               "US1COBO0266", "USC00052501",
#'               "US1COPU0080", "USC00056410",
#'               "US1COLR0520", "US1COLR0157",
#'               "US1COLR0142", "USC00056012")
#'
#' # creates a testing data set of colorado data
#' test_data <- get_weather_data(test_ids)
#' test_data <- create_flagged_dataset(test_data)
#' test_data <- create_model_variables(test_data)
#' test_data$ID <- as.character(test_data$ID)
#'
#' # fits loess models on all the testing data from colorado
#' test_data <- loess_model(test_data)
#'
#' # creates a confusion matrix to assess performance
#' caret::confusionMatrix(test_data$loess_outlier,
#'                        test_data$OUTLIER_FINAL,
#'                        positive="1")
#'
#' }
#'
loess_model <- function(data, span = .1, nsigma = 3.5,
                        family = "symmetric", progress = TRUE, ...) {

  # Makes sure that the data only has SNWD
  DOY <- NULL
  DOY_RAW <- NULL
  data <- data %>% dplyr::mutate(DOY_RAW = lubridate::yday(.data$DATE) - 1,
                                 DOY = base::ifelse(DOY_RAW >= 182,  # 182 is July 1st
                                                    DOY_RAW - 366,
                                                    DOY_RAW)) %>%
    dplyr::select(-DOY_RAW)

  # Loops through all the stations
  stations <- base::unique(data$ID)
  results <- NULL;
  i <- 0;
  for (station in 1:length(stations)) {
    if (progress) {
      svMisc::progress(station,
                       max.value = length(stations),
                       progress.bar = TRUE)
    }

    new_data <- dplyr::filter(data, .data$ID == stations[station])

    base::tryCatch({
      # train loes model
      model_full <- msir::loess.sd(new_data$DOY, new_data$SNWD,
                                   span = span, nsigma = nsigma,
                                   family = family, ...)
      # find prediction upper limit
      loess_upper <- NULL
      loess_outlier <- NULL
      prediction <- base::data.frame(DOY = model_full$x,
                                     loess_upper = model_full$upper) %>%
        dplyr::distinct(.keep_all = TRUE)
      # create the outlier flag
      new_data <- dplyr::left_join(new_data, prediction, by = "DOY")
    },
    error = function(c) {
      new_data <- dplyr::mutate(new_data,
                                loess_upper = -1)
    }
  )

    if (!base::is.null(results)) {
      results <- dplyr::bind_rows(results, new_data)
    }
    else {
      results <- new_data
    }
  }

  results$loess_upper <- base::ifelse(base::is.na(results$loess_upper),
                                      -1,
                                      results$loess_upper)

  results$loess_outlier <- base::factor((results$SNWD > results$loess_upper),
                                  levels = c(FALSE, TRUE),
                                  labels = c("0", "1"))

  return(results)
}