R/na_seadec.R
In imputeTS: Time Series Missing Value Imputation

Documented in na_seadec

#' @title Seasonally Decomposed Missing Value Imputation
#'
#' @description Removes the seasonal component from the time series,
#' performs imputation on the deseasonalized series and afterwards adds
#' the seasonal component again.
#'
#' @param x Numeric Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}})
#' object in which missing values shall be replaced
#' @param algorithm Algorithm to be used after decomposition.
#' Accepts the following input:
#' \itemize{
#'    \item{"interpolation" - Imputation by Interpolation} (default choice)
#'    \item{"locf" - Imputation by Last Observation Carried Forward}
#'    \item{"mean" - Imputation by Mean Value}
#'    \item{"random" - Imputation by Random Sample}
#'    \item{"kalman" - Imputation by Kalman Smoothing and State Space Models}
#'    \item{"ma" - Imputation by Weighted Moving Average}
#'    }
#' @param find_frequency If TRUE the algorithm will try to estimate the frequency
#' of the time-series automatically.
#'
#' @param maxgap Maximum number of successive NAs to still perform imputation on.
#'  Default setting is to replace all NAs without restrictions. With this
#'  option set, consecutive NAs runs, that are longer than 'maxgap' will
#'  be left NA. This option mostly makes sense if you want to
#'  treat long runs of NA afterwards separately.
#'
#' @param ... Additional parameters for these algorithms that can be passed
#' through. Look at \code{\link[imputeTS]{na_interpolation}},
#' \code{\link[imputeTS]{na_locf}}, \code{\link[imputeTS]{na_random}},
#' \code{\link[imputeTS]{na_mean}} for parameter options.
#'
#' @return Vector (\code{\link{vector}}) or Time Series (\code{\link{ts}})
#' object (dependent on given input at parameter x)
#'
#' @details The algorithm first performs a Seasonal Decomposition of Time Series by Loess
#' via \code{\link[stats]{stl}}. Decomposing the time series into seasonal, trend and irregular
#' components. The seasonal component gets then removed (subtracted) from the original series.
#' As a second step the selected imputation algorithm e.g. na_locf, na_ma, ...  is applied
#' on the deseasonalized series. Thus, the algorithm can work without being affected by seasonal
#' patterns. After filling the NA gaps, the seasonal component is added to the deseasonalized
#' series again.
#'
#' Implementation details:
#' A paper about the STL Decomposition procedure is linked in the references.
#' Since the function only works with complete data, the initial NA data is temporarily filled
#' via linear interpolation in order to perform the decomposition. These temporarily imputed
#' values are replaced with NAs again after obtaining the decomposition for the non-NA
#' observations. STL decomposition is run with robust = TRUE and s.window = 11. Additionally,
#' applying STL decomposition needs a preset frequency. This can be passed by the frequency
#' set in the input ts object or by setting 'find_frequency=TRUE' in order to find
#' an appropriate frequency for the time series. The find_frequency parameter internally uses
#' \code{\link[forecast]{findfrequency}}, which does a spectral analysis of the time series
#' for identifying a suitable frequency. Using find_frequency will update the previously set
#' frequency of a ts object to the newly found frequency. The default is 'find_frequency = FALSE',
#' which gives a warning if no seasonality is set for the supplied time series object.
#' If neither seasonality is set nor find_frequency is set to TRUE, the function goes on without
#' decomposition and just applies the selected secondary algorithm to the original time series
#' that still includes seasonality.
#'
#'
#' @references R. B. Cleveland, W. S. Cleveland, J.E. McRae, and I.
#' Terpenning (1990) STL: A Seasonal-Trend Decomposition Procedure
#' Based on Loess. Journal of Official Statistics, 6, 3–73.
#'
#' @author Steffen Moritz
#'
#' @seealso  \code{\link[imputeTS]{na_interpolation}},
#' \code{\link[imputeTS]{na_kalman}}, \code{\link[imputeTS]{na_locf}},
#'  \code{\link[imputeTS]{na_ma}}, \code{\link[imputeTS]{na_mean}},
#'  \code{\link[imputeTS]{na_random}}, \code{\link[imputeTS]{na_replace}},
#'  \code{\link[imputeTS]{na_seasplit}}
#'
#' @examples
#' # Example 1: Perform seasonal imputation using algorithm = "interpolation"
#' na_seadec(tsAirgap, algorithm = "interpolation")
#'
#' # Example 2: Perform seasonal imputation using algorithm = "mean"
#' na_seadec(tsAirgap, algorithm = "mean")
#'
#' # Example 3: Same as example 1, just written with pipe operator
#' tsAirgap %>% na_seadec(algorithm = "interpolation")
#' @importFrom stats frequency stl ts
#' @importFrom forecast findfrequency
#' @importFrom magrittr %>%
#' @export

na_seadec <- function(x, algorithm = "interpolation", find_frequency = FALSE, maxgap = Inf, ...) {

  # Variable 'data' is used for all transformations to the time series
  # 'x' needs to stay unchanged to be able to return the same ts class in the end
  data <- x


  #----------------------------------------------------------
  # Mulivariate Input
  # The next 20 lines are just for checking and handling multivariate input.
  #----------------------------------------------------------

  # Check if the input is multivariate
  if (!is.null(dim(data)[2]) && dim(data)[2] > 1) {
    # Go through columns and impute them by calling this function with univariate input
    for (i in 1:dim(data)[2]) {
      if (!anyNA(data[, i])) {
        next
      }
      # if imputing a column does not work - mostly because it is not numeric - the column is left unchanged
      tryCatch(
        data[, i] <- na_seadec(data[, i], algorithm, find_frequency, maxgap, ...),
        error = function(cond) {
          warning(paste(
            "na_seadec: No imputation performed for column", i, "of the input dataset.
                Reason:", cond[1]
          ), call. = FALSE)
        }
      )
    }
    return(data)
  }


  #----------------------------------------------------------
  # Univariate Input
  # All relveant imputation / pre- postprocessing  code is within this part
  #----------------------------------------------------------

  else {
    missindx <- is.na(data)

    ##
    ## 1. Input Check and Transformation
    ##


    # 1.1 Check if NAs are present
    if (!anyNA(data)) {
      return(x)
    }

    # 1.2 special handling data types
    if (any(class(data) == "tbl")) {
      data <- as.vector(as.data.frame(data)[, 1])
    }

    # 1.3 Check for algorithm specific minimum amount of non-NA values
    if (sum(!missindx) < 3) {
      stop("At least 3 non-NA data points required in the time series to apply na_seadec.")
    }

    # 1.4 Checks and corrections for wrong data dimension

    # Check if input dimensionality is not as expected
    if (!is.null(dim(data)[2]) && !dim(data)[2] == 1) {
      stop("Wrong input type for parameter x.")
    }

    # Altering multivariate objects with 1 column (which are essentially
    # univariate) to be dim = NULL
    if (!is.null(dim(data)[2])) {
      data <- data[, 1]
    }

    # 1.5 Check if input is numeric
    if (!is.numeric(data)) {
      stop("Input x is not numeric.")
    }

    # 1.6 Checks and corrections for time series frequency

    # Try to findFrequency
    if (find_frequency == TRUE) {
      t <- as.vector(data)
      freq <- forecast::findfrequency(na_interpolation(t))
      if (freq > 1) {
        data <- ts(t, frequency = freq)
      }
      else if (freq == 1) {
        warning("Option find_frequency = TRUE could not detect a seasonal pattern.
        The algorithm will go on without seasonal decomposition.
        You might consider manually setting a frequency by creating a time series with frequency information.
        Here is an example for weekly data: new_ts <- ts(old_ts, frequency = 7)")
        data <- apply_base_algorithm(data, algorithm = algorithm, ...)
        return(data)
      }
    }

    if (stats::frequency(data) == 1) {
      warning("No seasonality information for dataset could be found, going on without decomposition.
              Setting find_frequency=TRUE might be an option.")
      data <- apply_base_algorithm(data, algorithm = algorithm, ...)
      return(data)
    }

    if (length(data) < stats::frequency(data) * 2) {
      warning("More than 2 complete periods needed to perform a seasonal decomposition The algorithm will go on without seasonal decomposition.")
      data <- apply_base_algorithm(data, algorithm = algorithm, ...)
      return(data)
    }

    ##
    ## End Input Check and Transformation
    ##


    ##
    ## 2. Imputation Code
    ##

    # Interpolate NAs, to get complete series, because findFRequency and later on stl does not work with NAs
    temp <- na_interpolation(data)

    # temp (see above) is a interpolated version of data since stl does not work with NAs
    stl <- stats::stl(temp, robust = TRUE, s.window = 11)
    # just take trend component + irregular  (remove seasonality)
    ts_no_seasonality <- stl$time.series[, 2] + stl$time.series[, 3]

    # Fill in NAs again
    ts_no_seasonality[missindx] <- NA

    # Perform imputation on data without seasonality
    ts_no_seasonalityimputed <- apply_base_algorithm(ts_no_seasonality, algorithm = algorithm, ...)

    # add seasonality
    ts_imputed <- ts_no_seasonalityimputed + stl$time.series[, 1]

    # Merge interpolated values back into original time series
    data[missindx] <- ts_imputed[missindx]


    ##
    ## End Imputation Code
    ##


    ##
    ## 3. Post Processing
    ##

    # 3.1 Check for Maxgap option

    # If maxgap = Inf then do nothing and when maxgap is lower than 0
    if (is.finite(maxgap) && maxgap >= 0) {

      # Get logical vector of the time series via is.na() and then get the
      # run-length encoding of it. The run-length encoding describes how long
      # the runs of FALSE and TRUE are
      rlencoding <- rle(is.na(x))

      # Runs smaller than maxgap (which shall still be imputed) are set FALSE
      rlencoding$values[rlencoding$lengths <= maxgap] <- FALSE

      # The original vector is being reconstructed by reverse.rls, only now the
      # longer runs are replaced now in the logical vector derived from is.na()
      # in the beginning all former NAs that are > maxgap are also FALSE
      en <- inverse.rle(rlencoding)

      # Set all positions in the imputed series with gaps > maxgap to NA
      # (info from en vector)
      data[en == TRUE] <- NA
    }

    ##
    ## End Post Processing
    ##


    ##
    ## 4. Final Output Formatting
    ##

    # Give back the object originally supplied to the function
    # (necessary for multivariate input with only 1 column)
    if (!is.null(dim(x)[2])) {
      x[, 1] <- data
      return(x)
    }

    ##
    ## End Final Output Formatting
    ##

    return(data)
  }
}