R/twoStepsBenchmark.R

Defines functions reUseBenchmark annualBenchmark twoStepsBenchmark get_constant_indic twoStepsBenchmark_impl eval_smoothed_part coefficients_application regression_estimation residuals_extrap residuals_extrap_sequence interpret_outliers cbind_outliers minmax_tsp split_outlier_names

Documented in annualBenchmark residuals_extrap reUseBenchmark twoStepsBenchmark

split_outlier_names <- function(outlier_strings) {
  
  str <- regmatches(outlier_strings,
                    regexec("^(AO|LS)([0-9]+?)(?:T([0-9]+?))?$",
                            outlier_strings))
  
  if (any(lengths(str) == 0L)) stop("The outlier names can't be interpreted (see ?twoStepsBenchmark)",
                                    call. = FALSE)
  
  structure(
    lapply(str, function(x) list(type=x[2L],
                                 year=as.integer(x[3L]),
                                 cycle=if (identical(x[4L],"")) 1L else as.integer(x[4L]))),
    names = outlier_strings)
  
}

minmax_tsp <- function(ts_list) {
  
  tsp_list <- vapply(ts_list,tsp,FUN.VALUE = c(0,0,0))
  
  c(min(tsp_list[1L,]),
    max(tsp_list[2L,]))
  
}

cbind_outliers <- function(outliers_ts_list,start,end) {
  
  minmax <- minmax_tsp(outliers_ts_list)
  minmax[1L] <- min(minmax[1L],start)
  minmax[2L] <- max(minmax[2L],end)
  
  hffreq <- frequency(outliers_ts_list[[1L]])
  
  res <- window(
    ts(
      vapply(
        outliers_ts_list,
        function(serie) {
          
          tspser <- tsp(serie)
          
          c(rep(0,round((tspser[1L]-minmax[1L])*hffreq)),
            as.numeric(serie),
            rep(switch(attr(serie,"type"),
                       AO = 0,
                       LS = serie[length(serie)]),
                round((minmax[2L]-tspser[2L])*hffreq)))
          
        },
        FUN.VALUE = rep(0,round((minmax[2L]-minmax[1L])*hffreq)+1L)
      ),
      start = minmax[1L],
      frequency = hffreq),
    start = start,
    end = end,
    extend = TRUE)
  
  res
}

interpret_outliers <- function(outliers,lffreq,tsphf) {
  
  if (is.null(outliers)) return()
  
  if (!inherits(outliers,"list")) stop("The outliers must be a named list (see ?twoStepsBenchmark)",
                                       call. = FALSE)
  
  if (is.null(names(outliers))) stop("The outliers list must have names (see ?twoStepsBenchmark)",
                                     call. = FALSE)
  
  ratio <- tsphf[3L]/lffreq
  
  outliers_ts_list <-
    Map(
      function(splitted_name,vect) {
        if (length(vect) %% ratio != 0L) stop("The outlier vector must be of length k * hf/lf",
                                              call. = FALSE)
        
        res <- structure(
          ts(as.numeric(vect),
             start = splitted_name$year + (splitted_name$cycle - 1L) / lffreq,
             frequency = tsphf[3L]),
          type = splitted_name$type
        )
        
        if (is.ts(vect) && !tsp_equal(tsp(vect),tsp(res)))
          stop("The outlier list contains time series whose windows or frequencies are inconsistent",
               call. = FALSE)
        
        res
        
      },
      split_outlier_names(names(outliers)),
      outliers)
  
  res <- cbind_outliers(
    outliers_ts_list,
    start = tsphf[1L],
    end = tsphf[2L]
  )
  
  res
  
}

residuals_extrap_sequence <- function(u0,u1,rho,n,include.differenciation) {
  if (include.differenciation) {
    if (rho == 1) u1 + (u1-u0) * (1:n)
    else (u1-u0)*(1-rho^(2:(n+1)))/(1-rho)+u0
  }
  else u1*(rho^(1:n))
}

#' Extrapolation function for the residuals in a twoStepsBenchmark
#' 
#' This function is the rule to extrapolate the low-frequency residuals.
#' If include.differenciation is `TRUE`, u(n+1)-u(n) = rho*(u(n)-u(n-1))
#' Else u(n+1) = rho * u(n)
#'
#' @param lfresiduals the residuals to extrapolate
#' @param rho the autocorrelation parameter of the regression
#' @param include.differenciation a boolean, the same as submitted
#' to twoStepsBenchmark
#'
#' @return a numeric, the extrapolated sequence of residuals, to replace the NA of
#' the residuals
#' @keywords internal
#' @export
residuals_extrap <- function(lfresiduals,rho,include.differenciation) {
  valplaces <- which(!is.na(lfresiduals))
  if (length(valplaces) != 0) {
    firstval <- valplaces[1L]
    lastval <- valplaces[length(valplaces)]
    if (rho==0) rhoinverse <- 0 else rhoinverse <- 1/rho
    if (lastval != length(lfresiduals)) {
      lfresiduals[(lastval+1L):length(lfresiduals)] <-
        residuals_extrap_sequence(lfresiduals[lastval-1L],
                                  lfresiduals[lastval],
                                  rho,
                                  length(lfresiduals)-lastval,
                                  include.differenciation)
    }
    if (firstval != 1L) {
      lfresiduals[(firstval-1L):1L] <-
        residuals_extrap_sequence(lfresiduals[firstval+1L],
                                  lfresiduals[firstval],
                                  rhoinverse,
                                  firstval-1L,
                                  include.differenciation)
    }
  }
  lfresiduals
}

regression_estimation <- function(hfserie,lfserie,
                                  include.differenciation,include.rho,
                                  start.coeff.calc,end.coeff.calc,
                                  set_coefficients,cl) {
  y <- window(lfserie,start=start.coeff.calc,end=end.coeff.calc,extend = TRUE)
  
  x <- aggregate_and_crop_hf_to_lf(hfserie,y)
  
  praislm(x,y,include.rho,include.differenciation,set_coefficients,cl)
}

coefficients_application <- function(hfserie,lfserie,regcoefs) {
  
  tsp_extended <- extend_tsp(tsp(hfserie),frequency(lfserie))
  
  hfserie_win <- window(hfserie,start=tsp_extended[1L],end=tsp_extended[2L],extend = TRUE)
  
  ts_from_tsp(as.numeric(hfserie_win %*% regcoefs),tsp(hfserie_win))
  
}

eval_smoothed_part <- function(hfserie_fitted,lfserie,include.differenciation,rho,set.smoothed.part) {
  if (is.null(set.smoothed.part)) {
    hfserie_fitted_aggreg <- fast_aggregate(hfserie_fitted,nfrequency = frequency(lfserie))
    lfresiduals <- fast_op_on_x(
      hfserie_fitted_aggreg,
      lfserie,
      function(e1,e2) e2-e1)
    lfresiduals <- residuals_extrap(lfresiduals,rho,include.differenciation)
    bflSmooth(lfresiduals,frequency(hfserie_fitted))
  }
  else {
    if (!is.ts(set.smoothed.part) || is.mts(set.smoothed.part)) stop("set.smoothed part must be an univariate time-serie", call. = FALSE)
    set.smoothed.part
  }
}

#' @include s4register.R
twoStepsBenchmark_impl <- function(hfserie,lfserie,
                                   include.differenciation,include.rho,
                                   set_coefficients,
                                   start.coeff.calc,end.coeff.calc,
                                   start.benchmark,end.benchmark,
                                   start.domain,end.domain,
                                   maincl,cl=NULL,set.smoothed.part=NULL) {
  if (is.null(cl)) cl <- maincl
  
  regresults     <- regression_estimation(hfserie,lfserie,
                                          include.differenciation,include.rho,
                                          start.coeff.calc,end.coeff.calc,
                                          set_coefficients,cl)
  
  hfserie_cropped <- window(hfserie,start=start.domain,end=end.domain,extend=TRUE)
  
  lfserie_cropped <- window(lfserie,start=start.benchmark,end=end.benchmark,extend=TRUE)
  
  hfserie_fitted  <- coefficients_application(hfserie_cropped,lfserie_cropped,regresults$coefficients)
  
  smoothed_part   <- eval_smoothed_part(hfserie_fitted,lfserie_cropped,include.differenciation,regresults$rho,set.smoothed.part)
  
  rests <- fast_op_on_x(hfserie_fitted,
                        smoothed_part,
                        `+`)
  
  tsp_cropped <- tsp(hfserie_cropped)
  res <- list(benchmarked.serie = window(rests,start=tsp_cropped[1L],end=tsp_cropped[2L],extend = TRUE),
              fitted.values = window(hfserie_fitted,start=tsp_cropped[1L],end=tsp_cropped[2L],extend = TRUE),
              regression = regresults,
              smoothed.part = smoothed_part,
              model.list = c(list(hfserie = hfserie,
                                  lfserie = lfserie,
                                  include.rho = include.rho,
                                  include.differenciation = include.differenciation,
                                  set.coefficients = set_coefficients,
                                  start.coeff.calc = start.coeff.calc,
                                  end.coeff.calc = end.coeff.calc,
                                  start.benchmark = start.benchmark,
                                  end.benchmark = end.benchmark,
                                  start.domain = start.domain,
                                  end.domain = end.domain),
                             if (!is.null(set.smoothed.part)) list(set.smoothed.part=set.smoothed.part)),
              call = cl)
  
  new("twoStepsBenchmark",res)
}

# The arbitrary constant is only intended to set a common zero for the
# integrated constant between the different benchmarks. This constant doesn't
# impact the benchmarks, as the results are the same no matter the constant.
# Through, it allows reUseBenchmark to work with series starting at different
# times
get_constant_indic <- function(nrow, hf, lf, include.differenciation, start.hfserie) {
  if (include.differenciation) {
    arbitrary_constant <- ((2*(start.hfserie - 2000)*lf-1)*hf/lf-1)/2
    (arbitrary_constant + seq_len(nrow)) * (lf/hf)^2
  }
  else rep(lf/hf,nrow)
}

#' @title Regress and bends a time series with a lower frequency one
#' 
#' @description twoStepsBenchmark bends a time series with a time series of a lower frequency.
#' The procedure involved is a Prais-Winsten regression, then an additive
#' Denton benchmark.
#' 
#' Therefore, the resulting time series is the sum of a regression fit and of a
#' smoothed part. The smoothed part minimizes the sum of squares of its
#' differences.
#' 
#' The resulting time series is equal to the low-frequency series after aggregation
#' within the benchmark window.
#'
#' @details annualBenchmark is a wrapper of the main function, that applies more specifically
#' to annual series, and changes the default window parameters to the ones
#' that are commonly used by quarterly national accounts.
#'   
#' @aliases annualBenchmark
#' @usage
#' twoStepsBenchmark(hfserie,lfserie,include.differenciation=FALSE,include.rho=FALSE,
#'                   set.coeff=NULL,set.const=NULL,
#'                   start.coeff.calc=NULL,end.coeff.calc=NULL,
#'                   start.benchmark=NULL,end.benchmark=NULL,
#'                   start.domain=NULL,end.domain=NULL,outliers=NULL,
#'                   ...)
#'
#'
#' annualBenchmark(hfserie,lfserie,
#'                 include.differenciation=FALSE,include.rho=FALSE,
#'                 set.coeff=NULL,set.const=NULL,
#'                 start.coeff.calc=start(lfserie)[1L],
#'                 end.coeff.calc=end(lfserie)[1L],
#'                 start.benchmark=start(lfserie)[1L],
#'                 end.benchmark=end.coeff.calc[1L]+1L,
#'                 start.domain=start(hfserie),
#'                 end.domain=c(end.benchmark[1L]+2L,frequency(hfserie)),
#'                 outliers=NULL)
#' 
#' @param hfserie the bended time series. It can be a matrix time series.
#' @param lfserie a time series whose frequency divides the frequency of `hfserie`.
#' @param include.differenciation a boolean of length 1. If `TRUE`, `lfserie` and
#' `hfserie` are differentiated before the estimation of the regression.
#' @param include.rho a boolean of length 1. If `TRUE`, the regression includes
#' an autocorrelation parameter for the residuals. The applied procedure is a
#' Prais-Winsten estimation.
#' @param set.coeff an optional numeric, that allows the user to set the
#' regression coefficients instead of evaluating them.
#' If hfserie is not a matrix, set.coeff can be an unnamed numeric of length 1.
#' Otherwise, `set.coeff` has to be a named numeric, which will set the
#' corresponding coefficients instead of evaluating them.
#' Each column name of hfserie and each outlier set with the `outlier` arg
#' initialize a coefficient with the same name, that can be set through set.coeff.
#' The default name for a non-matrix time series is then `"hfserie"`,
#' By example, a LS2003 and the time series can be set using
#' `set.coeff=c(hfserie=3,LS2003=1)`.
#' @param set.const an optional numeric of length 1, that sets the regression
#' constant.
#' The constant is actually an automatically added column to `hfserie`. Using
#' `set.constant=3` is equivalent to using `set.coeff=c(constant=3)`.
#' @param start.coeff.calc an optional start for the estimation of the
#' coefficients of the regression.
#' Should be a numeric of length 1 or 2, like a window for `lfserie`. If NULL,
#' the start is defined by lfserie's window.
#' @param end.coeff.calc an optional end for the estimation of the coefficients
#' of the regression.
#' Should be a numeric of length 1 or 2, like a window for `lfserie`. If NULL,
#' the end is defined by lfserie's window.
#' @param start.benchmark an optional start for `lfserie` to bend `hfserie`.
#' Should be a numeric of length 1 or 2, like a window for `lfserie`. If NULL,
#' the start is defined by lfserie's window.
#' @param end.benchmark an optional end for `lfserie` to bend `hfserie`.
#' Should be a numeric of length 1 or 2, like a window for `lfserie`. If NULL,
#' the start is defined by lfserie's window.
#' @param start.domain an optional for the output high-frequency series. It also
#' defines the smoothing window :
#' The low-frequency residuals will be extrapolated until they contain the
#' smallest low-frequency window that is around the high-frequency domain
#' window.
#' Should be a numeric of length 1 or 2, like a window for `hfserie`. If NULL,
#' the start is defined by hfserie's window.
#' @param end.domain an optional end for the output high-frequency series. It
#' also defines the smoothing window :
#' The low-frequency residuals will be extrapolated until they contain the
#' smallest low-frequency window that is around the high-frequency domain
#' window.
#' Should be a numeric of length 1 or 2, like a window for `hfserie`. If NULL,
#' the start is defined by hfserie's window.
#' @param outliers an optional named list of numeric vectors, whose pattern is
#' like `list(AO2008T2=c(0,0,3,2),LS2002=c(0.1,0.1,0.1,0.1))` where :
#' 
#' * `"AO"` stands for additive outlier or `"LS"` for level shift
#' * The integer that follows stands for the outlier starting year
#' * an optional integer, preceded by the letter T, stands for the low-frequency
#' cycle of the outlier start.
#' * The numeric vector values stands for the disaggregated value of the outlier
#' and its length must be a multiple of hf / lf
#' 
#' The outliers coefficients are evaluated though the regression process, like
#' any coefficient. Therefore, if any outlier is outside of the coefficient
#' calculation window, it should be fixed using `set.coeff`.
#' 
#' @param \dots if the dots contain a cl item, its value overwrites the value of
#' the returned call. This feature allows to build wrappers.
#' @return
#' twoStepsBenchark returns an object of class "`twoStepsBenchmark`".
#' 
#' The function `summary` can be used to obtain and print a summary of the
#' regression used by the benchmark.
#' The functions `plot` and `autoplot` (the generic from \pkg{ggplot2}) produce
#' graphics of the benchmarked serie and the bending serie.
#' The functions \link{in_disaggr}, \link{in_revisions}, \link{in_scatter}
#' produce comparisons on which plot and autoplot can also be used.
#' 
#' The generic accessor functions `as.ts`, `prais`, `coefficients`, `residuals`,
#' `fitted.values`, `model.list`, `se`, `rho` extract various useful features of
#' the returned value.
#' 
#' An object of class "`twoStepsBenchmark`" is a list containing the following
#' components :
#'   \item{benchmarked.serie}{a time series, that is the result of the
#'   benchmark. It is equal to `fitted.values + smoothed.part`.}
#'   \item{fitted.values}{a time series, that is the high-frequency series as it
#'   is after having applied the regression coefficients. Compared to the fitted
#'   values of the regression, which can be retrieved inside the regression
#'   component, it has a high-frequency time series and can eventually be
#'   integrated if `include.differenciation` is `TRUE`.}
#'   \item{regression}{an object of class praislm, it is the regression on which
#'   relies the benchmark. It can be extracted with the function \link{prais}}
#'   \item{smoothed.part}{the smoothed part of the two-steps benchmark. It is
#'   the smoothed difference between the `fitted.values` and lfserie.}
#'   \item{model.list}{a list containing all the arguments submitted to the
#'   function.}
#'   \item{call}{the matched call (either of twoStepsBenchmark or
#'   annualBenchmark)}
#'   
#' @examples
#' 
#' ## How to use annualBenchmark or twoStepsBenchark
#' 
#' benchmark <- twoStepsBenchmark(hfserie = turnover,
#'                                lfserie = construction,
#'                                include.differenciation = TRUE)
#' as.ts(benchmark)
#' coef(benchmark)
#' summary(benchmark)
#' library(ggplot2)
#' autoplot(in_sample(benchmark))
#' 
#' ## How to manually set the coefficient
#' 
#' benchmark2 <- twoStepsBenchmark(hfserie = turnover,
#'                                 lfserie = construction,
#'                                 include.differenciation = TRUE,
#'                                 set.coeff = 0.1)
#' coef(benchmark2)
#'
#' @export
twoStepsBenchmark <- function(hfserie,lfserie,
                              include.differenciation=FALSE,include.rho=FALSE,
                              set.coeff=NULL,set.const=NULL,
                              start.coeff.calc=NULL,end.coeff.calc=NULL,
                              start.benchmark=NULL,end.benchmark=NULL,
                              start.domain=NULL,end.domain=NULL,
                              outliers=NULL,...) {
  
  if ( !is.ts(lfserie) || !is.ts(hfserie) ) stop("Not a ts object",
                                                 call. = FALSE)
  
  if (is.matrix(hfserie) &&
      is.null(colnames(hfserie)) &&
      ncol(hfserie) != 1L) stop("The high-frequency mts must have column names", call. = FALSE)
  
  hfserie <- clean_tsp(hfserie)
  lfserie <- clean_tsp(lfserie)
  
  tsplf <- tsp(lfserie)
  tsphf <- tsp(hfserie)
  
  if (tsphf[3L] %% tsplf[3L] != 0L) stop("The low frequency should divide the higher one", call. = FALSE)
  if (!is.null(dim(lfserie)) && dim(lfserie)[2L] != 1) stop("The low frequency series must be one-dimensional", call. = FALSE)
  
  maincl <- match.call()
  
  if (is.null(set.coeff)) set.coeff <- numeric()
  if (is.null(set.const)) set.const <- numeric()
  
  constant <- get_constant_indic(NROW(hfserie), tsphf[3L], tsplf[3L],
                                 include.differenciation, tsphf[1L])
  
  if (length(set.const) > 1L) stop("set.const must be a single value", call. = FALSE)
  if (length(set.const) == 1L) names(set.const) <- "constant"
  
  if ((NCOL(hfserie) == 1L) &&
      length(set.coeff) == 1L &&
      is.null(names(set.coeff)))
    names(set.coeff) <- "hfserie"
  
  outliers_mts <- interpret_outliers(outliers,frequency(lfserie),tsphf)
  
  hfserie <- ts(matrix(c(constant,hfserie,outliers_mts),
                       nrow = NROW(hfserie)),
                start = tsphf[1L],
                frequency = tsphf[3L],
                names = c("constant",
                          if (is.null(colnames(hfserie))) "hfserie" else colnames(hfserie),
                          colnames(outliers_mts)))
  
  if (anyDuplicated(colnames(hfserie))) stop("Invalid colnames for hfserie",
                                             call. = FALSE)
  
  res <- twoStepsBenchmark_impl(hfserie,lfserie,
                                include.differenciation,include.rho,
                                c(set.const,set.coeff),
                                start.coeff.calc,end.coeff.calc,
                                start.benchmark,end.benchmark,
                                start.domain,end.domain,maincl,...)
  if (!is.null(outliers)) {
    attr(res$model.list,"outliers") <- outliers
    attr(res$regression$model.list,"outliers") <- outliers
  }
  
  res
}

#' @export
annualBenchmark <- function(hfserie,lfserie,
                            include.differenciation=FALSE,include.rho=FALSE,
                            set.coeff=NULL,set.const=NULL,
                            start.coeff.calc=start(lfserie)[1L],
                            end.coeff.calc=end(lfserie)[1L],
                            start.benchmark=start(lfserie)[1L],
                            end.benchmark=end.coeff.calc[1L]+1L,
                            start.domain=start(hfserie),
                            end.domain=c(end.benchmark[1L]+2L,frequency(hfserie)),
                            outliers=NULL) {
  
  if (frequency(lfserie) != 1) stop("Not an annual time series", call. = FALSE)
  twoStepsBenchmark(hfserie,lfserie,
                    include.differenciation,include.rho,
                    set.coeff,set.const,
                    start.coeff.calc,end.coeff.calc,
                    start.benchmark,end.benchmark,
                    start.domain,end.domain,outliers,
                    cl=match.call())
}

#' Using an estimated benchmark model on another time series
#' 
#' This function reapplies the coefficients and parameters of a benchmark on new
#' time series.
#'
#' `reUseBenchmark` is primarily meant to be used on a series that is derived
#' from the previous one, after some modifications that would bias the
#' estimation otherwise. Working-day adjustment is a good example. Hence, by
#' default, the smoothed part of the first model isn't reevaluated ; the
#' aggregated benchmarked series isn't equal to the low-frequency series.
#' 
#' @usage
#' reUseBenchmark(hfserie,benchmark,reeval.smoothed.part=FALSE)
#' 
#' @param hfserie the bended time series. If it is a matrix time series, it has to
#' have the same column names than the `hfserie` used for the benchmark.
#' @param benchmark a twoStepsBenchmark object, from which the parameters and
#' coefficients are taken.
#' @param reeval.smoothed.part a boolean of length 1. If `TRUE`, the smoothed
#' part is reevaluated, hence the aggregated benchmarked series is equal to the
#' low-frequency series.
#' @return `reUseBenchmark` returns an object of class \link{twoStepsBenchmark}.
#' @examples 
#' benchmark <- twoStepsBenchmark(turnover,construction) 
#' turnover_modif <- turnover
#' turnover_modif[2] <- turnover[2]+2
#' benchmark2 <- reUseBenchmark(turnover_modif,benchmark)
#' @export
reUseBenchmark <- function(hfserie,benchmark,reeval.smoothed.part=FALSE) {
  m <- model.list(benchmark)
  
  coefs <- coef(benchmark)
  set.const <- coefs["constant"]
  set.coeff <- coefs[names(coefs) != "constant"]
  
  twoStepsBenchmark(hfserie,m$lfserie,
                    m$include.differenciation,m$include.rho,
                    set.coeff,set.const,
                    m$start.coeff.calc,m$end.coeff.calc,
                    m$start.benchmark,m$end.benchmark,
                    m$start.domain,m$end.domain,
                    outliers = outliers(benchmark),cl=match.call(),
                    set.smoothed.part = if (!reeval.smoothed.part) smoothed.part(benchmark))
}
InseeFr/disaggR documentation built on June 29, 2024, 7:10 p.m.