R/tsmooth.R
In smoots: Nonparametric Estimation of the Trend and Its Derivatives in TS

Documented in tsmooth

#' Advanced Data-driven Nonparametric Regression for the Trend in Equidistant
#' Time Series
#'
#' This function runs an iterative plug-in algorithm to find the optimal
#' bandwidth for the estimation of the nonparametric trend in equidistant
#' time series (with short-memory errors) and then employs the resulting
#' bandwidth via either local polynomial or kernel regression. This function
#' allows for more flexibility in its arguments than \emph{msmooth}.
#'
#'@param y a numeric vector that contains the time series ordered from past to
#'present.
#'@param p an integer \code{1} (local linear regression) or \code{3} (local
#'cubic regression); represents the order of polynomial within the local
#'polynomial regression (see also the 'Details' section); is set to \code{1} by
#'default; is automatically set to \code{1} if \code{method = "kr"}.
#'@param mu an integer \code{0}, ..., \code{3} that represents the smoothness
#'parameter of the kernel weighting function and thus defines the kernel
#'function that will be used within the local polynomial regression; is set to
#'\code{1} by default.
#'
#'\tabular{cl}{
#'\strong{Number} \tab \strong{Kernel}\cr
#'\code{0} \tab Uniform Kernel\cr
#'\code{1} \tab Epanechnikov Kernel\cr
#'\code{2} \tab Bisquare Kernel\cr
#'\code{3} \tab Triweight Kernel
#'}
#'@param Mcf  method for estimating the variance factor \eqn{c_f} by the IPI
#'(see also the 'Details' section); is set to \code{NP} by default. \cr
#'\tabular{cl}{
#'\strong{Method} \tab \strong{Explanation}\cr
#'\code{"NP"} \tab Nonparametric estimation using the Bartlett window \cr
#'\code{"ARMA"} \tab Estimation on the assumption that the residuals follow an
#'ARMA model\cr
#'\code{"AR"} \tab Estimation on the assumption that the residuals follow an
#'AR model\cr
#'\code{"MA"} \tab Estimation on the assumption that the residuals follow an
#'MA model
#'}
#'@param InfR a character object that represents the inflation
#'rate in the form \eqn{h_d = h^a} for the bandwidth in the estimation of
#'\eqn{I[m^{(k)}]}{I[m^(k)]} (see also the 'Details' section); is set to
#'\code{"Opt"} by default.
#'
#'\tabular{cl}{
#'\strong{Inflation rate} \tab \strong{Description}\cr
#'\code{"Opt"} \tab Optimal inflation rate \eqn{a_{p,O}}{a_[p,O]} (\eqn{5/7}
#'for \code{p = 1}; \eqn{9/11} for \code{p = 3})\cr
#'\code{"Nai"} \tab Naive inflation rate \eqn{a_{p,N}}{a_[p,N]} (\eqn{5/9} for
#'\code{p = 1}; \eqn{9/13} for \code{p = 3})\cr
#'\code{"Var"} \tab Stable inflation rate \eqn{a_{p,S}}{a_[p,S]} (\eqn{1/2} for
#'\code{p = 1} and \code{p = 3})
#'}
#'@param bStart a numeric object that indicates the starting value of the
#'bandwidth for the iterative process; should be \eqn{> 0}; is set to
#'\code{0.15} by default.
#'@param bvc a character object that indicates whether an enlarged bandwidth is
#'being used for the estimation of the variance factor \eqn{c_f} (see also the
#''Details' section) or not; is set to \code{"Y"} by default.
#'
#'\tabular{cl}{
#'\strong{Bandwidth} \tab \strong{Description}\cr
#'\emph{"Y"} \tab Using an enlarged bandwidth\cr
#'\emph{"N"} \tab Using a bandwidth without enlargement
#'}
#'@param bb can be set to \code{0} or \code{1}; the parameter controlling the
#'bandwidth used at the boundary; is set to \code{1} by default.
#'
#'\tabular{cl}{
#'\strong{Number (\code{bb})} \tab \strong{Estimation procedure at boundary
#'points}\cr
#'\code{0} \tab Fixed bandwidth on one side with possible large
#'bandwidth on the other side at the boundary\cr
#'\code{1} \tab The \eqn{k}-nearest neighbor method will be used
#'}
#'@param cb a numeric value that indicates the percentage of omitted
#'observations on each side of the observation period for the automated
#'bandwidth selection; is set to \code{0.05} by default.
#'@param method the final smoothing approach; \code{"lpr"} represents the local
#'polynomial regression, whereas \code{"kr"} implements a kernel regression;
#'is set to \code{"lpr"} by default.
#'
#'@details
#'
#'The trend is estimated based on the additive
#'nonparametric regression model for an equidistant time series
#'\deqn{y_t = m(x_t) + \epsilon_t,}
#'where \eqn{y_t} is the observed time series, \eqn{x_t} is the rescaled time
#'on the interval \eqn{[0, 1]}, \eqn{m(x_t)} is a smooth and deterministic
#'trend function and \eqn{\epsilon_t} are stationary errors with
#'\eqn{E(\epsilon_t) = 0} and short-range dependence (see also Beran and Feng,
#'2002). With this function \eqn{m(x_t)} can be estimated without a parametric
#'model assumption for the error series. Thus, after estimating and removing
#'the trend, any suitable parametric model, e.g. an ARMA(\eqn{p,q}) model, can
#'be fitted to the residuals (see \code{\link[stats]{arima}}).
#'
#'The iterative-plug-in (IPI) algorithm, which numerically minimizes the
#'Asymptotic Mean Squared Error (AMISE), was proposed by Feng, Gries
#'and Fritz (2020).
#'
#'Define \eqn{I[m^{(k)}] = \int_{c_b}^{d_b} [m^{(k)}(x)]^2 dx}{I[m^(k)] =
#'int_[c_b]^[d_b] \{m^(k)\}^2 dx}, \eqn{\beta_{(\nu, k)} = \int_{-1}^{1} u^k
#'K_{(\nu, k)}(u) du}{\beta_(\nu, k) = int_[-1]^[1] u^k K_(\nu, k)(u) du}
#'and \eqn{R(K) = \int_{-1}^{1} K_{(\nu, k)}^{2}(u) du}{R(K) = int_[-1]^[1]
#'\{K_(\nu, k)\}^2 du}, where \eqn{p} is the order of the polynomial,
#'\eqn{k = p + 1} is the order of the asymptotically equivalent kernel,
#'\eqn{\nu} is the order of the trend function's derivative, \eqn{0 \leq c_{b}
#'< d_{b} \leq 1}{0 \le c_b < d_b \le 1}, \eqn{c_f} is the variance factor and
#'\eqn{K_{(\nu, k)}(u)}{K_(\nu, k)(u)} the \eqn{k}-th order equivalent kernel
#'obtained for the estimation of \eqn{m^{(\nu)}}{m^(\nu)} in the interior.
#'\eqn{m^{(\nu)}}{m^(\nu)} is the \eqn{\nu}-th order derivative (\eqn{\nu = 0,
#'1, 2, ...}) of the nonparametric trend.
#'
#'Furthermore, we define
#'\deqn{C_{1} = \frac{I[m^{(k)}] \beta_{(\nu, k)}^2}{(k!)^2}}{C_1 =
#'[I[m^(k)]\{\beta_(\nu, k)\}^2] / (k!)^2}
#'and
#'\deqn{C_{2} = \frac{2 \pi c_{f} (d_b - c_b) R(K)}{nh^{2 \nu + 1}}}{C_2 =
#'2\pi(d_b - c_b)R(K)c_f / (nh^[2\nu + 1])}
#'with \eqn{h} being the bandwidth and \eqn{n} being the number of
#'observations. The AMISE is then
#'\deqn{AMISE(h) = h^{2(k-\nu)}C_{1} + C_{2}.}{AMISE(h) = h^[2(k - \nu)] C_1 +
#'C_2.}
#'
#'The function calculates suitable estimates for \eqn{c_f}, the variance
#'factor, and \eqn{I[m^{(k)}]}{I[m^(k)]} over different iterations. In each
#'iteration, a bandwidth is obtained in accordance with the AMISE that once
#'more serves as an input for the following iteration. The process repeats
#'until either convergence or the 40th iteration is reached. For further
#'details on the asymptotic theory or the algorithm, please consult Feng, Gries
#'and Fritz (2020) or Feng et al. (2019).
#'
#'To apply the function, more arguments are needed compared to the similar
#'function \code{\link{msmooth}}: a data input \code{y}, an order of polynomial
#'\code{p}, a kernel weighting function defined by the smoothness parameter
#'\code{mu}, a variance factor estimation method \code{Mcf}, an inflation rate
#'setting \code{InfR} (see also Beran and Feng, 2002), a starting value for the
#'relative bandwidth \code{bStart}, an inflation setting for the variance
#'factor estimation \code{bvc}, a boundary method \code{bb}, a boundary cut-off
#'percentage \code{cb} and a final smoothing method \code{method}.
#'In fact, aside from the input vector \code{y}, every argument has a default
#'setting that can be adjusted for the individual case. Theoretically, the
#'initial bandwidth does not affect the selected optimal bandwidth. However, in
#'practice local minima of the AMISE might exist and influence the selected
#'bandwidth. Therefore, the default setting is \code{bStart = 0.15}, which is a
#'compromise between the starting values \code{bStart = 0.1} for \code{p = 1}
#'and \code{bStart = 0.2} for \code{p = 3} that were proposed by Feng, Gries
#'and Fritz (2020). In the rare case of a clearly unsuitable optimal bandwidth,
#'a starting bandwidth that differs from the default value is a first
#'possible approach to obtain a better result. Other argument adjustments can
#'be tried as well. For more specific information on the input arguments
#'consult the section \emph{Arguments}.
#'
#'When applying the function, an optimal bandwidth is obtained based on the
#'IPI algorithm proposed by Feng, Gries and Fritz (2020). In a second step,
#'the nonparametric trend of the series is calculated with respect
#'to the chosen bandwidth and the selected regression method (\code{lpf} or
#'\code{kr}). Please note that \code{method = "lpf"} is strongly recommended by
#'the authors. Moreover, it is notable that \code{p} is automatically set to
#'\code{1} for \code{method = "kr"}. The output object is then a list that
#'contains, among other components, the original time series, the estimated
#'trend values and the series without the trend.
#'
#'The default print method for this function delivers only key numbers such as
#'the iteration steps and the generated optimal bandwidth rounded to the fourth
#'decimal. The exact numbers and results such as the estimated nonparametric
#'trend series are saved within the output object and can be addressed via the
#'\code{$} sign.
#'
#'NOTE:
#'
#'With package version 1.1.0, this function implements C++ code by means
#'of the \code{\link[Rcpp:Rcpp-package]{Rcpp}} and
#'\code{\link[RcppArmadillo:RcppArmadillo-package]{RcppArmadillo}} packages for
#'better performance.
#'
#'@return The function returns a list with different components:
#'
#'\describe{
#'\item{AR.BIC}{the Bayesian Information Criterion of the optimal AR(\eqn{p})
#'model when estimating the variance factor via autoregressive models
#'(if calculated; calculated for \code{alg = "OA"} and \code{alg = "NA"}).}
#'\item{ARMA.BIC}{the Bayesian Information Criterion of the optimal
#'ARMA(\eqn{p,q}) model when estimating the variance factor via
#'autoregressive-moving-average models (if calculated; calculated for
#'\code{alg = "OAM"} and \code{alg = "NAM"}).}
#'\item{cb}{the percentage of omitted observations on each side of the
#'observation period.}
#'\item{b0}{the optimal bandwidth chosen by the IPI-algorithm.}
#'\item{bb}{the boundary bandwidth method used within the IPI; always equal to
#'1.}
#'\item{bStart}{the starting value of the (relative) bandwidth; input
#'argument.}
#'\item{bvc}{indicates whether an enlarged bandwidth was used for the variance
#'factor estimation or not; depends on the chosen algorithm.}
#'\item{cf0}{the estimated variance factor; in contrast to the definitions
#'given in the \emph{Details} section, this object actually contains an
#'estimated value of \eqn{2\pi c_f}, i.e. it corresponds to the estimated sum
#'of autocovariances.}
#'\item{cf0.AR}{the estimated variance factor obtained by estimation of
#'autoregressive models (if calculated; \code{alg = "OA"} or \code{"NA"}).}
#'\item{cf0.ARMA}{the estimated variance factor obtained by estimation of
#'autoregressive-moving-average models (if calculated; calculated for
#'\code{alg = "OAM"} and \code{alg = "NAM"}).}
#'\item{cf0.LW}{the estimated variance factor obtained by Lag-Window Spectral
#'Density Estimation following Bühlmann (1996) (if calculated; calculated for
#'algorithms \code{"A"}, \code{"B"}, \code{"O"} and \code{"N"}).}
#'\item{cf0.MA}{the estimated variance factor obtained by estimation of
#'moving-average models (if calculated; calculated for \code{alg = "OM"} and
#'\code{alg = "NM"}).}
#'\item{I2}{the estimated value of \eqn{I[m^{(k)}]}{I[m^(k)]}.}
#'\item{InfR}{the setting for the inflation rate according to the chosen
#'algorithm.}
#'\item{iterations}{the bandwidths of the single iterations steps}
#'\item{L0.opt}{the optimal bandwidth for the lag-window spectral density
#'estimation (if calculated; calculated for algorithms \code{"A"}, \code{"B"},
#'\code{"O"} and \code{"N"}).}
#'\item{MA.BIC}{the Bayesian Information Criterion of the optimal MA(\eqn{q})
#'model when estimating the variance factor via moving-average models (if
#'calculated; calculated for \code{alg = "OM"} and \code{alg = "NM"}).}
#'\item{Mcf}{the estimation method for the variance factor estimation; depends
#'on the chosen algorithm.}
#'\item{mu}{the smoothness parameter of the second order kernel; input
#'argument.}
#'\item{n}{the number of observations.}
#'\item{niterations}{the total number of iterations until convergence.}
#'\item{orig}{the original input series; input argument.}
#'\item{p.BIC}{the order p of the optimal AR(\eqn{p}) or ARMA(\eqn{p,q}) model
#'when estimating the variance factor via autoregressive or
#'autoregressive-moving average models (if calculated; calculated for
#'\code{alg = "OA"}, \code{alg = "NA"}, \code{alg = "OAM"} and
#'\code{alg = "NAM"}).}
#'\item{p}{the order of polynomial used in the IPI-algorithm; also used for the
#'final smoothing, if \code{method = "lpr"}; input argument.}
#'\item{q.BIC}{the order \eqn{q} of the optimal MA(\eqn{q}) or ARMA(\eqn{p,q})
#'model when estimating the variance factor via moving-average or
#'autoregressive-moving average models (if calculated; calculated for
#'\code{alg = "OM"},
#'\code{alg = "NM"}, \code{alg = "OAM"} and \code{alg = "NAM"}).}
#'\item{res}{the estimated residual series.}
#'\item{v}{the considered order of derivative of the trend; is always zero for
#'this function.}
#'\item{ws}{the weighting system matrix used within the local polynomial
#'regression; this matrix is a condensed version of a complete weighting system
#'matrix; in each row of \code{ws}, the weights for conducting the smoothing
#'procedure at a specific observation time point can be found; the first
#'\eqn{[nb + 0.5]} rows, where \eqn{n} corresponds to the number of
#'observations, \eqn{b} is the bandwidth considered for smoothing and
#'\eqn{[.]} denotes the integer part, contain the weights at the
#'\eqn{[nb + 0.5]} left-hand boundary points; the weights in row
#'\eqn{[nb + 0.5] + 1} are representative for the estimation at all
#'interior points and the remaining rows contain the weights for the right-hand
#'boundary points; each row has exactly \eqn{2[nb + 0.5] + 1} elements,
#'more specifically the weights for observations of the nearest
#'\eqn{2[nb + 0.5] + 1} time points; moreover, the weights are normalized,
#'i.e. the weights are obtained under consideration of the time points
#'\eqn{x_t = t/n}, where \eqn{t = 1, 2, ..., n}.}
#'\item{ye}{the nonparametric estimates of the trend.}
#'}
#'
#'@export
#'
#'@references
#' Beran, J. and Feng, Y. (2002). Local polynomial fitting with long-memory,
#' short-memory and antipersistent errors. Annals of the Institute of
#' Statistical Mathematics, 54(2), 291-311.
#'
#' Bühlmann, P. (1996). Locally adaptive lag-window spectral estimation.
#' Journal of Time Series Analysis, 17(3), 247-270.
#'
#' Feng, Y., Gries, T. and Fritz, M. (2020). Data-driven
#' local polynomial for the trend and its derivatives in economic time
#' series. Journal of Nonparametric Statistics, 32:2, 510-533.
#'
#' Feng, Y., Gries, T., Letmathe, S. and Schulz, D. (2019). The smoots package
#' in R for semiparametric modeling of trend stationary time series. Discussion
#' Paper. Paderborn University. Unpublished.
#'
#'@author
#'\itemize{
#'\item Yuanhua Feng (Department of Economics, Paderborn University), \cr
#'Author of the Algorithms \cr
#'Website: \url{https://wiwi.uni-paderborn.de/en/dep4/feng/}
#'\item Dominik Schulz (Research Assistant) (Department of Economics, Paderborn
#'University), \cr
#'Package Creator and Maintainer
#'}
#'
#'@examples
#'### Example 1: US-GDP ###
#'
#'# Logarithm of test data
#'# -> the logarithm of the data is assumed to follow the additive model
#'test_data <- gdpUS
#'y <- log(test_data$GDP)
#'
#'# Applied tsmooth function for the trend
#'result <- tsmooth(y, p = 1, mu = 1, Mcf = "NP", InfR = "Opt",
#'  bStart = 0.1, bvc = "Y")
#'trend1 <- result$ye
#'
#'# Plot of the results
#'t <- seq(from = 1947, to = 2019.25, by = 0.25)
#'plot(t, y, type = "l", xlab = "Year", ylab = "log(US-GDP)", bty = "n",
#'  lwd = 1, lty = 3,
#'  main = "Estimated trend for log-quarterly US-GDP, Q1 1947 - Q2 2019")
#'points(t, trend1, type = "l", col = "red", lwd = 1)
#'title(sub = expression(italic("Figure 1")), col.sub = "gray47",
#'  cex.sub = 0.6, adj = 0)
#'result
#'
#'\dontrun{
#'### Example 2: German Stock Index ###
#'
#'# The following procedure can be considered, if (log-)returns are assumed
#'# to follow a model from the general class of semiparametric GARCH-type
#'# models (including Semi-GARCH, Semi-Log-GARCH and Semi-APARCH models among
#'# others) with a slowly changing variance over time due to a deterministic,
#'# nonparametric scale function.
#'
#'# Obtain the logarithm of the squared returns
#'returns <- diff(log(dax$Close))   # (log-)returns
#'rt <- returns - mean(returns)     # demeaned (log-)returns
#'yt <- log(rt^2)                   # logarithm of the squared returns
#'
#'# Apply 'smoots' function to the log-data, because the logarithm of
#'# the squared returns follows an additive model with a nonparametric trend
#'# function, if the returns are assumed to follow a semiparametric GARCH-type
#'# model.
#'
#'# In this case, the optimal inflation rate is used for p = 3.
#'est <- tsmooth(yt, p = 3, InfR = "Opt")
#'m_xt <- est$ye                    # estimated trend values
#'
#'# Obtain the standardized returns 'eps' and the scale function 'scale.f'
#'res <- est$res                    # the detrended log-data
#'C <- -log(mean(exp(res)))         # an estimate of a constant value needed
#'                                   # for the retransformation
#'scale.f <- exp((m_xt - C) / 2)    # estimated values of the scale function in
#'                                   # the returns
#'eps <- rt / scale.f               # the estimated standardized returns
#'
#'# -> 'eps' can now be analyzed by any suitable GARCH-type model.
#'#    The total volatilities are then the product of the conditional
#'#    volatilities obtained from 'eps' and the scale function 'scale.f'.
#'}

# The main function------------------------------------------------------------

tsmooth <- function(y, p = c(1, 3), mu = c(0, 1, 2, 3),
                    Mcf = c("NP", "ARMA", "AR", "MA"),
                    InfR = c("Opt", "Nai", "Var"), bStart = 0.15,
                    bvc = c("Y", "N"), bb = c(0, 1), cb = 0.05,
                    method = c("lpr", "kr")) {

     # Input if no inputs were made to specific arguments

     if (length(y) <= 1 || !all(!is.na(y)) || !is.numeric(y)) {
       stop("The argument 'y' must be a numeric vector with length > 1 and ",
            "without NAs.")
     }

     if (!length(p) %in% c(1, 2) || !all(!is.na(p)) || !is.numeric(p)) {
       stop("The argument 'p' must be a single integer value (either 1 or 3).")
     }
     p <- floor(p)

     if (!length(mu) %in% c(1, 4) || !all(!is.na(mu)) || !is.numeric(mu)) {
       stop("The argument 'mu' must be a single integer value (0, 1, 2 or 3).")
     }
     mu <- floor(mu)

     if (!length(Mcf) %in% c(1, 4) || !all(!is.na(Mcf)) ||
       !is.character(Mcf)) {
       stop("The argument 'Mcf' must be a single non-NA character value.")
     }

     if (!length(InfR) %in% c(1, 3) || !all(!is.na(Mcf)) ||
       !is.character(Mcf)) {
       stop("The argument 'InfR' must be a single non-NA character value.")
     }

     if (length(bStart) != 1 || is.na(bStart) || !is.numeric(bStart) ||
         (bStart <= 0)) {
       stop("The argument 'bStart' must be a single non-NA double value with ",
            "bStart > 0.")
     }

     if (!length(bvc) %in% c(1, 2) || !all(!is.na(bvc)) ||
       !is.character(bvc)) {
       stop("The argument 'bvc' must be a single non-NA character value.")
     }

     if (!length(bb) %in% c(1, 2) || !all(!is.na(bb)) || !is.numeric(bb)) {
       stop("The argument 'bb' must be a single non-NA integer (either 0 or ",
         "1).")
     }
     bb <- floor(bb)

     if (length(cb) != 1 || is.na(cb) || !is.numeric(cb) ||
         (cb < 0 || cb >= 0.5)) {
       stop("The argument 'cb' must be a single non-NA double value with ",
            "0 <= cb < 0.5.")
     }

     if (!length(method) %in% c(1, 2) || !all(!is.na(method)) ||
         !is.character(method)) {
       stop("The argument 'method' must be a single non-NA character value.")
     }

     if (all(mu == c(0, 1, 2, 3))) mu <- 1
     if (all(Mcf == c("NP", "ARMA", "AR", "MA"))) Mcf <- "NP"
     if (all(InfR == c("Opt", "Nai", "Var"))) InfR <- "Opt"
     if (all(bvc == c("Y", "N"))) bvc <- "Y"
     if (all(bb == c(0, 1))) bb <- 1
     if (all(method == c("lpr", "kr"))) method <- "lpr"

     if (length(method) != 1 || !(method %in% c("lpr", "kr"))) {
       stop("Input of argument 'method' incorrect. Method not recognized.")
     }

     if (all(p == c(1, 3)) || method == "kr") p <- 1

     # Stop, if incorrect inputs were given
     if (length(p) != 1 || !(p %in% c(1, 3))) {
       stop("Input of argument 'p' incorrect. It must be either 1 or 3.")
     }
     if (length(mu) != 1 || !(mu %in% 0:4)) {
       stop("Input of argument 'mu' incorrect. It must be an integer from 0 ",
         "to 4.")
     }
     if (length(Mcf) != 1 || !(Mcf %in% c("NP", "ARMA", "AR", "MA"))) {
       stop("Input of argument 'Mcf' incorrect. Method not recognized.")
     }
     if (length(InfR) != 1 || !(InfR %in% c("Opt", "Nai", "Var"))) {
       stop("Input of argument 'InfR' incorrect. Input not recognized.")
     }
     if (length(bvc) != 1 || !(bvc %in% c("Y", "N"))) {
       stop("Input of argument 'bvc' incorrect. Input not recognized.")
     }
     if(length(bb) != 1 || !(bb %in% c(0, 1))) {
       stop("Input of argument 'bb' incorrect. It must be either 0 or 1.")
     }

     results <- tsmoothCalc(y = y, p = p, mu = mu, Mcf = Mcf, InfR = InfR,
       bStart = bStart, bvc = bvc, bb = bb, cb = cb, method = method)

     class(results) <- "smoots"
     attr(results, "function") <- "tsmooth"
     results
}
# End of the function