In boennecd/rollRegres: Fast Rolling and Expanding Window Linear Regression

options(width = 80)
knitr::opts_chunk$set(collapse = TRUE, comment = "#R", error = FALSE)

This vignette shows comparisons in terms of computation time with other packages. Specifically, we will make comparisons with the roll package and zoo package. It should be stressed though that the other solutions do additional things than this package does. E.g., there is not performed any rank test in the function in this package. We start by showing the comparisons of computation times and then we show different options. Some function definitions are shown at the end.

Comparisons

#####
# simple R version
roll_regress_R_for_loop <- function(X, y, width){
  n <- nrow(X)
  p <- ncol(X)
  out <- matrix(NA_real_, n, p)

  for(i in width:n){
    idx <- (i - width + 1L):i
    out[i, ] <- lm.fit(X[idx, , drop = FALSE], y[idx])$coefficients
  }

  out
}

#####
# zoo version
.zoo_inner <- function(Z) {
  fit <- lm.fit(Z[, -1, drop = FALSE], Z[, 1])
  fit$coefficients
}
library(zoo)
roll_regress_zoo <- function(x, y, width){
  Z <- cbind(y, X)
  rollapply(Z, width, FUN = .zoo_inner,  
            by.column = FALSE,  align = "right", fill = NA_real_)
}

#####
# roll_lm
library(roll)
roll_lm_func <- function(x, y ,width)
  roll_lm(X, matrix(y, ncol = 1), wdth, intercept = FALSE)$coefficients

# use one thread as other methods are easily to compute in parallel too
RcppParallel::setThreadOptions(numThreads = 1)

# compile functions
library(compiler)
roll_regress_R_for_loop <- cmpfun(roll_regress_R_for_loop)
.zoo_inner              <- cmpfun(.zoo_inner)
roll_regress_zoo        <- cmpfun(roll_regress_zoo)
roll_lm_func            <- cmpfun(roll_lm_func)

We start by simulating the data

set.seed(32981054)
n <- 10000
p <- 6
wdth = 50
X <- matrix(rnorm(p * n), n, p)
y <- drop(X %*% runif(p)) + rnorm(n)
df <- data.frame(y, X)
frm <- eval(parse(text = paste0(
  "formula(y ~ -1 + ", paste0("X", 1:p, collapse = " + "), ")")))

Then we check that the functions give the same (the function definitions are at the end of this document)

library(rollRegres)
base_res <- roll_regress_R_for_loop(X, y, wdth)
all.equal(base_res, roll_regres.fit(X, y, wdth)$coefs, 
          check.attributes = FALSE)
all.equal(base_res, roll_regres(frm, df, wdth)$coefs, 
          check.attributes = FALSE)
all.equal(base_res, roll_regress_zoo(X, y, wdth), 
          check.attributes = FALSE)
all.equal(base_res, roll_lm_func(X, y, wdth), 
          check.attributes = FALSE)

and here we compare the computation time

microbenchmark::microbenchmark(
  roll_regress            = roll_regres.fit(X, y, wdth),
  # this will be slower due to call to `model.matrix` and `model.frame`
  roll_regress_df         = roll_regres(frm, df, wdth),
  roll_regress_zoo        = roll_regress_zoo(X, y, wdth),
  roll_regress_R_for_loop = roll_regress_R_for_loop(X, y, wdth),
  roll_lm = roll_lm_func(X, y, wdth),
  times = 5)

# here is the formula used above
frm

Additional features

This section will cover some additional features.

Expanding window

Here are expanding window regressions with additional output

#####
# simulate data
set.seed(65731482)
n <- 100
p <- 2
X <- matrix(rnorm(p * n), n, p)
y <- drop(X %*% runif(p)) + rnorm(n)

#####
# use package function
pck_out <- roll_regres.fit(
  X, y, width = 50L, do_downdates = FALSE, 
  do_compute = c("sigmas", "r.squareds", "1_step_forecasts"))

#####
# assign R-version
R_func <- function(X, y, width){
  n <- nrow(X)
  p <- ncol(X)
  out <- matrix(NA_real_, n, p)
  sigmas             <- rep(NA_real_, n)
  r.squared          <- rep(NA_real_, n)
  one_step_forecasts <- rep(NA_real_, n)

  for(i in width:n){
    idx <- 1:i
    fit <- lm(y[idx] ~ -1 + X[idx, , drop = FALSE])
    out[i, ] <- fit$coefficients

    su <- summary(fit)
    sigmas[i] <- su$sigma

    ss1 <- sum((y[idx] - mean(y[idx]))^2)
    ss2 <- sum(fit$residuals^2)
    r.squared[i] <- 1 - ss2 / ss1

    if(i < n){
      next_i <- i + 1L
      one_step_forecasts[next_i] <- fit$coefficients %*% X[next_i, ]
    }
  }

  list(coef = out, sigmas = sigmas, r.squared = r.squared,
       one_step_forecasts = one_step_forecasts)
}

R_out <- R_func(X, y, 50L)

#####
# gives the same
stopifnot(
  isTRUE(all.equal(R_out$coef              , pck_out$coefs)), 
  isTRUE(all.equal(R_out$sigmas            , pck_out$sigmas)), 
  isTRUE(all.equal(R_out$r.squared         , pck_out$r.squared)), 
  isTRUE(all.equal(R_out$one_step_forecasts, pck_out$one_step_forecasts)))

Update in blocks

You can use the grp argument to make updates in blocks. E.g., here is an example with weekly data

#####
# simulate data
set.seed(68799379)
week <- as.integer(gl(25, 7))
head(week[1:10])
n <- length(week)
p <- 2
X <- matrix(rnorm(p * n), n, p)
y <- drop(X %*% runif(p)) + rnorm(n)

#####
# use package function
pck_out <- roll_regres.fit(
  X, y, width = 10L, grp = week, 
  do_compute = c("sigmas", "r.squareds", "1_step_forecasts"))

#####
# assign R-version
R_func <- function(X, y, width, grp){
  grp <- grp + 1L - min(grp)
  u_grp = unique(grp)
  n <- nrow(X)
  p <- ncol(X)
  out <- matrix(NA_real_, n, p)
  sigmas             <- rep(NA_real_, n)
  r.squared          <- rep(NA_real_, n)
  one_step_forecasts <- rep(NA_real_, n)

  start_val <- min(which(u_grp >= width))
  for(g in u_grp[start_val:length(u_grp)]){
    idx <- which(grp %in% (g - width + 1L):g)
    i <- which(grp == g)
    fit <- lm(y[idx] ~ -1 + X[idx, , drop = FALSE])
    out[i, ] <- sapply(fit$coefficients, rep, times = length(i))

    su <- summary(fit)
    sigmas[i] <- su$sigma

    ss1 <- sum((y[idx] - mean(y[idx]))^2)
    ss2 <- sum(fit$residuals^2)
    r.squared[i] <- 1 - ss2 / ss1

    if(g != max(grp)){
      next_g <- grp[min(which(grp > g))]
      next_g <- which(grp == next_g)
      one_step_forecasts[next_g] <- X[next_g, ] %*% fit$coefficients
    }
  }

  list(coef = out, sigmas = sigmas, r.squared = r.squared,
       one_step_forecasts = one_step_forecasts)
}

R_out <- R_func(X, y, 10L, grp = week)

#####
# gives the same
stopifnot(
  isTRUE(all.equal(R_out$coef              , pck_out$coefs)), 
  isTRUE(all.equal(R_out$sigmas            , pck_out$sigmas)), 
  isTRUE(all.equal(R_out$r.squared         , pck_out$r.squared)), 
  isTRUE(all.equal(R_out$one_step_forecasts, pck_out$one_step_forecasts)))

Block with minimum required number of observation

Suppose that we are performing rolling regression on stock data over a yearly window. Further, suppose that there are some gaps in the data where we do not have data and we require at least 6 months of data. This can be done as follows

#####
# simulate data
set.seed(96235555)
n   <- 10L * 12L * 21L # x years w/ 12 months of 21 trading days
month <- (seq_len(n) - 1L) %/% 21L + 1L # group by months
set.seed(29478439)
X <- matrix(rnorm(n * 4L), ncol = 4L)
y <- rnorm(n)

# randomly drop rows
keep <- seq_along(y) %in% sample.int(nrow(X), as.integer(n * .5))
X     <- X    [keep, ]
y     <- y    [keep]
month <- month[keep]

#####
# use package function
pck_out <- roll_regres.fit(
  X, y, width = 12L, grp = month, min_obs = 21L * 6L, do_downdates = TRUE, 
  do_compute = c("sigmas", "r.squareds"))

#####
# assign R-version
R_func <- function(X, y, width, grp, downdate, min_obs){
  grp <- grp + 1L - min(grp)
  u_grp = unique(grp)
  n <- nrow(X)
  p <- ncol(X)
  out <- matrix(NA_real_, n, p)
  sigmas    <- rep(NA_real_, n)
  r.squared <- rep(NA_real_, n)

  start_val <- min(which(u_grp >= width))
  for(g in u_grp[start_val:length(u_grp)]){
    idx <-
      if(downdate)
        which(grp %in% (g - width + 1L):g) else
          which(grp %in% 1:g)
    i <- which(grp == g)
    if(length(idx) < min_obs)
      next
    fit <- lm(y[idx] ~ -1 + X[idx, , drop = FALSE])
    out[i, ] <- sapply(fit$coefficients, rep, times = length(i))

    su <- summary(fit)
    sigmas[i] <- su$sigma

    ss1 <- sum((y[idx] - mean(y[idx]))^2)
    ss2 <- sum(fit$residuals^2)
    r.squared[i] <- 1 - ss2 / ss1
  }

  list(coef = out, sigmas = sigmas, r.squared = r.squared)
}

R_out <- R_func(X, y, width = 12L, downdate = TRUE, grp = month, 
                min_obs = 21L * 6L)

#####
# gives the same
stopifnot(
  isTRUE(all.equal(R_out$coef     , pck_out$coefs)),
  isTRUE(all.equal(R_out$sigmas   , pck_out$sigmas)),
  isTRUE(all.equal(R_out$r.squared, pck_out$r.squared)))