R/TempDisaggDGP.R
In DisaggregateTS: High-Dimensional Temporal Disaggregation

Documented in TempDisaggDGP

#' High and Low-Frequency Data Generating Processes
#'
#' This function generates a high-frequency response vector \eqn{y}, following the relationship \eqn{y = X\beta + \epsilon}, where \eqn{X} is a matrix of indicator series and \eqn{\beta} is a potentially sparse coefficient vector. The low-frequency vector \eqn{Y} is generated by aggregating \eqn{y} according to a specified aggregation method.
#'
#' The aggregation ratio (`aggRatio`) determines the ratio between the low and high-frequency series (e.g., `aggRatio = 4` for annual-to-quarterly). If the number of observations \eqn{n} exceeds \eqn{aggRatio \times n_l}, the aggregation matrix will include zero columns for the extrapolated values.
#'
#' The function supports several data generating processes (DGP) for the residuals, including 'Denton', 'Denton-Cholette', 'Chow-Lin', 'Fernandez', and 'Litterman'. These methods differ in how they generate the high-frequency data and residuals, with optional autocorrelation specified by `rho`.
#'
#' @param n_l 	   Integer. Size of the low-frequency series.
#' @param n  	   Integer. Size of the high-frequency series.
#' @param aggRatio Integer. Aggregation ratio between low and high frequency (default is 4).
#' @param p        Integer. Number of high-frequency indicator series to include.
#' @param beta  	 Numeric. Value for the positive and negative elements of the coefficient vector.
#' @param sparsity Numeric. Sparsity percentage of the coefficient vector (value between 0 and 1).
#' @param method 	 Character. The DGP of residuals to use ('Denton', 'Denton-Cholette', 'Chow-Lin', 'Fernandez', 'Litterman').
#' @param aggMat 	 Character. Aggregation matrix type ('first', 'sum', 'average', 'last').
#' @param rho		   Numeric. Residual autocorrelation coefficient (default is 0).
#' @param mean_X 	 Numeric. Mean of the design matrix (default is 0).
#' @param sd_X 		 Numeric. Standard deviation of the design matrix (default is 1).
#' @param sd_e		 Numeric. Standard deviation of the errors (default is 1).
#' @param simul    Logical. If `TRUE`, the design matrix and the coefficient vector are fixed (default is `FALSE`).
#' @param sparse_option Character or Integer. Option to specify sparsity in the coefficient vector ('random' or integer value). Default is "random".
#' @param setSeed	 Integer. Seed value for reproducibility when `simul` is set to `TRUE`.
#' @return A list containing the following components:
#' \itemize{
#'   \item \code{y_Gen}: Generated high-frequency response series (an \eqn{n \times 1} matrix).
#'   \item \code{Y_Gen}: Generated low-frequency response series (an \eqn{n_l \times 1} matrix).
#'   \item \code{X_Gen}: Generated high-frequency indicator series (an \eqn{n \times p} matrix).
#'   \item \code{Beta_Gen}: Generated coefficient vector (a \eqn{p \times 1} matrix).
#'   \item \code{e_Gen}: Generated high-frequency residual series (an \eqn{n \times 1} matrix).
#' }
#' @keywords DGP sparse high-frequency low-frequency
#' @import zoo withr
#' @export
#' @examples
#' data <- TempDisaggDGP(n_l=25,n=100,p=10,rho=0.5)
#' X <- data$X_Gen
#' Y <- data$Y_Gen
#' @importFrom Rdpack reprompt
#' @importFrom stats lm rbinom rnorm

TempDisaggDGP <- function(n_l, n, aggRatio = 4, p = 1, beta = 1, sparsity = 1, method = 'Chow-Lin', aggMat = 'sum', rho = 0, mean_X = 0, sd_X = 1, sd_e = 1, simul = FALSE, sparse_option = "random", setSeed = 42){

  # Check if rho is valid
  if(rho >= 1 || rho <= -1) {
    stop("For the Chow-Lin method 'rho' must be between -1 and 1.")
  }

  # Generate random vector of coefficients for the DGP
  if(simul == TRUE){
    w <- with_seed(setSeed, matrix(data = rbinom(n = p, size = 1, prob = 0.5), ncol = 1))
  } else {
    w <- matrix(data = rbinom(n = p, size = 1, prob = 0.5), ncol = 1)
  }

  beta <- w * beta - (1 - w) * beta

  # Handle sparsity
  if(sparse_option == "random") {
    if(sparsity != 1) {
      if(sparsity > 1) {
        stop("The 'sparsity' input can only take values in (0,1].")
      } else {
        s <- round(sparsity * p)
        if(simul == TRUE){
          toReplace <- with_seed(setSeed, sample(p, size = s))
        } else {
          toReplace <- sample(p, size = s)
        }
        beta <- replace(beta, list = toReplace, values = 0)
      }
    }
  } else if(is.numeric(sparse_option) && sparse_option == as.integer(sparse_option)) {
    if(sparse_option > p) {
      stop("The 'sparse_option' integer input should be less than or equal to 'p'.")
    } else {
      toReplace <- (length(beta) - sparse_option + 1):length(beta)
      beta <- replace(beta, list = toReplace, values = 0)
    }
  } else {
    stop("If 'sparse_option' is not 'random', it must be an integer.")
  }

  # Generate X matrix based on the method
  if(method == 'Denton-Cholette'){
    if(p > 1){
      stop("For the Denton-Cholette method, p must be 1.")
    }
    if(simul == TRUE){
      X <- with_seed(setSeed, matrix(data = rnorm(n, mean = mean_X, sd = sd_X), ncol = 1))
    } else {
      X <- matrix(data = rnorm(n, mean = mean_X, sd = sd_X), ncol = 1)
    }
  } else {
    if(simul == TRUE){
      X <- with_seed(setSeed, matrix(data = rnorm(n * p, mean = mean_X, sd = sd_X), ncol = p, nrow = n))
    } else {
      X <- matrix(data = rnorm(n * p, mean = mean_X, sd = sd_X), ncol = p, nrow = n)
    }
  }

  # Generate residuals and high-frequency series based on the method
  if(method == 'Denton-Cholette'){
    e <- matrix(data = rnorm(n, mean = 0, sd = sd_e), ncol = 1)
    y <- matrix(data = (X + e), ncol = 1)
    beta <- 1
  } else if(method == 'Chow-Lin'){
    e <- matrix(data = 0, nrow = n)
    e[1] <- rnorm(1, mean = 0, sd = 1)/sqrt(1 - rho^2)
    for(i in 2:nrow(e)){
      u <- rnorm(1, mean = 0, sd = 1)
      e[i] <- rho * e[i-1] + u
    }
    y <- matrix(data = (X %*% beta + e), ncol = 1)
  } else if(method == 'Fernandez' || method == 'Litterman'){
    if(method == 'Fernandez'){
      rho <- 0
    }
    e <- matrix(data = 0, nrow = n)
    u <- matrix(data = 0, nrow = n)
    e[1] <- rnorm(1, mean = 0, sd = 1)
    u[1] <- rnorm(1, mean = 0, sd = 1)/sqrt(1 - rho^2)
    for(i in 2:nrow(e)){
      nu <- rnorm(1, mean = 0, sd = 1)
      u[i] <- rho * u[i-1] + nu
      e[i] <- e[i-1] + u[i]
    }
    y <- matrix(data = (X %*% beta + e), ncol = 1)
  }

  # Check the number of full observations and generate the aggregation matrix
  nfull <- aggRatio * n_l
  extr <- n - nfull

  if(nfull > n) {
    stop("X does not have enough observations.")
  }

  if(aggMat == 'sum'){
    C <- kronecker(diag(n_l), matrix(data = 1, nrow = 1, ncol = aggRatio))
    C <- cbind(C, matrix(0L, n_l, extr))
  } else if(aggMat == 'average'){
    C <- kronecker(diag(n_l), matrix(data = 1/aggRatio, nrow = 1, ncol = aggRatio))
    C <- cbind(C, matrix(0L, n_l, extr))
  } else if(aggMat == 'first'){
    C <- kronecker(diag(n_l), matrix(data = c(1, rep(0, times = aggRatio - 1)), nrow = 1, ncol = aggRatio))
    C <- cbind(C, matrix(0L, n_l, extr))
  } else if(aggMat == 'last'){
    C <- kronecker(diag(n_l), matrix(data = c(rep(0, times = aggRatio - 1), 1), nrow = 1, ncol = aggRatio))
    C <- cbind(C, matrix(0L, n_l, extr))
  }

  Y <- C %*% y

  # Return the generated data
  data_list <- list(y, Y, X, beta, e)
  names(data_list) <- c("y_Gen", "Y_Gen", "X_Gen", "Beta_Gen", "e_Gen")

  return(data_list)
}