R/modsel.R

Defines functions average_forecast split_data MASE MAPE MSE lf_range_to_hf select_and_forecast find_mls_terms variables_to_formula add_expressions amidas_table expand_amidas print.lws_table expand_weights_lags prepare_model_frame formula_table print.midas_r_ic_table make_ic_table update.midas_r_ic_table midas_r_ic_table weights_table modsel term_info last_term_info lf_lags_table hf_lags_table

Documented in amidas_table average_forecast expand_amidas expand_weights_lags hf_lags_table lf_lags_table midas_r_ic_table modsel select_and_forecast split_data weights_table

if (getRversion() >= "2.15.1") utils::globalVariables("X")

##' Create a high frequency lag selection table for MIDAS regression model
##'
##' Creates a high frequency lag selection table for MIDAS regression model with given information criteria and minimum and maximum lags.
##' @param formula the formula for MIDAS regression, the lag selection is performed for the last MIDAS lag term in the formula
##' @param data a list containing data with mixed frequencies
##' @param start the starting values for optimisation
##' @param from a named list, or named vector with lag numbers which are the beginings of MIDAS lag structures. The names should correspond to the MIDAS lag terms in the formula for which to do the lag selection. Value NA indicates lag start at zero
##' @param to a named list where each element is a vector with two elements. The first element is the lag number from which the lag selection starts, the second is the lag number at which the lag selection ends. NA indicates lowest (highest) lag numbers possible.
##' @param IC the information criteria which to compute
##' @param test the names of statistical tests to perform on restricted model, p-values are reported in the columns of model selection table
##' @param Ofunction see \link{midasr}
##' @param weight_gradients see \link{midas_r}
##' @param ... additional parameters to optimisation function, see \link{midas_r}
##' @return a \code{midas_r_iclagtab} object which is the list with the following elements:
##'
##' \item{table}{the table where each row contains calculated information criteria for both restricted and unrestricted MIDAS regression model with given lag structure}
##' \item{candlist}{the list containing fitted models}
##' \item{IC}{the argument IC}
##' @examples
##'
##' data("USunempr")
##' data("USrealgdp")
##' y <- diff(log(USrealgdp))
##' x <- window(diff(USunempr),start=1949)
##' trend <- 1:length(y)
##'
##' mlr <- hf_lags_table(y ~ trend + fmls(x, 12, 12,nealmon),
##'                      start = list(x=rep(0,3)),
##'                      data = list(y = y, x = x, trend = trend),
##'                      from=c(x=0),to=list(x=c(4,4)))
##' mlr
##'
##' @details This function estimates models sequentially increasing the midas lag from \code{kmin} to \code{kmax} of the last term of the given formula
##' @author Virmantas Kvedaras, Vaidotas Zemlys
##' @export
hf_lags_table <- function(formula, data, start, from, to, IC = c("AIC", "BIC"), test = c("hAh_test"), Ofunction = "optim", weight_gradients = NULL, ...) {
  if (!identical(names(from), names(to))) stop("The names of lag structure start and end should be identical")
  from <- as.list(from)
  varnames <- names(from)

  Zenv <- new.env(parent = environment(formula))
  cl <- match.call()
  mf <- match.call(expand.dots = FALSE)
  if (missing(data)) data <- NULL

  prep <- prepare_model_frame(data, Zenv, cl, mf, parent.frame())

  kmax <- round(10 * log(length(prep$y), base = 10))

  lti <- lapply(varnames, function(nm) term_info(prep$mt, nm, prep$Zenv))
  names(lti) <- varnames

  table <- mapply(function(ti, st, end) {
    wstart <- start[ti$varname]
    names(wstart) <- ti$weight
    if (is.na(st)) st <- 0
    nop <- length(wstart[[1]])
    if (is.na(end[1])) {
      end[1] <- st + nop
    }
    else {
      end[1] <- max(end[1], st + nop)
    }
    if (is.na(end[2])) {
      end[2] <- kmax
    }
    else {
      if (end[1] > end[2]) stop("The lag number which ends the selection should be larger or equal to the lag number which starts the selction")
    }
    expand_weights_lags(ti$weight, st, end, 1, wstart)
  }, lti, from, to, SIMPLIFY = FALSE)
  names(table) <- varnames

  start <- start[!(names(start) %in% varnames)]
  if (length(start) == 0) start <- NULL

  midas_r_ic_table(formula, data, start = start, table = table, IC = IC, test = test, Ofunction = Ofunction, weight_gradients = weight_gradients, ...)
}

##' Create a low frequency lag selection table for MIDAS regression model
##'
##' Creates a low frequency lag selection table for MIDAS regression model with given information criteria and minimum and maximum lags.
##' @param formula the formula for MIDAS regression, the lag selection is performed for the last MIDAS lag term in the formula
##' @param data a list containing data with mixed frequencies
##' @param start the starting values for optimisation
##' @param from a named list, or named vector with high frequency (NB!) lag numbers which are the beginnings of MIDAS lag structures. The names should correspond to the MIDAS lag terms in the formula for which to do the lag selection. Value NA indicates lag start at zero
##' @param to a named list where each element is a vector with two elements. The first element is the low frequency lag number from which the lag selection starts, the second is the low frequency lag number at which the lag selection ends. NA indicates lowest (highest) lag numbers possible.
##' @param IC the information criteria which to compute
##' @param test the names of statistical tests to perform on restricted model, p-values are reported in the columns of model selection table
##' @param Ofunction see \link{midasr}
##' @param weight_gradients see \link{midas_r}
##' @param ... additional parameters to optimisation function, see \link{midas_r}
##' @return a \code{midas_r_ic_table} object which is the list with the following elements:
##'
##' \item{table}{the table where each row contains calculated information criteria for both restricted and unrestricted MIDAS regression model with given lag structure}
##' \item{candlist}{the list containing fitted models}
##' \item{IC}{the argument IC}
##' @examples
##'
##' data("USunempr")
##' data("USrealgdp")
##' y <- diff(log(USrealgdp))
##' x <- window(diff(USunempr),start=1949)
##' trend <- 1:length(y)
##'
##' mlr <- lf_lags_table(y~trend+fmls(x,12,12,nealmon),
##'                      start=list(x=rep(0,3)),
##'                      from=c(x=0),to=list(x=c(3,4)))
##' mlr
##'
##' @details This function estimates models sequentially increasing the midas lag from \code{kmin} to \code{kmax} of the last term of the given formula
##' @author Virmantas Kvedaras, Vaidotas Zemlys
##' @export
lf_lags_table <- function(formula, data, start, from, to, IC = c("AIC", "BIC"), test = c("hAh_test"), Ofunction = "optim", weight_gradients = NULL, ...) {
  if (!identical(names(from), names(to))) stop("The names of lag structure start and end should be identical")
  from <- as.list(from)
  varnames <- names(from)

  Zenv <- new.env(parent = environment(formula))
  cl <- match.call()
  mf <- match.call(expand.dots = FALSE)
  if (missing(data)) data <- NULL

  prep <- prepare_model_frame(data, Zenv, cl, mf, parent.frame())

  kmax <- round(10 * log(length(prep$y), base = 10))

  lti <- lapply(varnames, function(nm) term_info(prep$mt, nm, prep$Zenv))
  names(lti) <- varnames

  table <- mapply(function(ti, st, end) {
    wstart <- start[ti$varname]
    names(wstart) <- ti$weight
    m <- ti$frequency
    if (is.na(st)) st <- 0
    nop <- length(wstart[[1]])
    if (is.na(end[1])) {
      end[1] <- max((st + nop) %/% m, 1)
    }
    else {
      end[1] <- max(end[1], max((st + nop) %/% m, 1))
    }
    if (is.na(end[2])) {
      end[2] <- kmax %/% m
    }
    else {
      if (end[1] > end[2]) stop("The lag number which ends the selection should be larger or equal to the lag number which starts the selction")
    }
    expand_weights_lags(ti$weight, st, end, m, wstart)
  }, lti, from, to, SIMPLIFY = FALSE)
  names(table) <- varnames

  start <- start[!(names(start) %in% varnames)]
  if (length(start) == 0) start <- NULL

  midas_r_ic_table(formula, data, start = start, table = table, IC = IC, test = test, Ofunction = Ofunction, weight_gradients = weight_gradients, ...)
}

last_term_info <- function(x, Zenv) {
  last.term <- x[[3]]
  if (length(last.term) == 3) {
    last.term <- x[[c(3, 3)]]
    ind <- c(3, 3)
  }
  else {
    ind <- 3
  }
  lags <- as.numeric(eval(last.term[[3]], Zenv))
  freq <- as.numeric(eval(last.term[[4]], Zenv))
  weightname <- as.character(last.term[[5]])
  mtype <- as.character(last.term[[1]])

  if (!(mtype %in% c("fmls", "dmls", "mls"))) stop("The last term in the formula must be a MIDAS lag term")
  if (mtype == "fmls") lags <- 0:lags

  list(lags = lags, weight = weightname, varname = as.character(last.term[[2]]), frequency = freq)
}

term_info <- function(mt, term.name, Zenv) {
  vars <- as.list(attr(mt, "variables"))[-1]
  term.no <- find_mls_terms(term.name, vars)

  if (length(term.no) == 0) stop("No mls terms for variable ", term.name)
  if (length(term.no) > 1) stop("There can be only one mls term for variable ", term.name)

  mls_term <- vars[[term.no]]

  lags <- as.numeric(eval(mls_term[[3]], Zenv))
  freq <- as.numeric(eval(mls_term[[4]], Zenv))
  weightname <- as.character(mls_term[[5]])
  mtype <- as.character(mls_term[[1]])

  if (mtype == "fmls") lags <- 0:lags

  list(lags = lags, weight = weightname, varname = as.character(mls_term[[2]]), frequency = freq)
}


##' Select the model based on given information criteria
##'
##' Selects the model with minimum of given information criteria and model type
##' @param x a \link{midas_r_ic_table} object
##' @param IC the name of information criteria to base the choosing of the model
##' @param test the name of the test for which to print out the p-value
##' @param type the type of MIDAS model, either restricted or unrestricted
##' @param print logical, if TRUE, prints the summary of the best model.
##' @return (invisibly) the best model based on information criteria, \link{midas_r} object
##' @author Virmantas Kvedaras, Vaidotas Zemlys
##' @export
##' @examples
##'
##' data("USunempr")
##' data("USrealgdp")
##' y <- diff(log(USrealgdp))
##' x <- window(diff(USunempr),start=1949)
##' trend <- 1:length(y)
##'
##' mhfr <- hf_lags_table(y~trend+fmls(x,12,12,nealmon),
##'                       start=list(x=rep(0,3)),
##'                       from=list(x=0),to=list(x=c(4,6)))
##'
##' mlfr <- lf_lags_table(y~trend+fmls(x,12,12,nealmon),
##'                       start=list(x=rep(0,3)),
##'                       from=list(x=0),to=list(x=c(2,3)))
##'
##' modsel(mhfr,"BIC","unrestricted")
##'
##' modsel(mlfr,"BIC","unrestricted")
##'
##' @details This function selects the model from the model selection table for which the chosen information criteria achieves the smallest value. The function works with model tables produced by functions \link{lf_lags_table}, \link{hf_lags_table}, \link{amidas_table} and \link{midas_r_ic_table}.
modsel <- function(x, IC = x$IC[1], test = x$test[1], type = c("restricted", "unrestricted"), print = TRUE) {
  if (!(IC %in% x$IC)) stop("The supplied information criteria was not used in creating lag selection table")
  type <- match.arg(type)
  coln <- paste(IC, type, sep = ".")
  i <- which.min(x$table[, coln])

  if (print) {
    cat("\n Selected model with ")

    cat(IC, " = ", x$table[i, coln], "")
    cat("\n Based on", type, "MIDAS regression model\n")
    cat(" The p-value for the null hypothesis of the test", test, "is", x$table[i, paste(test, "p.value", sep = ".")], "\n")
    print(summary(x$candlist[[i]]))
  }
  invisible(x$candlist[[i]])
}

##' Create a weight function selection table for MIDAS regression model
##'
##' Creates a weight function selection table for MIDAS regression model with given information criteria and weight functions.
##' @param formula the formula for MIDAS regression, the lag selection is performed for the last MIDAS lag term in the formula
##' @param data a list containing data with mixed frequencies
##' @param start the starting values for optimisation
##' @param IC the information criteria which to compute
##' @param test the names of statistical tests to perform on restricted model, p-values are reported in the columns of model selection table
##' @param Ofunction see \link{midasr}
##' @param weight_gradients see \link{midas_r}
##' @param ... additional parameters to optimisation function, see \link{midas_r}
##' @return a \code{midas_r_ic_table} object which is the list with the following elements:
##'
##' \item{table}{the table where each row contains calculated information criteria for both restricted and unrestricted MIDAS regression model with given lag structure}
##' \item{candlist}{the list containing fitted models}
##' \item{IC}{the argument IC}
##' @examples
##'
##' data("USunempr")
##' data("USrealgdp")
##' y <- diff(log(USrealgdp))
##' x <- window(diff(USunempr),start=1949)
##' trend <- 1:length(y)
##' mwr <- weights_table(y~trend+fmls(x,12,12,nealmon),
##'                      start=list(x=list(nealmon=rep(0,3),
##'                      nbeta=c(1,1,1,0))))
##'
##' mwr
##'
##' @details This function estimates models sequentially increasing the midas lag from \code{kmin} to \code{kmax} of the last term of the given formula
##' @author Virmantas Kvedaras, Vaidotas Zemlys
##' @export
weights_table <- function(formula, data, start = NULL, IC = c("AIC", "BIC"), test = c("hAh_test"), Ofunction = "optim", weight_gradients = NULL, ...) {
  Zenv <- new.env(parent = environment(formula))
  cl <- match.call()
  mf <- match.call(expand.dots = FALSE)
  if (missing(data)) data <- NULL

  prep <- prepare_model_frame(data, Zenv, cl, mf, parent.frame())

  varnames <- names(start)[sapply(start, is.list)]

  table <- lapply(varnames, function(nm) {
    ti <- term_info(terms(formula), nm, prep$Zenv)
    out <- expand_weights_lags(names(start[[nm]]), from = 1, to = c(1, 1), 0, start = start[[nm]])
    out$lags <- rep(list(ti$lags), length.out = length(out$weights))
    out
  })
  names(table) <- varnames
  start <- start[!(names(start) %in% varnames)]
  if (length(start) == 0) start <- NULL
  midas_r_ic_table(formula, data, start = start, table = table, IC = IC, test = test, Ofunction = Ofunction, weight_gradients = weight_gradients, ...)
}

##' Create a weight and lag selection table for MIDAS regression model
##'
##' Creates a weight and lag selection table for MIDAS regression model with given information criteria and minimum and maximum lags.
##' @param formula the formula for MIDAS regression, the lag selection is performed for the last MIDAS lag term in the formula
##' @param data a list containing data with mixed frequencies
##' @param start the starting values for optimisation excluding the starting values for the last term
##' @param table an wls_table object, see \link{expand_weights_lags}
##' @param IC the names of information criteria which to compute
##' @param test the names of statistical tests to perform on restricted model, p-values are reported in the columns of model selection table
##' @param Ofunction see \link{midasr}
##' @param weight_gradients see \link{midas_r}
##' @param show_progress logical, TRUE to show progress bar, FALSE for silent evaluation
##' @param ... additional parameters to optimisation function, see \link{midas_r}
##' @return a \code{midas_r_ic_table} object which is the list with the following elements:
##'
##' \item{table}{the table where each row contains calculated information criteria for both restricted and unrestricted MIDAS regression model with given lag structure}
##' \item{candlist}{the list containing fitted models}
##' \item{IC}{the argument IC}
##' @examples
##'
##' data("USunempr")
##' data("USrealgdp")
##' y <- diff(log(USrealgdp))
##' x <- window(diff(USunempr),start=1949)
##' trend <- 1:length(y)
##'
##'
##' mwlr <- midas_r_ic_table(y~trend+fmls(x,12,12,nealmon),
##'                    table=list(x=list(weights=
##'                    as.list(c("nealmon","nealmon","nbeta")),
##'                    lags=list(0:4,0:5,0:6),
##'                    starts=list(rep(0,3),rep(0,3,),c(1,1,1,0)))))
##'
##' mwlr
##'
##' @details This function estimates models sequentially increasing the midas lag from \code{kmin} to \code{kmax} and varying the weights of the last term of the given formula
##' @author Virmantas Kvedaras, Vaidotas Zemlys
##' @export
midas_r_ic_table <- function(formula, data = NULL, start = NULL, table, IC = c("AIC", "BIC"), test = c("hAh_test"), Ofunction = "optim", weight_gradients = NULL, show_progress = TRUE, ...) {
  Zenv <- new.env(parent = environment(formula))
  formula <- as.formula(formula)
  args <- list(...)
  cl <- match.call()
  mf <- match.call(expand.dots = FALSE)
  mf$formula <- formula

  prep <- prepare_model_frame(data, Zenv, cl, mf, parent.frame())

  Zenv <- prep$Zenv
  mff <- prep$mf
  isstar <- any(sapply(table, function(l) any(names(l[["weights"]]) == "*")))

  ## Remove those formulas for which the number of parameters is less or equal than number of lags.
  remove_incomplete <- function(info, nm) {
    cond <- mapply(
      function(lags, start) {
        ifelse(length(lags) >= length(start), TRUE, FALSE)
      },
      lapply(info$lags, function(x) x[[nm]]),
      lapply(info$starts, function(x) x[[nm]]),
      SIMPLIFY = TRUE
    )
    lapply(info, function(x) x[cond])
  }

  wlinfo <- remove_incomplete(formula_table(prep$mt, names(table)[1], Zenv, table[[1]], start), names(table)[1])

  combine <- function(l) {
    nms <- names(l[[1]])
    out <- lapply(nms, function(nm) do.call("c", lapply(l, function(x) x[[nm]])))
    names(out) <- nms
    out
  }

  if (length(table) > 1) {
    for (i in 2:length(table)) {
      res <- mapply(function(f, s, lg) {
        out <- formula_table(terms(f), names(table)[i], Zenv, table[[i]], s)
        out$lags <- lapply(out$lags, function(ll) c(ll, lg))
        out
      }, wlinfo$formulas, wlinfo$starts, wlinfo$lags, SIMPLIFY = FALSE)

      wlinfo <- combine(res)
      wlinfo <- remove_incomplete(wlinfo, names(table)[i])
    }
  }

  modellist <- mapply(function(f, st) {
    mff[[2L]] <- f
    ### Add condition for catching the star in the table
    if (isstar) {
      itr <- checkARstar(terms(eval(mff[[2]], Zenv)))
      mff[[2]] <- itr$x
    } else {
      itr <- NULL
    }
    mmf <- eval(mff, Zenv)
    mmt <- attr(mmf, "terms")
    y <- model.response(mmf, "numeric")
    X <- model.matrix(mmt, mmf)
    # st <- st[!sapply(st,is.null)] #this is AR*, not working currently
    list(mt = mmt, y = y, X = X, start = st, itr = itr)
  }, wlinfo$formulas, wlinfo$starts, SIMPLIFY = FALSE)

  # maxlag <- which.max(sapply(wlinfo$lags,function(ll)max(sapply(ll,max))))
  maxlag <- which.min(sapply(modellist, with, nrow(X)))
  lrn <- rownames(modellist[[maxlag]]$X)

  modellist <- lapply(modellist, function(mm) {
    rn <- rownames(mm$X)
    ind <- match(lrn, rn)
    mm$y <- mm$y[ind]
    mm$X <- mm$X[ind, ]
    mm
  })

  new_cl <- cl
  m <- match(c("table", "IC", "test", "show_progress"), names(new_cl))
  new_cl[m] <- NULL

  if ("Ofunction" %in% names(new_cl)) new_cl$Ofunction <- eval(new_cl$Ofunction)
  if ("weight_gradients" %in% names(new_cl)) new_cl$weight_gradients <- eval(new_cl$weight_gradients)
  if (is.null(eval(new_cl$data, parent.frame()))) new_cl$data <- NULL

  mrm <- lapply(modellist, function(mm) {
    cll <- new_cl
    cll[[1]] <- as.name("midas_r")
    cll$formula <- formula(mm$mt)
    environment(cll$formula) <- Zenv
    cll$start <- mm$start
    res <- prepmidas_r(mm$y, mm$X, mm$mt, Zenv, cll, args, mm$start, Ofunction, weight_gradients, mm$itr$lagsTable)
    class(res) <- "midas_r"
    res
  })

  if (show_progress) {
    cat("\nModel selection progress:\n")
    pb <- txtProgressBar(min = 0, max = length(mrm), initial = 0, style = 3)
  }
  candlist <- mapply(function(l, i) {
    if (show_progress) setTxtProgressBar(pb, i)
    out <- try(midas_r.fit(l), silent = TRUE)
    out
  }, mrm, 1:length(mrm), SIMPLIFY = FALSE)
  if (show_progress) close(pb)
  success <- sapply(candlist, class)

  if ("try-error" %in% success) {
    for (i in which(success == "try-error")) {
      cat("\n====================================\n")
      cat("The following model did not converge:\n")
      cat(gsub("[ ]+", " ", capture.output(cat(deparse(formula(mrm[[i]]))))))
      cat("\nWith the following error message:\n")
      cat(candlist[[i]])

      cat("--------------------------------------\n")
    }
  }
  candl <- candlist[success != "try-error"]

  make_ic_table(candl, IC, test)
}

##' @method update midas_r_ic_table
##' @export
update.midas_r_ic_table <- function(object, ...) {
  do.call("make_ic_table", object[-1])
}

make_ic_table <- function(candlist, IC, test, ...) {
  makelist <- function(x) {
    if (length(x) == 1) {
      list(x)
    } else {
      as.list(x)
    }
  }
  makefun <- function(l) {
    lapply(l, function(ll) {
      if (is.function(ll)) {
        ll
      } else {
        eval(as.name(ll))
      }
    })
  }
  ICfun <- makefun(makelist(IC))
  tfun <- makefun(makelist(test))

  dtest <- function(mm) {
    ## The error might be given if there are NaN values in the hessian, give NA instead of the error.
    out <- try(deriv_tests(mm), silent = TRUE)
    if (class(out) == "try-error") {
      c(NA, NA)
    } else {
      unlist(out[c("first", "second")])
    }
  }
  tab <- lapply(candlist, function(mm) {
    c(
      sapply(ICfun, function(ic) ic(mm)),
      sapply(ICfun, function(ic) ifelse(is.null(mm$unrestricted), NA, ic(mm$unrestricted))),
      sapply(tfun, function(tt) {
        tst <- try(tt(mm))
        ifelse(class(tst) == "try-error", NA,
          tst$p.value
        )
      }),
      dtest(mm),
      mm$convergence
    )
  })

  tab <- do.call("rbind", tab)

  colnames(tab) <- c(paste(IC, "restricted", sep = "."), paste(IC, "unrestricted", sep = "."), paste(test, "p.value", sep = "."), "First", "Second", "Convergence")

  tab <- data.frame(model = sapply(candlist, function(mod) {
    gsub("[ ]+", " ", capture.output(cat(deparse(formula(mod)))))
  }), tab)
  tab$First <- as.logical(tab$First)
  tab$Second <- as.logical(tab$Second)

  res <- list(table = tab, candlist = candlist, IC = IC, test = test, weights = tab[, 1], lags = tab[, 2])
  class(res) <- "midas_r_ic_table"
  res
}

##' @export
##' @method print midas_r_ic_table
print.midas_r_ic_table <- function(x, ...) {
  print(x$table, ...)
}

formula_table <- function(mt, varname, Zenv, table, start) {
  if (is.null(names(table))) names(table) <- c("weights", "lags", "names")

  vars <- as.list(attr(mt, "variables"))[-1]
  term.no <- find_mls_terms(varname, vars)

  vars[[term.no]][[1]] <- as.name("mls")

  formulas <- vector("list", length(table$lags))
  starts <- formulas
  for (i in 1:length(formulas)) {
    res <- vars
    lt <- res[[term.no]]
    wght <- table$weights[[i]]
    if (is.character(wght)) {
      if (wght == "") {
        lt <- lt[1:4]
      } else {
        if (wght == "*") {
          lt[[5]] <- wght
        } else {
          lt[[5]] <- as.name(wght)
        }
      }
    }
    else {
      if (!is.function(wght)) stop("Supply either function name or a function")
      nmwght <- names(table$weights)[i]
      lt[[5]] <- as.name(nmwght)
      ## A bit of nasty hack. We rely on the fact that environments are special R objects, i.e. they are not replicated when passed to function.
      if (nmwght %in% ls(envir = Zenv)) warning("Name ", nmwght, " is reserved, overwriting with special weight function")
      assign(nmwght, wght, Zenv)
    }
    lt[[3]] <- table$lags[[i]]
    res[[term.no]] <- lt
    formulas[[i]] <- variables_to_formula(res)

    wst <- list(table$starts[[i]])
    names(wst) <- varname
    starts[[i]] <- c(start, wst)
  }

  list(
    formulas = formulas,
    lags = lapply(table$lags, function(l) {
      out <- list(l)
      names(out) <- varname
      out
    }),
    weights = table$weights,
    starts = starts
  )
}

prepare_model_frame <- function(data, Zenv, cl, mf, pf) {
  ## Get the response of the model to get the number of observations
  ## Get the model.frame object, not evaluated!
  ## Prepare data if necessary
  if (is.null(data)) {
    ee <- NULL
  } else {
    ee <- data_to_env(data)
    parent.env(ee) <- pf
  }
  assign("ee", ee, Zenv)

  m <- match(c("formula", "data"), names(mf), 0L)
  mf <- mf[c(1L, m)]
  mf[[1L]] <- as.name("model.frame")
  mf[[3L]] <- as.name("ee")
  mf[[4L]] <- as.name("na.omit")
  names(mf)[c(2, 3, 4)] <- c("formula", "data", "na.action")

  itr <- checkARstar(terms(eval(mf[[2]], Zenv)))

  mff <- mf
  mtf <- eval(mf, Zenv)
  mtt <- attr(mtf, "terms")

  # We need only response to get the number of low frequency observations.
  resf <- eval(mf$formula, Zenv)
  resf[[3]] <- 1
  mf$formula <- resf

  mf <- eval(mf, Zenv)

  y <- model.response(mf, "numeric")
  list(Zenv = Zenv, y = y, mf = mff, itr = itr, mt = mtt)
}

##' Creates table of weights, lags and starting values
##'
##' For each weight function creates lags starting from \code{kmin} to \code{kmax}. This is a convenience function for easier work with the function \link{midas_r_ic_table}.
##' @title Create table of weights, lags and starting values
##' @param weights either a vector with names of the weight functions or a named list of weight functions
##' @param from the high frequency lags from which to start the fitting
##' @param to a vector of length two, containing minimum and maximum lags, high frequency if \code{m=1}, low frequency otherwise.
##' @param m the frequency ratio
##' @param start a named list with the starting values for weight functions
##' @return a \code{lws_table} object, a list with elements \code{weights}, \code{lags} and \code{starts}.
##' @examples
##'
##' expand_weights_lags(c("nealmon","nbeta"),0,c(4,8),1,start=list(nealmon=rep(0,3),nbeta=rep(0,4)))
##' nlmn <- expand_weights_lags("nealmon",0,c(4,8),1,start=list(nealmon=rep(0,3)))
##' nbt <- expand_weights_lags("nbeta",0,c(4,8),1,start=list(nbeta=rep(0,4)))
##'
##' nlmn+nbt
##' @author Virmantas Kvedaras, Vaidotas Zemlys
##' @export
##'
expand_weights_lags <- function(weights, from = 0, to, m = 1, start) {
  weights <- as.list(weights)
  kmin <- min(to)
  kmax <- max(to)
  mkmin <- from
  chnm <- sapply(weights, is.character)
  names(weights)[chnm] <- unlist(weights[chnm])
  # if(is.null(names(start)))names(start) <- rep("",length(start))

  # We do not need starts for "" and "*" weights
  wnm <- setdiff(names(weights), c("", "*"))
  if (length(wnm) > 0) {
    # We do need starts for "" and "*"
    if (!identical(wnm, names(start))) stop("Mismatch between the  weight function names and the names of starting values")
  }

  if (m > 1) {
    lags <- lapply(kmin:kmax, function(x) (mkmin):(x * m - 1))
  }
  else {
    lags <- lapply(kmin:kmax, function(x) mkmin:x)
  }

  weights <- rep(weights, each = length(lags))
  starts <- rep(start, each = length(lags))
  lags <- rep(lags, length.out = length(weights))

  normalize_starts <- function(x) {
    inds <- which(names(x$weights) %in% c("*", ""))
    for (i in inds) {
      # We set the starts to zero for weights * and ""
      x$starts[[i]] <- rep(0, length.out = length(x$lags[[i]]))
    }
    x
  }
  ## For "" and "*" weights replicate the starts according to lags
  out <- normalize_starts(list(weights = weights, lags = lags, starts = starts))
  class(out) <- "lws_table"
  out
}

##' @export
##' @method print lws_table
print.lws_table <- function(x, ...) {
  if (is.null(names(x))) names(x) <- c("weights", "lags", "starts")
  p1 <- function(x) capture.output(cat(deparse(x)))
  p2 <- function(x) capture.output(cat(deparse(round(x, 4))))
  print(data.frame(weights = names(x$weights), lags = sapply(x$lags, p1), starts = sapply(x$starts, p2)))
}

##' Create table of weights, lags and starting values for Ghysels weight schema, see \link{amweights}
##'
##' Given weight function creates lags starting from \code{kmin} to \code{kmax} and replicates starting values for each low frequency lag.
##' @title Create table of weights, lags and starting values for Ghysels weight schema
##' @param weight the names of weight functions
##' @param type the type of Ghysels schema, \code{"A"}, \code{"B"} or \code{"C"}
##' @param from the high frequency lags from which to start the fitting
##' @param to to a vector of length two, containing minimum and maximum lags, high frequency if \code{m=1}, low frequency otherwise.
##' @param m the frequency ratio
##' @param start the starting values for the weights of the one low frequency lag
##' @return a \code{lws_table} object, a list with elements \code{weights}, \code{lags} and \code{starts}
##' @examples
##' expand_amidas("nealmon","A",0,c(1,2),12,c(0,0,0))
##' @author Virmantas Kvedaras, Vaidotas Zemlys
##' @export
expand_amidas <- function(weight, type = c("A", "B", "C"), from = 0, to, m, start) {
  lags <- lapply(to[1]:to[2], function(x) from:(x * m - 1))
  d <- sapply(lags, length)
  nm <- paste(weight, type, d, sep = "_")
  type <- match.arg(type)
  if (type == "A") {
    starts <- lapply(d %/% m, function(lf) rep(start, times = lf))
  }
  if (type == "B") {
    starts <- lapply(d %/% m, function(lf) c(rep(start[1], lf), start[-1]))
  }
  if (type == "C") {
    starts <- lapply(d %/% m, function(lf) c(start[1], rep(start[-1], times = lf)))
  }
  names(starts) <- nm
  ff <- expression(amweights(p, d, m, weight, type))
  ff[[c(1, 4)]] <- m
  ff[[c(1, 5)]] <- as.name(weight)
  ff[[c(1, 6)]] <- as.character(type)
  weights <- mapply(function(fun, e, dk) {
    e[[c(1, 3)]] <- as.numeric(dk)
    fun[[c(1, 3, 3, 2)]] <- e[[1]]
    eval(fun)
  }, rep(list(expression(f <- function(p, d, m) {
    p
  })), length(d)), rep(list(ff), length(d)), as.list(d), SIMPLIFY = FALSE)
  names(weights) <- nm
  out <- list(weights = weights, lags = lags, starts = starts)
  class(out) <- "lws_table"
  out
}

##' Combines \code{lws_table} objects
##'
##' The \code{lws_table} objects have similar structure to table, i.e. it is a list with 3 elements which are the lists with the same number of elements. The base function \code{c} would \code{cbind} such tables. This function \code{rbind}s them.
##'
##' @title Combine \code{lws_table} objects
##' @param ... \code{lws_table} object
##' @param check logical, if TRUE checks that the each \code{lws_table} object is named a list with names \code{c("weights","lags","starts")}
##' @return \code{lws_table} object
##' @rdname lws_table-add
##' @method + lws_table
##' @examples
##' nlmn <- expand_weights_lags("nealmon",0,c(4,8),1,start=list(nealmon=rep(0,3)))
##' nbt <- expand_weights_lags("nbeta",0,c(4,8),1,start=list(nbeta=rep(0,4)))
##'
##' nlmn+nbt
##' @export
##' @author Virmantas Kvedaras, Vaidotas Zemlys
"+.lws_table" <- function(..., check = TRUE) {
  l <- list(...)
  if (check) {
    nms <- c("weights", "lags", "starts")
    for (i in 1:length(l)) {
      if (is.null(names(l[[i]]))) names(l[[i]]) <- nms
    }
  }
  out <- lapply(nms, function(nm) do.call("c", lapply(l, function(x) x[[nm]])))
  names(out) <- nms
  class(out) <- "lws_table"
  out
}
##' Create weight and lag selection table for the aggregates based MIDAS regression model
##'
##' This function estimates models sequentialy increasing the midas lag from \code{kmin} to \code{kmax} and varying the weights of the last term of the given formula
##' @title Weight and lag selection table for aggregates based MIDAS regression model
##' @param formula the formula for MIDAS regression, the lag selection is performed for the last MIDAS lag term in the formula
##' @param data a list containing data with mixed frequencies
##' @param weights the names of weights used in Ghysels schema
##' @param wstart the starting values for the weights of the firs low frequency lag
##' @param type the type of Ghysels schema see \link{amweights}, can be a vector of types
##' @param start the starting values for optimisation excluding the starting values for the last term
##' @param from a named list, or named vector with high frequency (NB!) lag numbers which are the beginnings of MIDAS lag structures. The names should correspond to the MIDAS lag terms in the formula for which to do the lag selection. Value NA indicates lag start at zero
##' @param to to a named list where each element is a vector with two elements. The first element is the low frequency lag number from which the lag selection starts, the second is the low frequency lag number at which the lag selection ends. NA indicates lowest (highest) lag numbers possible.
##' @param IC the names of information criteria which should be calculated
##' @param test the names of statistical tests to perform on restricted model, p-values are reported in the columns of model selection table
##' @param Ofunction see \link{midasr}
##' @param weight_gradients see \link{midas_r}
##' @param ... additional parameters to optimisation function, see \link{midas_r}
##' @return a \code{midas_r_ic_table} object which is the list with the following elements:
##'
##' \item{table}{the table where each row contains calculated information criteria for both restricted and unrestricted MIDAS regression model with given lag structure}
##' \item{candlist}{the list containing fitted models}
##' \item{IC}{the argument IC}
##' \item{test}{the argument test}
##' \item{weights}{the names of weight functions}
##' \item{lags}{the lags used in models}
##' @examples
##'
##' data("USunempr")
##' data("USrealgdp")
##' y <- diff(log(USrealgdp))
##' x <- window(diff(USunempr),start=1949)
##' trend <- 1:length(y)
##'
##' tb <- amidas_table(y~trend+fmls(x,12,12,nealmon),
##'                    data=list(y=y,x=x,trend=trend),
##'                    weights=c("nealmon"),wstart=list(nealmon=c(0,0,0)),
##'                    start=list(trend=1),type=c("A"),
##'                    from=0,to=c(1,2))
##'
##'
##' @details This function estimates models sequentially increasing the midas lag from \code{kmin} to \code{kmax} and varying the weights of the last term of the given formula
##' @author Virmantas Kvedaras, Vaidotas Zemlys
##' @export
amidas_table <- function(formula, data, weights, wstart, type, start = NULL, from, to, IC = c("AIC", "BIC"), test = c("hAh_test"), Ofunction = "optim", weight_gradients = NULL, ...) {
  Zenv <- new.env(parent = environment(formula))
  cl <- match.call()
  mf <- match.call(expand.dots = FALSE)
  if (missing(data)) data <- NULL

  prep <- prepare_model_frame(data, Zenv, cl, mf, parent.frame())

  lti <- last_term_info(formula, prep$Zenv)
  lags <- lti$lags

  m <- lti$frequency

  if (!is.list(weights)) {
    weights <- as.list(weights)
    ### Ugly code fix this
    names(weights) <- sapply(weights, function(x) x[[1]])
  }
  if (!is.list(wstart)) {
    wstart <- list(wstart)
    names(wstart) <- names(weights)
  }


  if (!identical(names(weights), names(wstart))) stop("Mismatch between the  weight function names and the names of starting values")

  if (length(weights) < length(type)) {
    weights <- rep(weights, length.out = length(type))
    wstart <- rep(wstart, length.out = length(type))
  } else {
    if (length(type) < length(weights)) {
      type <- rep(type, length.out = length(weights))
    }
  }

  tb <- mapply(function(w, t, s) {
    expand_amidas(weight = w, type = t, from = from, to = to, m = m, start = s)
  }, as.list(weights), as.list(type), as.list(wstart), SIMPLIFY = FALSE)
  names(tb) <- NULL

  table <- list(do.call("+", tb))
  names(table) <- lti$varname
  midas_r_ic_table(formula, data, start = start, table = table, IC = IC, test = test, Ofunction = Ofunction, weight_gradients = NULL, ...)
}


add_expressions <- function(l) {
  if (!is.list(l)) stop("The summands must be in a list")
  if (length(l) == 1) {
    return(l[[1]])
  }
  else {
    base <- expression(a + b)
    base[[c(1, 2)]] <- l[[1]]
    base[[c(1, 3)]] <- l[[2]]
    if (length(l) > 2) {
      l <- l[-2:-1]
      for (i in 1:length(l)) {
        tmp <- expression(a + b)
        tmp[[c(1, 2)]] <- base[[1]]
        tmp[[c(1, 3)]] <- l[[i]]
        base[[1]] <- tmp[[1]]
      }
    }
    return(base[[1]])
  }
}

variables_to_formula <- function(vars, intercept = 0) {
  rhs <- add_expressions(vars[-1])
  if (intercept == 1) {
    res <- formula(a ~ b - 1)
    res[[2]] <- vars[[1]]
    res[[c(3, 2)]] <- rhs
  }
  else {
    res <- formula(a ~ b)
    res[[2]] <- vars[[1]]
    res[[3]] <- rhs
  }
  res
}

find_mls_terms <- function(term.name, vars) {
  res <- sapply(vars, function(l) {
    if (length(l) > 1) {
      if (as.character(l[[1]]) %in% c("mls", "fmls", "dmls")) {
        ifelse(as.character(l[[2]]) == term.name, TRUE, FALSE)
      }
      else {
        FALSE
      }
    }
    else {
      FALSE
    }
  })
  which(res)
}

##' Creates tables for different forecast horizons and table for combined forecasts
##'
##' Divide data into in-sample and out-of-sample. Fit different forecasting horizons for in-sample data. Calculate accuracy measures for individual and average forecasts.
##'
##' @title Create table for different forecast horizons
##' @param formula initial formula for the
##' @param data list of data
##' @param from a named list of starts of lags from where to fit. Denotes the horizon
##' @param to a named list for lag selections
##' @param insample the low frequency indexes for in-sample data
##' @param outsample the low frequency indexes for out-of-sample data
##' @param weights names of weight function candidates
##' @param wstart starting values for weight functions
##' @param start other starting values
##' @param IC name of information criteria to choose model from
##' @param seltype argument to modsel, \code{"restricted"} for model selection based on information criteria of restricted MIDAS model, \code{"unrestricted"} for model selection based on unrestricted (U-MIDAS) model.
##' @param ftype which type of forecast to use.
##' @param test argument to modsel
##' @param measures the names of goodness of fit measures
##' @param fweights names of weighting schemes
##' @param ... additional arguments for optimisation method, see \link{midas_r}
##' @return  a list containing forecasts, tables of accuracy measures and the list with selected models
##' @author Virmantas Kvedaras, Vaidotas Zemlys
##' @export
##' @examples
##' ### Sets a seed for RNG ###
##' set.seed(1001)
##' ## Number of low-frequency observations
##' n<-250
##' ## Linear trend and higher-frequency explanatory variables (e.g. quarterly and monthly)
##' trend<-c(1:n)
##' x<-rnorm(4*n)
##' z<-rnorm(12*n)
##' ## Exponential Almon polynomial constraint-consistent coefficients
##' fn.x <- nealmon(p=c(1,-0.5),d=8)
##' fn.z <- nealmon(p=c(2,0.5,-0.1),d=17)
##' ## Simulated low-frequency series (e.g. yearly)
##' y<-2+0.1*trend+mls(x,0:7,4)%*%fn.x+mls(z,0:16,12)%*%fn.z+rnorm(n)
##' ##Do not run
##' ## cbfc<-select_and_forecast(y~trend+mls(x,0,4)+mls(z,0,12),
##' ## from=list(x=c(4,8,12),z=c(12,24,36)),
##' ## to=list(x=rbind(c(14,19),c(18,23),c(22,27)),z=rbind(c(22,27),c(34,39),c(46,51))),
##' ## insample=1:200,outsample=201:250,
##' ## weights=list(x=c("nealmon","almonp"),z=c("nealmon","almonp")),
##' ## wstart=list(nealmon=rep(1,3),almonp=rep(1,3)),
##' ## IC="AIC",
##' ## seltype="restricted",
##' ## ftype="fixed",
##' ## measures=c("MSE","MAPE","MASE"),
##' ## fweights=c("EW","BICW","MSFE","DMSFE")
##' ## )
##'
select_and_forecast <- function(formula, data, from, to,
                                insample, outsample,
                                weights, wstart, start = NULL,
                                IC = "AIC",
                                seltype = c("restricted", "unrestricted"),
                                test = "hAh_test",
                                ftype = c("fixed", "recursive", "rolling"),
                                measures = c("MSE", "MAPE", "MASE"),
                                fweights = c("EW", "BICW", "MSFE", "DMSFE"),
                                ...) {
  seltype <- match.arg(seltype)
  ftype <- match.arg(ftype)
  if (length(setdiff(fweights, c("EW", "BICW", "MSFE", "DMSFE"))) > 0) {
    stop("Supported weight schemes are EW, BICW, MSFE, DMSFE")
  }

  Zenv <- new.env(parent = environment(formula))
  formula <- as.formula(formula)

  if (missing(data) || is.null(data)) {
    dataenv <- Zenv
  } else {
    dataenv <- data_to_env(data)
  }

  ## Change this, there is a cleaner way
  mt <- terms(formula)
  yname <- all.vars(mt[[2]])
  m <- get_frequency_info(mt, Zenv)
  nms <- names(m)
  fullsample <- lapply(nms, function(nm) eval(as.name(nm), dataenv))
  names(fullsample) <- nms

  nmx <- names(fullsample)
  nmx <- nmx[nmx != yname]

  datasplit <- split_data(fullsample, insample, outsample)
  indata <- datasplit$indata
  outdata <- datasplit$outdata

  wperm <- do.call("expand.grid", weights)
  nperm <- colnames(wperm)

  fhtab <- vector("list", length(from[[1]]))
  for (h in 1:length(from[[1]])) {
    res <- vector("list", nrow(wperm))
    for (i in 1:nrow(wperm)) {
      res[[i]] <- lapply(nperm, function(nm) {
        wname <- as.character(wperm[i, nm])
        expand_weights_lags(wname, from[[nm]][h], to[[nm]][h, ], 1, start = wstart[wname])
      })
      names(res[[i]]) <- nperm
    }
    fhtab[[h]] <- res
  }
  modno <- sapply(fhtab, length)
  nmodno <- sum(modno)
  cat("\nModel selection progress:\n")
  pb <- txtProgressBar(min = 0, max = nmodno, initial = 0, style = 3)

  bestm <- mapply(function(fh, prog) {
    mapply(function(tb, i) {
      out <- modsel(midas_r_ic_table(formula, data = indata, start = start, table = tb, IC = IC, test = test, show_progress = FALSE), IC = IC, type = seltype, print = FALSE, ...)
      setTxtProgressBar(pb, i)
      out
    }, fh, as.list(prog + 1:length(fh)), SIMPLIFY = FALSE)
  },
  fhtab, as.list(c(0, cumsum(modno)[-length(modno)])),
  SIMPLIFY = FALSE
  )
  close(pb)

  avgforc <- lapply(bestm, function(l) average_forecast(l, data = fullsample, insample = insample, outsample = outsample, type = ftype, fweights = fweights, measures = measures))
  tabfh <- do.call("rbind", lapply(avgforc, function(l) l$accuracy$individual))

  tboutc <- lapply(avgforc, function(l) l$accuracy$average)

  tabh <- do.call("rbind", mapply(function(tb, h) {
    data.frame(Horizon = h, tb)
  }, tboutc, 1:length(tboutc), SIMPLIFY = FALSE))

  tabh[order(tabh$Horizon, tabh$Scheme), ]

  list(accuracy = list(individual = tabfh, average = tabh), bestlist = bestm, forecasts = avgforc)
}


lf_range_to_hf <- function(range, m) {
  unlist(lapply(range, function(lf) (lf - 1) * m + 1:m))
}

MSE <- function(o, p) {
  mean((o - p)^2)
}

MAPE <- function(o, p) {
  mean(abs((o - p) / o) * 100)
}

MASE <- function(o, p) {
  mean(abs(o - p) / mean(abs(diff(o))))
}
##' Splits mixed frequency data into in-sample and out-of-sample datasets given the indexes of the low frequency data
##'
##' It is assumed that data is a list containing mixed frequency data. Then given the indexes of the low frequency data the function splits the data into two subsets.
##' @title Split mixed frequency data into in-sample and out-of-sample
##' @param data a list containing mixed frequency data
##' @param insample the low frequency indexes for in-sample data
##' @param outsample the low frequency indexes for out-of-sample data
##' @return a list with elements \code{indata} and \code{outdata} containing respectively in-sample and out-of-sample data sets
##' @author Virmantas Kvedaras, Vaidotas Zemlys
##' @export
##' @examples
##'
##' #Monthly data
##' x <- 1:24
##' #Quartely data
##' z <- 1:8
##' #Yearly data
##' y <- 1:2
##' split_data(list(y=y,x=x,z=z),insample=1,outsample=2)
split_data <- function(data, insample, outsample) {
  fullsample <- data_to_list(data)
  nr <- sapply(fullsample, length)
  m <- nr %/% min(nr)
  indata <- mapply(function(var, freq) {
    var[lf_range_to_hf(insample, freq)]
  }, fullsample, as.list(m), SIMPLIFY = FALSE)

  outdata <- mapply(function(var, freq) {
    var[lf_range_to_hf(outsample, freq)]
  }, fullsample, as.list(m), SIMPLIFY = FALSE)

  list(indata = indata, outdata = outdata)
}
##' Average MIDAS model forecasts using specified weighting scheme. Produce in-sample and out-of-sample accuracy measures.
##'
##' Given the data, split it to in-sample and out-of-sample data. Then given the list of models, reestimate each model with in-sample data and produce out-of-sample forecast. Given the forecasts average them with the specified weighting scheme. Then calculate the accuracy measures for individual and average forecasts.
##'
##' The forecasts can be produced in 3 ways. The \code{"fixed"} forecast uses model estimated with in-sample data. The \code{"rolling"} forecast reestimates model each time by increasing the in-sample by one low frequency observation and dropping the first low frequency observation. These reestimated models then are used to produce out-of-sample forecasts. The \code{"recursive"} forecast differs from \code{"rolling"} that it does not drop observations from the beginning of data.
##'
##' @title Average forecasts of MIDAS models
##' @param modlist a list of \code{midas_r} objects
##' @param data a list with mixed frequency data
##' @param insample the low frequency indexes for in-sample data
##' @param outsample the low frequency indexes for out-of-sample data
##' @param type a string indicating which type of forecast to use.
##' @param fweights names of weighting schemes
##' @param measures names of accuracy measures
##' @param show_progress logical, TRUE to show progress bar, FALSE for silent evaluation
##' @return a list containing forecasts and tables of accuracy measures
##' @author Virmantas Kvedaras, Vaidotas Zemlys
##' @importFrom utils setTxtProgressBar txtProgressBar capture.output
##' @export
##' @examples
##' set.seed(1001)
##' ## Number of low-frequency observations
##' n<-250
##' ## Linear trend and higher-frequency explanatory variables (e.g. quarterly and monthly)
##' trend<-c(1:n)
##' x<-rnorm(4*n)
##' z<-rnorm(12*n)
##' ## Exponential Almon polynomial constraint-consistent coefficients
##' fn.x <- nealmon(p=c(1,-0.5),d=8)
##' fn.z <- nealmon(p=c(2,0.5,-0.1),d=17)
##' ## Simulated low-frequency series (e.g. yearly)
##' y<-2+0.1*trend+mls(x,0:7,4)%*%fn.x+mls(z,0:16,12)%*%fn.z+rnorm(n)
##' mod1 <- midas_r(y ~ trend + mls(x, 4:14, 4, nealmon) + mls(z, 12:22, 12, nealmon),
##'                 start=list(x=c(10,1,-0.1),z=c(2,-0.1)))
##' mod2 <- midas_r(y ~ trend + mls(x, 4:20, 4, nealmon) + mls(z, 12:25, 12, nealmon),
##'                 start=list(x=c(10,1,-0.1),z=c(2,-0.1)))
##'
##' ##Calculate average forecasts
##' avgf <- average_forecast(list(mod1,mod2),
##'                         data=list(y=y,x=x,z=z,trend=trend),
##'                         insample=1:200,outsample=201:250,
##'                         type="fixed",
##'                         measures=c("MSE","MAPE","MASE"),
##'                         fweights=c("EW","BICW","MSFE","DMSFE"))
average_forecast <- function(modlist,
                             data, insample, outsample,
                             type = c("fixed", "recursive", "rolling"),
                             fweights = c("EW", "BICW", "MSFE", "DMSFE"),
                             measures = c("MSE", "MAPE", "MASE"),
                             show_progress = TRUE) {

  # if(length(modlist)==1)stop("Need more than 1 model to produce average forecasts")
  if (missing(data)) stop("Data need to be supplied for forecasting")

  type <- match.arg(type)

  last_in <- length(insample)
  if (insample[last_in] > outsample[1]) stop("The in-sample and out-of-sample indexes should not overlap")
  if (outsample[1] - insample[last_in] != 1) stop("There should be no gaps between in-sample and out-of-sample indexes")

  datasplit <- split_data(data, insample, outsample)

  indata <- datasplit$indata
  outdata <- datasplit$outdata

  ## Check that the response variables of the modlist are the same
  ynames <- sapply(modlist, function(mod) all.vars(terms(mod)[[2]]))
  yname <- unique(unlist(ynames))
  if (length(yname) > 1) stop("The response variable must be the same for all the models")

  outy <- outdata[[yname]]

  reeval <- function(candlist, redata) {
    lapply(candlist, update, data = redata)
  }

  bestm <- reeval(modlist, indata)

  inf <- lapply(bestm, function(mod) cbind(mod$model[, 1], fitted(mod)))

  if (type == "fixed") {
    outf <- lapply(bestm, function(mod) {
      cbind(outdata[[yname]], point_forecast.midas_r(mod, newdata = outdata, method = "static"))
    })
  }
  else {
    if (type %in% c("rolling")) {
      fulls <- c(insample, outsample)
    }
    outm <- matrix(NA, nrow = length(outsample), ncol = length(modlist))
    if (show_progress) {
      cat("\nDoing", type, "forecast :\n")
      pb <- txtProgressBar(min = 0, max = length(outsample), initial = 0, style = 3)
    }

    for (i in 1:length(outsample)) {
      newout <- outsample[i]
      if (i > 1) {
        if (type == "recursive") {
          newin <- c(insample, outsample[1:(i - 1)])
        } else {
          newin <- fulls[1:last_in + i - 1]
        }
      }
      else {
        newin <- insample
      }
      splitnew <- split_data(data, newin, newout)
      emod <- reeval(modlist, splitnew$indata)
      outm[i, ] <- sapply(emod, point_forecast.midas_r, newdata = splitnew$outdata, method = "static")
      if (show_progress) setTxtProgressBar(pb, i)
    }
    if (length(modlist) > 1) {
      outf <- lapply(data.frame(outm), function(x) cbind(outdata[[yname]], x))
    }
    else {
      outf <- list(cbind(outdata[[yname]], outm[, 1]))
    }
    if (show_progress) close(pb)
  }

  msrfun <- lapply(measures, function(msr) eval(as.name(msr)))

  calcmsr <- function(ll) {
    sapply(msrfun, function(msr) {
      sapply(ll, function(a) msr(a[, 1], a[, 2]))
    })
  }
  outstat <- calcmsr(outf)
  instat <- calcmsr(inf)
  if (is.null(dim(outstat))) outstat <- matrix(outstat, nrow = 1)
  if (is.null(dim(instat))) instat <- matrix(instat, nrow = 1)

  colnames(outstat) <- paste0(measures, ".out-of-sample")
  colnames(instat) <- paste0(measures, ".in-sample")
  modi <- sapply(bestm, function(mod) gsub("[ ]+", " ", capture.output(cat(deparse(formula(mod))))))

  tabfh <- data.frame(Model = modi, outstat, instat)
  rownames(tabfh) <- NULL
  EW <- function(hh) {
    n <- length(hh)
    rep(1 / n, n)
  }
  BICW <- function(hh) {
    bicm <- sapply(hh, BIC)
    bicm <- bicm - min(bicm)
    ebic <- exp(-bicm)
    sebic <- sum(ebic)
    if (sebic == 0) {
      EW(hh)
    } else {
      ebic / sebic
    }
  }
  MSFEd <- function(hh, delta) {
    mi <- sapply(hh, function(xx) {
      sum((xx[, 1] - xx[, 2])^2 * delta^(length(outy):1 - 1))
    })
    imi <- 1 / mi
    imi / sum(imi)
  }
  MSFE <- function(hh) MSFEd(hh, 1)
  DMSFE <- function(hh) MSFEd(hh, 0.9)


  w1 <- EW(bestm)
  w2 <- BICW(modlist)
  w3 <- MSFE(outf)
  w4 <- DMSFE(outf)

  fc <- sapply(outf, function(m) m[, 2])
  if (is.null(dim(fc))) fc <- matrix(fc, nrow = 1)

  allwlist <- list(w1, w2, w3, w4)
  names(allwlist) <- c("EW", "BICW", "MSFE", "DMSFE")
  wlist <- allwlist[fweights]
  outc <- lapply(wlist, function(ww) cbind(outf[[1]][, 1], apply(fc, 1, function(r) sum(r * ww))))
  names(outc) <- fweights

  tboutc <- calcmsr(outc)
  colnames(tboutc) <- measures

  tabh <- data.frame(Scheme = fweights, tboutc)
  rownames(tabh) <- NULL

  list(
    forecast = fc,
    avgforecast = sapply(outc, function(m) m[, 2]),
    accuracy = list(
      individual = tabfh,
      average = tabh
    ),
    type = type,
    x = indata[[yname]],
    xout = outdata[[yname]]
  )
}
mpiktas/midasr documentation built on Aug. 24, 2022, 2:32 p.m.