# Copyright 2019 Steven L. Scott. All rights reserved.
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
# ===========================================================================
# This is a multivariate version of the univariate bsts function. For now it
# only supports Gaussian observations. In the future the plan is to expand to
# the same set of reward distributions supported by bsts. This function may
# actually merge with bsts in the future. It is quite similar.
#
# Expected usage 1:
# Matrix y, Matrix x
# ss <- AddSharedLocalLevel(list(), y)
# model <- mbsts(y ~ x, state.specification = ss, niter = 1000)
#
# Expected usage 2:
# Data frame with columns for time stamp and series identifier.
#
# There are two data formats to support. Wide and tall. If you want regressors
# then you'll need tall. Tall data requires an "id" and a "timestamp" variable.
# The tall format is more general, but might be less convenient sometimes.
#
# With the tall format it becomes clear that there are two kinds of regressors.
# Those specific to a time point, and those specific to a series at a time
# point. The second form can be obtained from the first by repeating the
# predictors.
#
# There should be a role for shrinkage across series in choosing the prior.
# That comes later.
#
# Args:
# formula: Either a formula as one would supply to lm(), or a matrix of data.
# If a matrix, rows represent time, and columns represent time series. If a
# matrix is supplied then set data.format to "wide".
# shared.state.specification: A list of SharedStateSpecification objects
# defining the components of state that are shared across multiple series.
# series.state.specification: A list of StateSpecification objects defining
# the series-specific components of state. There are two options for how
# this argument can be formatted.
# - It can be a single list with StateSpecification objects as elements. In
# this case each object will be used to define the series-specific state
# for each time series in the response.
# - It can be a list with length matching the number of time series to be
# modeled. Each list element is a list of StateSpecification objects
# defining the series-specific state for the corresponding time series.
# This option is less convenient, but allows greater control.
# data: An optional data frame containing the data referenced in 'formula.'
# timestamps: A vector of timestamps indicating the time of each observation.
# If "wide" data are used this argument is optional, as the timestamps will
# be inferred from the rows of the data matrix.
# series.id: A factor-like object indicating the time series to which each
# observation belongs. This is only necessary for data in "long" format.
# prior: The prior distribution over any regression coefficients or other
# parameters of the observatino distribution.
# contrasts: an optional list containing the names of contrast functions to
# use when converting factors numeric variables in a regression formula.
# This argument works exactly as it does in 'lm'. The names of the list
# elements correspond to factor variables in your model formula. The list
# elements themselves are the names of contrast functions (see
# help(contrast) and the 'contrasts.arg' argument to
# 'model.matrix.default'). This argument is only used if a model formula is
# specified. It can usually be ignored even then.
# na.action: What to do about missing values. The default is to allow
# missing responses, but no missing predictors. Set this to na.omit or
# na.exclude if you want to omit missing responses altogether, but do so
# with care, as that will remove observations from the time series.
# niter: a positive integer giving the desired number of MCMC draws
# ping: A scalar. If ping > 0 then the program will print a status message to
# the screen every 'ping' MCMC iterations.
# data.format: Indicates whether the data are in wide or long form. Wide data
# will be converted to long format using Bsts::WideToLong.
# seed: The seed for the C++ random number generator.
# ...: Extra arguments are passed to DefaultMbstsPrior().
mbsts <- function(formula,
shared.state.specification,
series.state.specification = NULL,
data = NULL,
timestamps = NULL,
series.id = NULL,
prior = NULL, # TODO
opts = NULL,
contrasts = NULL,
na.action = na.pass,
niter,
ping = niter / 10,
data.format = c("long", "wide"),
seed = NULL,
...) {
data.format <- match.arg(data.format)
check.nonnegative.scalar(niter)
check.scalar.integer(ping)
stopifnot(is.null(seed) || length(seed) == 1)
if (!is.null(seed)) {
seed <- as.integer(seed)
}
# The first step is to properly format the data. We need an object called
# data.list that contains the following:
# - predictors: a matrix of predictors
# - response: a vector of responses
# - series: a factor indicating the time series each element of 'response'
# belongs to.
# - timestamp.info: A list created by TimestampInfo
#
# The coming if/then block is longer than you'd like, but it needs to take
# place here so the 'model.matrix' black magic can work.
has.regression <- is.language(formula)
if (has.regression) {
function.call <- match.call()
my.model.frame <- match.call(expand.dots = FALSE)
frame.match <- match(c("formula", "data", "na.action"),
names(my.model.frame), 0L)
my.model.frame <- my.model.frame[c(1L, frame.match)]
my.model.frame$drop.unused.levels <- TRUE
# In an ordinary regression model the default action for NA's is to delete
# them. This makes sense in ordinary regression models, but is dangerous in
# time series, because it artificially shortens the time between two data
# points. If the user has not specified an na.action function argument then
# we should use na.pass as a default, so that NA's are passed through to the
# underlying C++ code.
if (! "na.action" %in% names(my.model.frame)) {
my.model.frame$na.action <- na.pass
}
my.model.frame[[1L]] <- as.name("model.frame")
my.model.frame <- eval(my.model.frame, parent.frame())
model.terms <- attr(my.model.frame, "terms")
predictors <- model.matrix(model.terms, my.model.frame, contrasts)
if (any(is.na(predictors))) {
stop("bsts does not allow NA's in the predictors, only the responses.")
}
response <- model.response(my.model.frame, "any")
if (data.format == "wide") {
response.frame <- WideToLong(response)
if (is.null(timestamps)) {
timestamps <- response.frame$time
} else {
timestamps <- timestamps[response.frame$time]
}
series.id <- response.frame$series.id
expanded.predictors <- predictors[series.id, ]
response <- response.frame$values
}
stopifnot(length(response) == nrow(predictors))
} else {
## If there is no regression component the data could be in either long or
## wide format. Make sure it is long.
response <- formula
if (data.format == "wide") {
# Handle a data frames like a matrix.
if (is.data.frame(response) && all(sapply(response, is.numeric))) {
response <- as.matrix(response)
## TODO: Ensure we don't lose zoo timestamps here.
}
response.frame <- WideToLong(response)
timestamps <- response.frame$time
series.id <- response.frame$series
response <- response.frame$values
}
if (is.data.frame(response) || is.matrix(response)) {
stop("Please change data.format to 'wide' if passing matrix-valued time series")
}
predictors <- matrix(1, nrow = length(response), ncol = 1)
colnames(predictors) <- "Intercept"
}
series.id <- as.factor(series.id)
nseries <- length(levels(series.id))
data.list <- list(
predictors = predictors,
response = response,
series.id = series.id,
timestamp.info = TimestampInfo(predictors, NULL, timestamps)
)
#------------------------------------------------------------------------
# Check the format of the state specification.
#------------------------------------------------------------------------
spec <- .CheckMbstsStateSpecification(shared.state.specification,
series.state.specification, nseries)
shared.state.specification <- spec[[1]]
series.state.specification <- spec[[2]]
#------------------------------------------------------------------------
# Ensure the prior has the proper format.
#------------------------------------------------------------------------
if (is.null(prior)) {
prior <- .DefaultMbstsPrior(predictors, response, series.id)
}
# The prior for the observation model is a list of spike adn slab priors.
stopifnot(is.list(prior), length(prior) == nseries,
all(sapply(prior, inherits, "SpikeSlabPrior")))
#------------------------------------------------------------------------
# Check that options is either NULL or a list. A bit of faith is needed that
# the list is formatted correctly.
# ------------------------------------------------------------------------
if (!is.null(opts)) {
stopifnot(is.list(opts))
}
#------------------------------------------------------------------------
# Check that all the scalars are actually scalars.
#------------------------------------------------------------------------
if (!is.null(seed)) {
seed <- as.integer(seed)
stopifnot(length(seed) == 1)
}
check.scalar.integer(niter)
check.scalar.integer(ping)
#------------------------------------------------------------------------
# Do the work!
#------------------------------------------------------------------------
ans <- .Call("analysis_common_r_fit_multivariate_bsts_model_",
data.list,
shared.state.specification,
series.state.specification,
prior,
opts,
as.integer(niter),
as.integer(ping),
seed)
ans$has.regression <- has.regression
ans$shared.state.specification <- shared.state.specification
ans$series.state.specification <- series.state.specification
ans$prior <- prior
ans$niter <- niter
ans$timestamp.info <- data.list$timestamp.info
ans$series.id <- series.id
ans$original.series <- response
ans$predictors <- predictors
#------------------------------------------------------------------------
# Final formatting.
#------------------------------------------------------------------------
# Set dimnames.
series.names <- as.character(levels(series.id))
predictor.names <- colnames(predictors)
state.model.names <- sapply(shared.state.specification,
function(x) x$name)
ans$nseries <- length(series.names)
dimnames(ans$regression.coefficients) <- list(
NULL, series.names, predictor.names)
dimnames(ans$shared.state.contributions) <- list(
NULL, state.model.names, series.names, NULL)
class(ans) <- "mbsts"
return(ans)
}
.DefaultMbstsPrior <- function(predictors, response, series) {
## Set a default prior on each of the regression models in the mbsts
## observation equation.
ans <- list()
for (s in sort(unique(series))) {
index <- series == s
data.list <- list(predictors = predictors[index, , drop = FALSE],
response = response[index])
ans[[s]] <- .SetDefaultPrior(data.list, family = "gaussian")
}
return(ans)
}
.CheckMbstsStateSpecification <- function(shared.state.specification,
series.state.specification,
nseries) {
# Check that the shared- and series-specific state specifictions are filled
# with legal values.
#
# Args:
# shared.state.specification: A list of SharedStateSpecification objects
# defining the components of state that are shared across multiple series.
# series.state.specification: A list of StateSpecification objects defining
# the series-specific components of state. There are two options for how
# this argument can be formatted.
# - It can be a single list with StateSpecification objects as elements. In
# this case each object will be used to define the series-specific state
# for each time series in the response.
# - It can be a list with length matching the number of time series to be
# modeled. Each list element is a list of StateSpecification objects
# defining the series-specific state for the corresponding time series.
# This option is less convenient, but allows greater control.
# nseries: The number of time series.
#
# Returns:
# A 2-item list containing (1) the shared.state.specification, and (2) the
# series.state.specification, possibly after expanding it.
stopifnot(is.list(shared.state.specification),
all(sapply(shared.state.specification, inherits, "SharedStateModel")))
# The series.state.specification might be an empty list.
stopifnot(is.null(series.state.specification)
|| is.list(series.state.specification))
# If series state specification is not NULL it is either a list of state
# specificiations to be repeated for each time series, or it is a list of such
# specifications.
if (!is.null(series.state.specification) &&
all(sapply(series.state.specification, inherits, "StateModel"))) {
series.state.specification <- RepList(series.state.specification, nseries)
}
if (!is.null(series.state.specification)) {
stopifnot(is.list(series.state.specification),
length(series.state.specification) == nseries)
for (i in 1:nseries) {
stopifnot(is.null(series.state.specification[[i]]) ||
is.list(series.state.specification[[i]]),
all(sapply(series.state.specification[[i]], inherits, "StateModel")))
}
}
return(list(shared.state.specification, series.state.specification))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.