Nothing
#' Set up control for sentiment-based sparse regression modeling
#'
#' @author Samuel Borms, Keven Bluteau
#'
#' @description Sets up control object for linear or nonlinear modeling of a response variable onto a large panel of
#' textual sentiment measures (and potentially other variables). See \code{\link{sento_model}} for details on the
#' estimation and calibration procedure.
#'
#' @param model a \code{character} vector with one of the following: \code{"gaussian"} (linear regression), \code{"binomial"}
#' (binomial logistic regression), or \code{"multinomial"} (multinomial logistic regression).
#' @param type a \code{character} vector indicating which model calibration approach to use. Supports "\code{BIC}",
#' "\code{AIC}" and "\code{Cp}" (Mallows's Cp) as sparse regression adapted information criteria (Tibshirani and Taylor,
#' 2012; Zou, Hastie and Tibshirani, 2007), and "\code{cv}" (cross-validation based on the \code{\link[caret]{train}}
#' function from the \pkg{caret} package). The adapted information criteria are only available for a linear regression.
#' @param do.intercept a \code{logical}, \code{TRUE} by default fits an intercept.
#' @param h an \code{integer} value that shifts the time series to have the desired prediction setup; \code{h = 0} means
#' no change to the input data (nowcasting assuming data is aligned properly), \code{h > 0} shifts the dependent variable by
#' \code{h} periods (i.e., rows) further in time (forecasting), \code{h < 0} shifts the independent variables by \code{h}
#' periods.
#' @param alphas a \code{numeric} vector of the alphas to test for during calibration, between 0 and 1. A value of
#' 0 pertains to Ridge regression, a value of 1 to LASSO regression; values in between are pure elastic net.
#' @param lambdas a \code{numeric} vector of the lambdas to test for during calibration, \eqn{>= 0}.
#' A value of zero means no regularization, thus requires care when the data is fat. By default set to
#' \code{NULL}, such that the lambdas sequence is generated by the \code{\link[glmnet]{glmnet}} function
#' or set to \code{10^seq(2, -2, length.out = 100)} in case of cross-validation.
#' @param trainWindow a positive \code{integer} as the size of the training sample for cross-validation (ignored if
#' \code{type != } "\code{cv}").
#' @param testWindow a positive \code{integer} as the size of the test sample for cross-validation (ignored if \code{type != }
#' "\code{cv}").
#' @param oos a non-negative \code{integer} to indicate the number of periods to skip from the end of the training sample
#' up to the out-of-sample prediction(s). This is either used in the cross-validation based calibration approach
#' (if \code{type = } "\code{cv}"), or for the iterative out-of-sample prediction analysis (if \code{do.iter = TRUE}). For
#' instance, given \eqn{t}, the (first) out-of-sample prediction is computed at \eqn{t +} \code{oos} \eqn{+ 1}.
#' @param do.iter a \code{logical}, \code{TRUE} induces an iterative estimation of models at the given \code{nSample} size and
#' performs the associated out-of-sample prediction exercise through time.
#' @param do.progress a \code{logical}, if \code{TRUE} progress statements are displayed during model calibration.
#' @param nSample a positive \code{integer} as the size of the sample for model estimation at every iteration (ignored if
#' \code{do.iter = FALSE}).
#' @param start a positive \code{integer} to indicate at which point the iteration has to start (ignored if
#' \code{do.iter = FALSE}). For example, given 100 possible iterations, \code{start = 70} leads to model estimations
#' only for the last 31 samples.
#' @param nCore a positive \code{integer} to indicate the number of cores to use for a parallel iterative model
#' estimation (\code{do.iter = TRUE}). We use the \code{\%dopar\%} construct from the \pkg{foreach} package. By default,
#' \code{nCore = 1}, which implies no parallelization. No progress statements are displayed whatsoever when \code{nCore > 1}.
#' For cross-validation models, parallelization can also be carried out for a single-shot model (\code{do.iter = FALSE}),
#' whenever a parallel backend is set up. See the examples in \code{\link{sento_model}}.
#' @param do.difference a \code{logical}, \code{TRUE} will difference the target variable \code{y} supplied in the
#' \code{\link{sento_model}} function with as lag the absolute value of the \code{h} argument, but
#' \code{abs(h) > 0} is required. For example, if \code{h = 2}, and assuming the \code{y} variable is properly aligned
#' date-wise with the explanatory variables denoted by \eqn{X} (the sentiment measures and other in \code{x}), the regression
#' will be of \eqn{y_{t + 2} - y_t} on \eqn{X_t}. If \code{h = -2}, the regression fitted is \eqn{y_{t + 2} - y_t} on
#' \eqn{X_{t+2}}. The argument is always kept at \code{FALSE} if the \code{model} argument is one of
#' \code{c("binomial", "multinomial")}.
#' @param do.shrinkage.x a \code{logical} vector to indicate which of the other regressors provided through the \code{x}
#' argument of the \code{\link{sento_model}} function should be subject to shrinkage (\code{TRUE}). If argument is of
#' length one, it applies to all external regressors.
#'
#' @return A \code{list} encapsulating the control parameters.
#'
#' @seealso \code{\link{sento_model}}
#'
#' @references Tibshirani and Taylor (2012). \strong{Degrees of freedom in LASSO problems}.
#' \emph{The Annals of Statistics 40, 1198-1232}, \doi{10.1214/12-AOS1003}.
#' @references Zou, Hastie and Tibshirani (2007). \strong{On the degrees of freedom of the LASSO}.
#' \emph{The Annals of Statistics 35, 2173-2192}, \doi{10.1214/009053607000000127}.
#'
#' @examples
#' # information criterion based model control functions
#' ctrIC1 <- ctr_model(model = "gaussian", type = "BIC", do.iter = FALSE, h = 0,
#' alphas = seq(0, 1, by = 0.10))
#' ctrIC2 <- ctr_model(model = "gaussian", type = "AIC", do.iter = TRUE, h = 4, nSample = 100,
#' do.difference = TRUE, oos = 3)
#'
#' # cross-validation based model control functions
#' ctrCV1 <- ctr_model(model = "gaussian", type = "cv", do.iter = FALSE, h = 0,
#' trainWindow = 250, testWindow = 4, oos = 0, do.progress = TRUE)
#' ctrCV2 <- ctr_model(model = "binomial", type = "cv", h = 0, trainWindow = 250,
#' testWindow = 4, oos = 0, do.progress = TRUE)
#' ctrCV3 <- ctr_model(model = "multinomial", type = "cv", h = 2, trainWindow = 250,
#' testWindow = 4, oos = 2, do.progress = TRUE)
#' ctrCV4 <- ctr_model(model = "gaussian", type = "cv", do.iter = TRUE, h = 0, trainWindow = 45,
#' testWindow = 4, oos = 0, nSample = 70, do.progress = TRUE)
#'
#' @export
ctr_model <- function(model = c("gaussian", "binomial", "multinomial"), type = c("BIC", "AIC", "Cp", "cv"),
do.intercept = TRUE, do.iter = FALSE, h = 0, oos = 0, do.difference = FALSE,
alphas = seq(0, 1, by = 0.20), lambdas = NULL, nSample = NULL, trainWindow = NULL,
testWindow = NULL, start = 1, do.shrinkage.x = FALSE, do.progress = TRUE, nCore = 1) {
if (length(model) > 1) model <- model[1]
if (length(type) > 1) type <- type[1]
err <- NULL
if (!(model %in% c("gaussian", "binomial", "multinomial"))) {
err <- c(err, "Provide a proper modeling type under 'model'.")
}
if (!(type %in% c("BIC", "AIC", "Cp", "cv"))) {
err <- c(err, "Provide a proper calibration type under 'type'.")
}
if (length(do.intercept) != 1 || !is.logical(do.intercept)) {
err <- c(err, "The 'do.intercept' argument should be a logical of size one.")
}
if (!is.logical(do.shrinkage.x)) {
err <- c(err, "The 'do.shrinkage.x' argument should be of type logical.")
}
if (model %in% c("binomial", "multinomial") && type != "cv") {
err <- c(err, "Information criteria are currently only supported for linear models, opt for cross-validation instead.")
}
if (oos < 0) {
err <- c(err, "Make sure 'oos' is positive.")
}
if (min(alphas) < 0 || max(alphas) > 1) {
err <- c(err, "Each alpha value in 'alphas' must be between 0 and 1, inclusive.")
}
if (ifelse(is.null(lambdas), 0, min(lambdas)) < 0) {
err <- c(err, "Each lambda value in 'lambdas' must be at least 0.")
}
if (do.iter == FALSE) nSample <- start <- NULL
if (do.iter == TRUE && is.null(nSample)) {
err <- c(err, "Iterative modeling requires a non-NULL sample size given by 'nSample'.")
}
if (do.iter == TRUE && !is.null(nSample)) {
if (nSample <= 0) {
err <- c(err, "Make sure 'nSample' is positive.")
}
if (start <= 0) {
err <- c(err, "Make sure 'start' is non-negative.")
}
}
if (type == "cv" && (is.null(trainWindow) || is.null(testWindow))) {
err <- c(err, "Cross-validation requires a non-NULL training and test window size given by 'trainWindow' and 'testWindow'.")
}
if (!is.null(trainWindow)) {
if (trainWindow <= 0) {
err <- c(err, "Make sure 'trainWindow' is positive.")
}
}
if (!is.null(testWindow)) {
if (testWindow <= 0) {
err <- c(err, "Make sure 'testWindow' is positive.")
}
}
if (!is.null(nSample) && !is.null(trainWindow) && !is.null(testWindow)) {
if ((trainWindow + oos + testWindow) >= nSample) {
err <- c(err, "('trainWindow' + 'oos' + 'testWindow') >= 'nSample'. Adjust windows selection accordingly.")
}
}
if (length(nCore) != 1 || !is.numeric(nCore)) {
err <- c(err, "The 'nCore' argument should be a numeric vector of size one.")
} else nCore <- check_nCore(nCore)
if (model %in% c("binomial", "multinomial")) do.difference <- FALSE
if (length(do.difference) != 1 || !is.logical(do.difference)) {
err <- c(err, "The 'do.difference' argument should be a logical of size one.")
}
if (do.difference == TRUE && abs(h) == 0) {
err <- c(err, "If the 'do.difference' argument is TRUE, the absolute value of 'h' should be positive.")
}
if (!is.null(err)) stop("Wrong inputs. See below for specifics. \n", paste0(err, collapse = "\n"))
ctr_model <- list(model = model,
type = type,
do.intercept = do.intercept,
do.iter = do.iter,
h = h,
oos = oos,
do.difference = do.difference,
do.shrinkage.x = do.shrinkage.x,
nSample = nSample,
start = start,
alphas = unique(alphas),
lambdas = unique(lambdas),
trainWindow = trainWindow,
testWindow = testWindow,
do.progress = do.progress,
nCore = nCore)
return(ctr_model)
}
#' Optimized and automated sentiment-based sparse regression
#'
#' @author Samuel Borms, Keven Bluteau
#'
#' @description Linear or nonlinear penalized regression of any dependent variable on the wide number of sentiment measures and
#' potentially other explanatory variables. Either performs a regression given the provided variables at once, or computes
#' regressions sequentially for a given sample size over a longer time horizon, with associated prediction performance metrics.
#'
#' @details Models are computed using the elastic net regularization as implemented in the \pkg{glmnet} package, to account for
#' the multidimensionality of the sentiment measures. Independent variables are normalized in the regression process, but
#' coefficients are returned in their original space. For a helpful introduction to \pkg{glmnet}, we refer to their
#' \href{https://web.stanford.edu/~hastie/glmnet/glmnet_alpha.html#lin}{vignette}. The optimal elastic net parameters
#' \code{lambda} and \code{alpha} are calibrated either through a to specify information criterion or through
#' cross-validation (based on the "rolling forecasting origin" principle, using the \code{\link[caret]{train}} function).
#' In the latter case, the training metric is automatically set to \code{"RMSE"} for a linear model and to \code{"Accuracy"}
#' for a logistic model. We suppress many of the details that can be supplied to the \code{\link[glmnet]{glmnet}} and
#' \code{\link[caret]{train}} functions we rely on, for the sake of user-friendliness.
#'
#' @param sento_measures a \code{sento_measures} object created using \code{\link{sento_measures}}.
#' @param y a one-column \code{data.frame} or a \code{numeric} vector capturing the dependent (response) variable. In case of
#' a logistic regression, the response variable is either a \code{factor} or a \code{matrix} with the factors represented by
#' the columns as binary indicators, with the second factor level or column as the reference class in case of a binomial
#' regression. No \code{NA} values are allowed.
#' @param x a named \code{data.table}, \code{data.frame} or \code{matrix} with other explanatory variables as \code{numeric}, by
#' default set to \code{NULL}.
#' @param ctr output from a \code{\link{ctr_model}} call.
#'
#' @return If \code{ctr$do.iter = FALSE}, a \code{sento_model} object which is a \code{list} containing:
#' \item{reg}{optimized regression, i.e., a model-specific \pkg{glmnet} object, including for example the estimated
#' coefficients.}
#' \item{model}{the input argument \code{ctr$model}, to indicate the type of model estimated.}
#' \item{alpha}{calibrated alpha.}
#' \item{lambda}{calibrated lambda.}
#' \item{trained}{output from \code{\link[caret]{train}} call (if \code{ctr$type =} "\code{cv}"). There is no such
#' output if the control parameters \code{alphas} and \code{lambdas} both specify one value.}
#' \item{ic}{a \code{list} composed of two elements: under \code{"criterion"}, the type of information criterion used in the
#' calibration, and under \code{"matrix"}, a \code{matrix} of all information criterion values for \code{alphas} as rows
#' and the respective lambda values as columns (if \code{ctr$type != } "\code{cv}"). Any \code{NA} value in the latter
#' element means the specific information criterion could not be computed.}
#' \item{dates}{sample reference dates as a two-element \code{character} vector, being the earliest and most recent date from
#' the \code{sento_measures} object accounted for in the estimation window.}
#' \item{nVar}{a vector of size two, with respectively the number of sentiment measures, and the number of other explanatory
#' variables inputted.}
#' \item{discarded}{a named \code{logical} vector of length equal to the number of sentiment measures, in which \code{TRUE}
#' indicates that the particular sentiment measure has not been considered in the regression process. A sentiment measure is
#' not considered when it is a duplicate of another, or when at least 50\% of the observations are equal to zero.}
#'
#' @return If \code{ctr$do.iter = TRUE}, a \code{sento_modelIter} object which is a \code{list} containing:
#' \item{models}{all sparse regressions, i.e., separate \code{sento_model} objects as above, as a \code{list} with as names the
#' dates from the perspective of the sentiment measures at which the out-of-sample predictions are carried out.}
#' \item{alphas}{calibrated alphas.}
#' \item{lambdas}{calibrated lambdas.}
#' \item{performance}{a \code{data.frame} with performance-related measures, being "\code{RMSFE}" (root mean squared
#' forecasting error), "\code{MAD}" (mean absolute deviation), "\code{MDA}" (mean directional accuracy, in which's calculation
#' zero is considered as a positive; in p.p.), "\code{accuracy}" (proportion of correctly predicted classes in case
#' of a logistic regression; in p.p.), and each's respective individual values in the sample. Directional accuracy
#' is measured by comparing the change in the realized response with the change in the prediction between two consecutive time
#' points (omitting the very first prediction as \code{NA}). Only the relevant performance statistics are given
#' depending on the type of regression. Dates are as in the \code{"models"} output element, i.e., from the perspective of the
#' sentiment measures.}
#'
#' @seealso \code{\link{ctr_model}}, \code{\link[glmnet]{glmnet}}, \code{\link[caret]{train}}, \code{\link{attributions}}
#'
#' @examples
#' \dontrun{
#' data("usnews", package = "sentometrics")
#' data("list_lexicons", package = "sentometrics")
#' data("list_valence_shifters", package = "sentometrics")
#' data("epu", package = "sentometrics")
#'
#' set.seed(505)
#'
#' # construct a sento_measures object to start with
#' corpusAll <- sento_corpus(corpusdf = usnews)
#' corpus <- quanteda::corpus_subset(corpusAll, date >= "2004-01-01")
#' l <- sento_lexicons(list_lexicons[c("LM_en", "HENRY_en")])
#' ctr <- ctr_agg(howWithin = "counts", howDocs = "proportional",
#' howTime = c("equal_weight", "linear"),
#' by = "month", lag = 3)
#' sento_measures <- sento_measures(corpus, l, ctr)
#'
#' # prepare y and other x variables
#' y <- epu[epu$date %in% get_dates(sento_measures), "index"]
#' length(y) == nobs(sento_measures) # TRUE
#' x <- data.frame(runif(length(y)), rnorm(length(y))) # two other (random) x variables
#' colnames(x) <- c("x1", "x2")
#'
#' # a linear model based on the Akaike information criterion
#' ctrIC <- ctr_model(model = "gaussian", type = "AIC", do.iter = FALSE, h = 4,
#' do.difference = TRUE)
#' out1 <- sento_model(sento_measures, y, x = x, ctr = ctrIC)
#'
#' # attribution and prediction as post-analysis
#' attributions1 <- attributions(out1, sento_measures,
#' refDates = get_dates(sento_measures)[20:25])
#' plot(attributions1, "features")
#'
#' nx <- nmeasures(sento_measures) + ncol(x)
#' newx <- runif(nx) * cbind(data.table::as.data.table(sento_measures)[, -1], x)[30:40, ]
#' preds <- predict(out1, newx = as.matrix(newx), type = "link")
#'
#' # an iterative out-of-sample analysis, parallelized
#' ctrIter <- ctr_model(model = "gaussian", type = "BIC", do.iter = TRUE, h = 3,
#' oos = 2, alphas = c(0.25, 0.75), nSample = 75, nCore = 2)
#' out2 <- sento_model(sento_measures, y, x = x, ctr = ctrIter)
#' summary(out2)
#'
#' # plot predicted vs. realized values
#' p <- plot(out2)
#' p
#'
#' # a cross-validation based model, parallelized
#' cl <- parallel::makeCluster(2)
#' doParallel::registerDoParallel(cl)
#' ctrCV <- ctr_model(model = "gaussian", type = "cv", do.iter = FALSE,
#' h = 0, alphas = c(0.10, 0.50, 0.90), trainWindow = 70,
#' testWindow = 10, oos = 0, do.progress = TRUE)
#' out3 <- sento_model(sento_measures, y, x = x, ctr = ctrCV)
#' parallel::stopCluster(cl)
#' foreach::registerDoSEQ()
#' summary(out3)
#'
#' # a cross-validation based model for a binomial target
#' yb <- epu[epu$date %in% get_dates(sento_measures), "above"]
#' ctrCVb <- ctr_model(model = "binomial", type = "cv", do.iter = FALSE,
#' h = 0, alphas = c(0.10, 0.50, 0.90), trainWindow = 70,
#' testWindow = 10, oos = 0, do.progress = TRUE)
#' out4 <- sento_model(sento_measures, yb, x = x, ctr = ctrCVb)
#' summary(out4)}
#'
# @import glmnet
#' @export
sento_model <- function(sento_measures, y, x = NULL, ctr) {
check_class(sento_measures, "sento_measures")
if (!is.null(x)) {
stopifnot(is.data.frame(x) || is.matrix(x))
stopifnot(!is.null(colnames(x)))
stopifnot(unique(apply(x, 2, class)) == "numeric")
}
if (any(is.na(y))) stop("No NA values are allowed in y.")
if (length(unique(c(nobs(sento_measures), ifelse(is.null(nrow(y)), length(y), nrow(y)), nrow(x)))) != 1)
stop("Number of rows or length for y, x and measures in sento_measures must be equal.")
if (ctr$model == "binomial" && ifelse(is.factor(y), nlevels(y), NCOL(y)) > 2)
stop("At maximum two classes allowed in 'y' for a binomial model.")
if (ctr$model == "multinomial" && !(ifelse(is.factor(y), nlevels(y), NCOL(y)) > 2))
stop("At least three classes needed in 'y' for a multinomial model.")
family <- ctr$model
type <- ctr$type
do.intercept <- ctr$do.intercept
do.iter <- ctr$do.iter
h <- ctr$h
oos <- ctr$oos # used when type is "cv" or when do.iter is TRUE
do.difference <- ctr$do.difference
do.shrinkage.x <- ctr$do.shrinkage.x
alphas <- ctr$alphas
lambdas <- ctr$lambdas
do.progress <- ctr$do.progress
trainWindow <- ctr$trainWindow # used when type is "cv"
testWindow <- ctr$testWindow # used when type is "cv"
nSample <- ctr$nSample # used when do.iter is TRUE
start <- ctr$start # used when do.iter is TRUE
nCore <- ctr$nCore # used when do.iter is TRUE
if (!is.null(x)) {
nx <- ncol(x)
if (length(do.shrinkage.x) == 1) do.shrinkage.x <- rep(do.shrinkage.x, nx)
else if (length(do.shrinkage.x) != nx)
stop("The length of the 'do.shrinkage.x' argument is not in line with the number of columns in 'x'.")
} else do.shrinkage.x <- NULL
if (do.iter == TRUE) {
out <- run_sento_modelIter(sento_measures, y = y, x = x, h = h, family = family, do.intercept = do.intercept,
alphas = alphas, lambdas = lambdas, type = type, nSample = nSample,
start = start, oos = oos, trainWindow = trainWindow, testWindow = testWindow,
do.progress = do.progress, nCore = nCore, do.iter = do.iter,
do.difference = do.difference, do.shrinkage.x = do.shrinkage.x)
} else {
out <- run_sento_model(sento_measures, y = y, x = x, h = h, family = family, do.intercept = do.intercept,
alphas = alphas, lambdas = lambdas, type = type, trainWindow = trainWindow,
testWindow = testWindow, oos = oos, do.progress = do.progress,
do.difference = do.difference, do.shrinkage.x = do.shrinkage.x, do.iter = do.iter)
}
return(out)
}
.run_sento_model <- function(sento_measures, y, x, h, alphas, lambdas, do.intercept, trainWindow, testWindow,
oos, type, do.progress, family, do.difference, do.shrinkage.x, ...) {
# inputs i and nSample are NULL if one-shot model (not iterative)
dots <- list(...)
i <- dots$i
nSample <- dots$nSample
nx <- ifelse(is.null(x), 0, ncol(x))
nVar <- c(nmeasures(sento_measures), nx) # number of explanatory variables (before cleaning)
alignedVars <- align_variables(y, sento_measures, x, h, difference = do.difference, i = i, nSample = nSample)
yy <- alignedVars$y
xx <- alignedVars$x # xx includes sentiment measures and other variables
cleaned <- clean_panel(xx, nx) # drop duplicated and too sparse sentiment variables
xx <- cleaned$xNew
discarded <- cleaned$discarded
penalty <- c(rep(1, ncol(xx) - nx), as.numeric(do.shrinkage.x)) # 1 means shrinkage, 0 means no shrinkage
sampleDates <- c(alignedVars$datesX[1], alignedVars$datesX[nrow(xx)])
if (type == "cv") { # cross-validation
do.iter <- dots$do.iter
# train model based on slices in sliced
sliced <- create_cv_slices(1:nrow(yy), trainWindow, testWindow, skip = oos, do.reverse = FALSE)
ctrTrain <- caret::trainControl(index = sliced$train, indexOut = sliced$test,
allowParallel = ifelse(do.iter, FALSE, TRUE))
if (is.null(lambdas)) lambdas <- 10^seq(2, -2, length.out = 100) # default lambdas sequence for cross-validation
tuneGrid <- expand.grid(alpha = alphas, lambda = lambdas)
# change y variable to format required in caret::train() function
if (family == "gaussian") yyy <- yy[, 1]
else yyy <- as.factor(colnames(yy)[yy %*% 1:ncol(yy)])
# align training metric based on whether family is a linear or classification model
metric <- ifelse(family == "gaussian", "RMSE", "Accuracy")
if (nrow(tuneGrid) == 1) {
alphaOpt <- tuneGrid[1, 1]
lambdaOpt <- tuneGrid[1, 2]
outAdd <- NULL
} else {
if (do.progress == TRUE) cat("Training model... ")
trained <- caret::train(x = xx, y = yyy, method = "glmnet", family = family,
standardize = TRUE, penalty.factor = penalty,
trControl = ctrTrain, tuneGrid = tuneGrid, metric = metric)
if (do.progress == TRUE) cat("Done.", "\n")
alphaOpt <- as.numeric(trained$bestTune[1])
lambdaOpt <- as.numeric(trained$bestTune[2])
outAdd <- list(trained = trained)
}
} else { # information criterion
dfs <- as.list(numeric(length(alphas)))
for (i in seq_along(alphas)) {
alpha <- alphas[i]
if (do.progress == TRUE) {
if (length(alphas) == 1) cat("alphas run: ", alpha, "\n", sep = "")
else if (alpha == alphas[1]) cat("alphas run: ", alpha, ", ", sep = "")
else if (alpha == utils::tail(alphas, 1)) cat(alpha, "\n", sep = "")
else cat(alpha, ", ", sep = "")
}
reg <- glmnet::glmnet(x = xx, y = yy, penalty.factor = penalty, intercept = do.intercept,
alpha = alpha, lambda = lambdas, standardize = TRUE, family = family)
lambdasReg <- reg$lambda
xScaled <- scale(xx)
xA <- lapply(1:length(lambdasReg), function(j) return(as.matrix(xScaled[, which(reg$beta[, j] != 0)])))
yEst <- stats::predict(reg, newx = xx)
dfs[[i]] <- list(lambda = lambdasReg,
df = compute_df(alpha = alpha, lambda = lambdasReg, xA = xA), # C++ implementation
RSS = apply(yEst, 2, FUN = function(est) sum((yy - est)^2)))
}
# define function that pinpoints optimal alpha and lambda parameters
extract_optim_params <- function(dfs, y, ic, alphas) {
N <- max(sapply(dfs, function(df) return(length(df$lambda))))
lambdasMat <- dfsMat <- RSSMat <- matrix(NA, nrow = length(dfs), ncol = N)
for (i in 1:length(dfs)) { # for every alpha
df <- dfs[[i]] # list of lambdas, degrees of freedom and RSS
K <- length(df$lambda)
lambdasMat[i, 1:K] <- df$lambda
dfsMat[i, 1:K] <- unlist(df$df)
RSSMat[i, 1:K] <- df$RSS
}
idx <- suppressWarnings(which(dfsMat == max(dfsMat, na.rm = TRUE), arr.ind = TRUE))
if (dim(idx)[1] == 0) {
sigma2 <- NA
} else {
dfMax <- dfsMat[idx[1, 1], idx[1, 2]]
k <- (nrow(y) - ifelse(nrow(y) < dfMax, nrow(y), dfMax)) # correction not carried over in dfsMat
sigma2 <- RSSMat[idx[1, 1], idx[1, 2]] / k
}
if (ic == "BIC")
IC <- compute_BIC(y = y, dfA = dfsMat, RSS = RSSMat, sigma2 = sigma2)
else if (ic == "AIC")
IC <- compute_AIC(y = y, dfA = dfsMat, RSS = RSSMat, sigma2 = sigma2)
else if (ic == "Cp")
IC <- compute_Cp(y = y, dfA = dfsMat, RSS = RSSMat, sigma2 = sigma2)
rownames(IC) <- alphas
opt <- suppressWarnings(which(IC == min(IC, na.rm = TRUE), arr.ind = TRUE))
if (dim(opt)[1] == 0) {
return(list(IC = IC, lambda = lambdasMat[1, 1], alpha = alphas[1]))
warning("Computation of none of the information criteria
resolved. First alpha and lambda used as optimum.")
}
else {
return(list(IC = IC, lambda = lambdasMat[opt[1, 1], opt[1, 2]], alpha = alphas[opt[1, 1]]))
}
}
# retrieve optimal alphas and lambdas
params <- extract_optim_params(dfs, yy, type, alphas)
lambdaOpt <- params$lambda
alphaOpt <- params$alpha
outAdd <- list(ic = list(criterion = type, matrix = params$IC))
}
# actual elastic net optimization
regOpt <- glmnet::glmnet(x = xx, y = yy, penalty.factor = penalty, intercept = do.intercept,
lambda = lambdaOpt, alpha = alphaOpt, standardize = TRUE, family = family)
out <- c(list(reg = regOpt,
model = family,
alpha = alphaOpt,
lambda = lambdaOpt,
dates = sampleDates,
nVar = nVar,
discarded = discarded),
outAdd)
class(out) <- "sento_model"
return(out)
}
#' @importFrom compiler cmpfun
run_sento_model <- compiler::cmpfun(.run_sento_model)
#' @importFrom foreach %dopar%
.run_sento_modelIter <- function(sento_measures, y, x, h, family, do.intercept, alphas, lambdas,
type, nSample, start, trainWindow, testWindow, oos, do.progress,
nCore, do.iter, do.difference, do.shrinkage.x) {
nIter <- ifelse(is.null(nrow(y)), length(y), nrow(y)) - nSample - abs(h) - oos
if (nIter <= 0 || start > nIter)
stop("Data not sufficient to do at least one iteration for given sample size, horizon, out-of-sample skip and start.")
# perform all regressions
if (nCore > 1) {
if (!all(sapply(c("parallel", "doParallel"), requireNamespace, quietly = TRUE))) {
stop("Packages 'parallel' and 'doParallel' needed for parallel iterative model estimation. Please install.")
}
cl <- parallel::makeCluster(min(parallel::detectCores(), nCore))
doParallel::registerDoParallel(cl)
regsOpt <- foreach::foreach(i = start:nIter,
.packages = "sentometrics",
.export = c("run_sento_model")) %dopar% {
out <- run_sento_model(
sento_measures, y, x, h, alphas = alphas, lambdas = lambdas, do.intercept = do.intercept,
trainWindow = trainWindow, testWindow = testWindow, oos = oos, type = type, do.progress = FALSE,
family = family, do.iter = do.iter, do.difference = do.difference, do.shrinkage.x = do.shrinkage.x,
nSample = nSample, i = i
)
return(out)
}
parallel::stopCluster(cl)
foreach::registerDoSEQ()
} else {
regsOpt <- lapply(start:nIter, function(i) {
if (do.progress == TRUE) cat("Iteration: ", (i - start + 1), " from ", (nIter - start + 1), "\n", sep = "")
out <- run_sento_model(
sento_measures, y, x, h, alphas = alphas, lambdas = lambdas, do.intercept = do.intercept,
trainWindow = trainWindow, testWindow = testWindow, oos = oos, type = type, do.progress = do.progress,
family = family, do.iter = do.iter, do.difference = do.difference, do.shrinkage.x = do.shrinkage.x,
nSample = nSample, i = i
)
return(out)
})
}
# get optimal alphas and lambdas
alphasOpt <- sapply(regsOpt, "[[", "alpha")
lambdasOpt <- sapply(regsOpt, "[[", "lambda")
# prepare for and get all predictions
alignedVarsAll <- align_variables(y, sento_measures, x, h, difference = do.difference)
oosRun <- start:nIter + nSample + oos
xPred <- alignedVarsAll$x[oosRun, , drop = FALSE]
yReal <- alignedVarsAll$y[oosRun, , drop = FALSE]
datesX <- alignedVarsAll$datesX[oosRun] # dates from perspective of x at which forecasts are made
names(regsOpt) <- datesX
if (family %in% c("binomial", "multinomial")) {
n <- length(colnames(yReal)) # number of factor levels
yEst <- matrix(rep(NA, n * (nIter - start + 1)), ncol = n)
colnames(yEst) <- colnames(yReal)
yEstClass <- rep(NA, nIter - start + 1)
} else {
yEst <- rep(NA, nIter - start + 1)
yEstClass <- NULL
}
for (j in 1:(nIter - start + 1)) {
reg <- regsOpt[[j]]
newx <- xPred[j, , drop = FALSE]
if (family == "gaussian") {
yEst[j] <- stats::predict(reg, newx = newx, type = "link")
} else if (family == "binomial") {
yEst[j, 2] <- stats::predict(reg, newx = newx, type = "response") # second factor
yEst[j, 1] <- 1 - yEst[j, 2]
yEstClass[j] <- stats::predict(reg, newx = newx, type = "class")
} else if (family == "multinomial") {
yEst[j, ] <- stats::predict(reg, newx = newx, type = "response")[1, , ]
yEstClass[j] <- as.character(stats::predict(reg, newx = newx, type = "class"))
}
}
# compute model performance
performance <- model_performance(yEst = yEst, yReal = yReal, family = family, dates = datesX, yEstClass = yEstClass)
out <- list(models = regsOpt,
alphas = alphasOpt,
lambdas = lambdasOpt,
performance = performance)
class(out) <- "sento_modelIter"
return(out)
}
#' @importFrom compiler cmpfun
run_sento_modelIter <- compiler::cmpfun(.run_sento_modelIter)
model_performance <- function(yEst, yReal, family, dates, ...) {
dots <- list(...)
if (family == "gaussian") {
dirAcc <- c(NA, as.numeric(sign(diff(yReal)) == sign(diff(yEst))))
error <- yEst - yReal
error2 <- error^2
absDev <- abs(error)
errors <- data.frame(cbind(dirAcc, error, error2, absDev))
colnames(errors) <- c("DA", "error", "errorSq", "AD")
meanErrors <- colMeans(errors[, -2], na.rm = TRUE)
raw <- data.frame(response = yReal, predicted = yEst, errors)
row.names(raw) <- dates
errorsAll <- list(raw = raw,
MDA = as.numeric(meanErrors["DA"]) * 100,
RMSFE = as.numeric(sqrt(meanErrors["errorSq"])),
MAD = as.numeric(meanErrors["AD"]))
} else if (family %in% c("binomial", "multinomial")) {
yRealClass <- as.factor(colnames(yReal)[yReal %*% 1:ncol(yReal)])
yEstClass <- dots$yEstClass
accuracy <- as.numeric(yRealClass == yEstClass)
accuracyProb <- (sum(accuracy)/length(accuracy)) * 100
raw <- data.frame(response = yRealClass, predicted = yEstClass, accuracy = accuracy)
row.names(raw) <- dates
errorsAll <- list(raw = raw, accuracy = accuracyProb)
}
return(errorsAll)
}
#' @export
summary.sento_model <- function(object, ...) {
sento_model <- object
reg <- sento_model$reg
if ("ic" %in% names(sento_model)) {
printCalib <- paste0("via ", sento_model$ic[[1]], " information criterion")
} else {
printCalib <- paste0("via cross-validation; ",
"ran through ", nrow(sento_model$trained$resample), " samples of size ",
length(sento_model$trained$control$index[[1]]),
", selection based on ", sento_model$trained$metric, " metric")
}
cat("Model specification \n")
cat(rep("-", 20), "\n \n")
cat("Model type:", sento_model$model, "\n")
cat("Calibration:", printCalib, "\n")
cat("Number of observations:", reg$nobs, "\n")
cat("Optimal elastic net alpha parameter:", round(sento_model$alpha, 2), "\n")
cat("Optimal elastic net lambda parameter:", round(reg$lambda, 2), "\n \n")
if (sento_model$model != "multinomial") {
cat("Non-zero coefficients \n")
cat(rep("-", 20), "\n")
print(nonzero_coeffs(reg))
} else {
cat("Number of non-zero coefficients per level (excl. intercept, incl. non-sentiment variables) \n")
cat(rep("-", 20), "\n")
nonZeros <- as.data.frame(sento_model$reg$dfmat)
colnames(nonZeros) <- NULL
print(nonZeros)
}
cat()
}
#' @export
print.sento_model <- function(x, ...) {
cat("A sento_model object.", "\n")
}
#' @export
summary.sento_modelIter <- function(object, ...) {
sento_modelIter <- object
sento_model <- sento_modelIter$models[[1]] # first sento_model object as representative object
model <- sento_model$model
reg <- sento_model$reg
if ("ic" %in% names(sento_model)) {
printCalib <- paste0("via ", sento_model$ic[[1]], " information criterion")
} else {
printCalib <- paste0("via cross-validation; ",
"Ran through ", nrow(sento_model$trained$resample), " samples of size ",
length(sento_model$trained$control$index[[1]]),
", selection based on ", sento_model$trained$metric, " metric")
}
cat("Model specification \n")
cat(rep("-", 20), "\n \n")
cat("Model type:", sento_model$model, "\n")
cat("Calibration:", printCalib, "\n")
cat("Sample size:", reg$nobs, "\n")
cat("Total number of iterations/predictions:", length(sento_modelIter$models), "\n")
cat("Optimal average elastic net alpha parameter:", round(mean(sento_modelIter$alphas, na.rm = TRUE), 2), "\n")
cat("Optimal average elastic net lambda parameter:", round(mean(sento_modelIter$lambdas, na.rm = TRUE), 2), "\n \n")
cat("Out-of-sample performance \n")
cat(rep("-", 20), "\n \n")
if (model == "gaussian") {
cat("Mean directional accuracy:", round(sento_modelIter$performance$MDA, 2), "% \n")
cat("Root mean squared prediction error:", round(sento_modelIter$performance$RMSFE, 2), "\n")
cat("Mean absolute deviation:", round(sento_modelIter$performance$MAD, 2))
} else {
cat("Accuracy:", sento_modelIter$performance$accuracy, "%")
}
cat("\n")
}
#' @export
print.sento_modelIter <- function(x, ...) {
cat("A sento_modelIter object.", "\n")
}
#' Plot iterative predictions versus realized values
#'
#' @author Samuel Borms
#'
#' @method plot sento_modelIter
#'
#' @description Displays a plot of all predictions made through the iterative model computation as incorporated in the
#' input \code{sento_modelIter} object, as well as the corresponding true values.
#'
#' @details See \code{\link{sento_model}} for an elaborate modeling example including the plotting of out-of-sample
#' performance.
#'
#' @param x a \code{sento_modelIter} object created using \code{\link{sento_model}}.
#' @param ... not used.
#'
#' @return Returns a simple \code{\link{ggplot}} object, which can be added onto (or to alter its default elements) by using
#' the \code{+} operator.
#'
#' @import ggplot2
#' @export
plot.sento_modelIter <- function(x, ...) {
sento_modelIter <- x
mF <- sento_modelIter$models[[1]]$model
if (mF == "gaussian") {
plotter <- geom_line()
scaleY <- scale_y_continuous(name = "Response")
} else {
plotter <- geom_point()
scaleY <- scale_y_discrete(name = "Response")
}
data <- data.table::data.table(date = row.names(sento_modelIter$performance$raw),
realized = sento_modelIter$performance$raw$response,
prediction = sento_modelIter$performance$raw$predicted)
if (mF != "gaussian") data[, 2:3] <- lapply(data[, 2:3], as.character)
melted <- data.table::melt(data, id.vars = "date")
p <- ggplot(data = melted, aes(x = as.Date(date), y = value, color = variable)) +
plotter +
scale_x_date(name = "Date", date_labels = "%m-%Y") +
scaleY +
theme_bw() +
plot_theme(legendPos = "top")
p
}
#' Make predictions from a sento_model object
#'
#' @author Samuel Borms
#'
#' @description Prediction method for \code{sento_model} class, with usage along the lines of
#' \code{\link{predict.glmnet}}, but simplified in terms of parameters.
#'
#' @param object a \code{sento_model} object created with \code{\link{sento_model}}.
#' @param newx a data \code{matrix} used for the prediction(s), row-by-row; see
#' \code{\link{predict.glmnet}}. The number of columns should be equal to \code{sum(sento_model$nVar)}, being the
#' number of original sentiment measures and other variables. The variables discarded in the regression process are
#' dealt with within this function, based on \code{sento_model$discarded}.
#' @param type type of prediction required, a value from \code{c("link", "response", "class")}, see documentation for
#' \code{\link{predict.glmnet}}.
#' @param offset not used.
#' @param ... not used.
#'
#' @return A prediction output depending on the \code{type} argument.
#'
#' @seealso \code{\link{predict.glmnet}}, \code{\link{sento_model}}
#'
#' @export
predict.sento_model <- function(object, newx, type = "response", offset = NULL, ...) {
stopifnot(is.matrix(newx))
reg <- object$reg
discarded <- object$discarded
n <- sum(object$nVar)
if (n != ncol(newx)) stop("Number of columns in 'newx' not equal to the required number of input variables.")
idx <- c(!discarded, rep(TRUE, (n - length(discarded)))) # TRUE means variable to keep for prediction
newx <- newx[, idx, drop = FALSE]
pred <- stats::predict(reg, newx = newx, type = type, offset = NULL)
return(pred)
}
#' Retrieve loss data from a selection of models
#'
#' @author Samuel Borms
#'
#' @description Structures specific performance data for a set of different \code{sento_modelIter} objects as loss data.
#' Can then be used, for instance, as an input to create a model confidence set (Hansen, Lunde and Nason, 2011) with
#' the \pkg{MCS} package.
#'
#' @param models a named \code{list} of \code{sento_modelIter} objects. All models should be of the same family, being
#' either \code{"gaussian"}, \code{"binomial"} or \code{"multinomial"}, and have performance data of the same dimensions.
#' @param loss a single \code{character} vector, either \code{"DA"} (directional \emph{in}accuracy), \code{"error"}
#' (predicted minus realized response variable), \code{"errorSq"} (squared errors), \code{"AD"} (absolute errors) or
#' \code{"accuracy"} (\emph{in}accurate class predictions). This argument defines on what basis the model confidence set
#' is calculated. The first four options are available for \code{"gaussian"} models, the last option applies only to
#' \code{"binomial"} and \code{"multinomial"} models.
#'
#' @return A \code{matrix} of loss data.
#'
#' @seealso \code{\link{sento_model}}, \code{\link[MCS]{MCSprocedure}}
#'
#' @references Hansen, Lunde and Nason (2011). \strong{The model confidence set}. \emph{Econometrica 79, 453-497},
#' \doi{10.3982/ECTA5771}.
#'
#' @examples
#' \dontrun{
#' data("usnews", package = "sentometrics")
#' data("list_lexicons", package = "sentometrics")
#' data("list_valence_shifters", package = "sentometrics")
#' data("epu", package = "sentometrics")
#'
#' set.seed(505)
#'
#' # construct two sento_measures objects
#' corpusAll <- sento_corpus(corpusdf = usnews)
#' corpus <- quanteda::corpus_subset(corpusAll, date >= "1997-01-01" & date < "2014-10-01")
#' l <- sento_lexicons(list_lexicons[c("LM_en", "HENRY_en")], list_valence_shifters[["en"]])
#'
#' ctrA <- ctr_agg(howWithin = "proportionalPol", howDocs = "proportional",
#' howTime = c("equal_weight", "linear"), by = "month", lag = 3)
#' sentMeas <- sento_measures(corpus, l, ctrA)
#'
#' # prepare y and other x variables
#' y <- epu[epu$date %in% get_dates(sentMeas), "index"]
#' length(y) == nobs(sentMeas) # TRUE
#' x <- data.frame(runif(length(y)), rnorm(length(y))) # two other (random) x variables
#' colnames(x) <- c("x1", "x2")
#'
#' # estimate different type of regressions
#' ctrM <- ctr_model(model = "gaussian", type = "AIC", do.iter = TRUE,
#' h = 0, nSample = 120, start = 50)
#' out1 <- sento_model(sentMeas, y, x = x, ctr = ctrM)
#' out2 <- sento_model(sentMeas, y, x = NULL, ctr = ctrM)
#' out3 <- sento_model(subset(sentMeas, select = "linear"), y, x = x, ctr = ctrM)
#' out4 <- sento_model(subset(sentMeas, select = "linear"), y, x = NULL, ctr = ctrM)
#'
#' lossData <- get_loss_data(models = list(m1 = out1, m2 = out2, m3 = out3, m4 = out4),
#' loss = "errorSq")
#'
#' mcs <- MCS::MCSprocedure(lossData)}
#'
#' @export
get_loss_data <- function(models, loss = c("DA", "error", "errorSq", "AD", "accuracy")) {
# check if input is consistent
stopifnot(is.list(models))
stopifnot(loss %in% c("DA", "error", "errorSq", "AD", "accuracy"))
if (length(loss) != 1) stop("The 'loss' argument should contain a single argument.")
checkClass <- sapply(models, function(m) return(!inherits(m, "sento_modelIter")))
if (any(checkClass)) stop("Not all elements of the 'models' list are sento_modelIter objects.")
modelFamilies <- unlist(lapply(models, function(m) return(m$models[[1]]$model)))
if (!(length(table(modelFamilies)) == 1)) stop("All models should come from the same family.")
mF <- as.character(modelFamilies[1])
if (!all(sapply(models, function(m) length(m$models)) == length(models[[1]][["models"]])))
stop("All models should contain the same number of iterations.")
checkGaussian <- (mF == "gaussian" & (loss %in% c("DA", "error", "errorSq", "AD")))
checkLogistic <- (mF %in% c("binomial", "multinomial") & (loss == "accuracy"))
if (!(checkGaussian | checkLogistic))
stop("The 'loss' argument is not in line with the model family.")
# extract loss data
lossData <- matrix(unlist(lapply(models, function(m) m$performance$raw[[loss]]), use.names = FALSE), ncol = length(models))
colnames(lossData) <- names(models)
if (loss %in% c("DA", "accuracy")) lossData <- abs(lossData - 1) # from accuracy to inaccuracy
if (loss == "DA") lossData <- lossData[-1, , drop = FALSE] # get rid of first NA values
if (any(duplicated(t(lossData)))) warning("Loss data across some of the models is duplicated.")
return(lossData)
}
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.