# knitr chunk defaults for this document: echo all code in the rendered output.
knitr::opts_chunk$set(echo = TRUE)
This is just a document store of Jasen's progress using FRED data to make forecasts of economic data. We use lm() inside SWfore to make forecasts using lagged and un-lagged values of the x-values, as well as using a Random Forest model from the ranger package.
As of now, we are using only FRED data. Below are the periodicities of the data we have retrieved thus far. There are 3 quarterly series. We should review the methodology from the paper(s); for now I am converting everything to quarterly. Also note that 2 of the series start in 1938 and end in 1940/41. We will remove those series.
### Load libraries
library(quantmod)

### Dedicated environment to hold the downloaded FRED series
fred_data <- new.env()

### FRED symbols, grouped by the theme each one proxies
symbols <- c(
  'OECDLOLITOAASTSAM', 'ICSA', 'INDPRO', 'T10Y2Y', 'BAA10Y',  # Economic Trend
  'NFCI',                                                     # Liquidity
  'M2V', 'TOTCI', 'M0263AUSM500NNBR', 'M0264AUSM500NNBR',
  'BOGZ1FA895050005Q',                                        # Velocity
  'UMCSENT', 'CSUSHPISA', 'SPCS20RSA',                        # Confirmation Bias: Surveys
  'VIXCLS', 'VXVCLS', 'EVZCLS', 'THREEFYTP10',                # Representative Bias
  'EMVOVERALLEMV',                                            # Cognitive Dissonance
  'CFNAIMA3', 'STLENI', 'STLFSI2',                            # Economic Surprise
  'WLEMUINDXD'                                                # Geopolitics
)

### Download every series from FRED into the fred_data environment
getSymbols(Symbols = symbols, src = 'FRED', env = fred_data)

### Extract each series from the environment into a named list of xts objects
data <- eapply(env = fred_data, FUN = merge.xts)

### Summarise each series: frequency plus first/last observation dates.
### periodicity() returns a list; elements 3 and 4 are the start and end.
periodicities <- as.data.frame(do.call(rbind, lapply(data, periodicity)))
ls_periodicities <- lapply(data, periodicity)
periodicities$start <- as.Date(sapply(ls_periodicities, "[[", 3))
periodicities$end <- as.Date(sapply(ls_periodicities, "[[", 4))
periodicities  # view periodicities
### Convert every series to quarterly frequency (single value, not OHLC bars)
data_quart <- lapply(data, to.quarterly, OHLC = FALSE)

### Drop the two short historical series (start 1938, end 1940/41)
for (discard in c("M0264AUSM500NNBR", "M0263AUSM500NNBR")) {
  data_quart[[discard]] <- NULL  # remove from dataset
}
At this point, converting to quarterly gives us warning messages about missing values being removed. We will have to audit that for each symbol. For now, let's move onto a merge.
# Merge the quarterly series into one wide xts object (one column per symbol;
# quarters missing from a series become NA in that column).
xts_data_quart <- do.call(merge.xts, data_quart)
Now we should be able to make forecasts with MTS and SWfore. However, we will need to identify the most complete data first, as eigen() cannot handle NAs.
Looks like we have NAs until 2013 Q2, and the last 2 observations are also not complete cases...we will need to remove those. We can use complete.cases to build an index of complete rows.
It would appear that SWfore requires forecasting at least 2 one-step-ahead values; otherwise the matrix multiplication fails due to non-conformable arrays. If we want code that can make a single one-step forecast, we will need to write our own function. Not sure it's necessary, but boilerplate code is included in the chunk below in case we need it. The forecast horizons from S&W were 6, 12 and 24 months. From Coulombe et al. they were 1, 3, 9, 12 and 24 months... so if we want to forecast a single step ahead we will need to edit the SWfore function for our purposes.
# "SWfore" <- function(y,x,orig,m){ # ### Performs Stock and Watson's diffusion index prediction # ### y: dependent variable # ### x: observed regressors # ### orig: forecast origin # ### m: selected number of PCs # ### # ### Output: Forecasts and MSE of forecasts (if data available) # if(!is.matrix(x))x=as.matrix(x) # nT=dim(x)[1] # k=dim(x)[2] # if(orig > nT)orig=nT # if(m > k)m=k; if(m < 1)m=1 # # standardize the predictors # x1=x[1:orig,] # me=apply(x1,2,mean) # se=sqrt(apply(x1,2,var)) # x1=x # for (i in 1:k){ # x1[,i]=(x1[,i]-me[i])/se[i] # } # # # V1=cov(x1[1:orig,]) # m1=eigen(V1) # sdev=m1$values # M=m1$vectors # M1=M[,1:m] # Dindex=x1%*%M1 # y1=y[1:orig]; DF=Dindex[1:orig,] # mm=lm(y1~DF) # # coef=matrix(mm$coefficients,(m+1),1) # coef=matrix(mm$coefficients[-1],(m),1) # exclude the intercept # #cat("coefficients: ","\n") # #print(round(coef,4)) # yhat=NULL; MSE=NULL # if(orig < nT){ # newx=cbind(rep(1,(nT-orig)),Dindex[(orig+1):nT,]) # yhat=mm$coefficients[1]+(t(newx)%*%coef) # err=y[(orig+1):nT]-yhat # MSE=mean(err^2) # cat("MSE of out-of-sample forecasts: ",MSE,"\n") # } # # SWfore <- list(coef=coef,yhat=yhat,MSE=MSE,loadings=M1,DFindex=Dindex) # }
### Fit Stock & Watson diffusion-index forecasts (SWfore from MTS) on the
### complete-case portion of the quarterly panel, holding out the final 2
### observations as an out-of-sample check.
library(MTS)  # library() (not require()) so a missing package errors loudly

# Row indices with no NAs; which() on a logical vector ("== TRUE" was redundant)
idx_complete <- which(complete.cases(xts_data_quart))
trim_data <- xts_data_quart[idx_complete]

# Forecast UMCSENT from the remaining columns.
# NOTE(review): assumes UMCSENT is column 1 of trim_data — confirm, since
# eapply() does not guarantee column order.
results <- SWfore(y = trim_data$UMCSENT, x = trim_data[, -c(1)],
                  orig = length(idx_complete) - 2, m = 5)

# Actual vs forecast for the 2 held-out quarters
cbind(trim_data$UMCSENT[(length(idx_complete) - 2 + 1):length(idx_complete)],
      results$yhat)
OK, so we get SWfore() to work (it requires more than 1 forecast step), which uses lm() to make the forecast. Forecasting in SWfore() is optional: at the very least the function will fit a linear model to the entire dataset when no forecast is required. We get an MSE of 41 when predicting 2 quarters (i.e. 2 data points) ahead.
It's possible we have introduced look-ahead bias here, so we should test lagged x-values.
### Re-fit SWfore with the predictors lagged one quarter to remove look-ahead
### bias. The first row is dropped from both y and x because lagging leaves an
### NA in the first row of x.
lagged_x <- lag(trim_data[, -c(1)], 1)[-1]
results <- SWfore(y = trim_data$UMCSENT[-1], x = lagged_x,
                  orig = length(trim_data$UMCSENT) - 3, m = 5)

# Actual vs forecast for the 2 held-out quarters
n_obs <- length(trim_data$UMCSENT)
cbind(trim_data$UMCSENT[(n_obs - 2 + 1):n_obs], results$yhat)
Ok, the MSE is fugly. How does an RF model compare?
Before extending the function with additional parameters for model choice, which feels like a logical next step, we could use a modified version of SWFore() to simply build the diffusion indexes, then pass the resulting series' to a new forecasting algorithm such as a random forest model. We could compare results with lm.
The combinations of forecasting horizons and target variables can explode pretty quickly...so we will have to narrow it down to something specific. Will use a few forecast horizons and the UMCSENT target variable for starters.
Below is a snippet of the Diffusion Indexes.
"DiffIdx" <- function(x, orig, m) {
  ### Builds Stock & Watson diffusion indexes (principal-component scores).
  ### x:    observed regressors (matrix, or anything coercible to one)
  ### orig: forecast origin -- only rows 1:orig are used to estimate the
  ###       standardisation constants and the covariance/eigen decomposition,
  ###       so no information after the origin leaks into the loadings
  ### m:    number of principal components to keep (clamped to 1..ncol(x))
  ###
  ### Output: nrow(x) x m matrix of diffusion indexes, computed for ALL rows
  ### of x using loadings estimated on rows 1:orig only.
  ### (Original docstring wrongly claimed "Forecasts and MSE"; this function
  ### only builds the indexes. Dead code for yhat/MSE removed.)
  if (!is.matrix(x)) x <- as.matrix(x)
  nT <- dim(x)[1]
  k <- dim(x)[2]
  if (orig > nT) orig <- nT
  if (m > k) m <- k
  if (m < 1) m <- 1
  # Standardise every predictor with mean/sd estimated on the training window
  me <- apply(x[1:orig, , drop = FALSE], 2, mean)
  se <- sqrt(apply(x[1:orig, , drop = FALSE], 2, var))
  x1 <- sweep(sweep(x, 2, me, "-"), 2, se, "/")
  # Eigen-decompose the training-window covariance; the leading m eigenvectors
  # are the factor loadings. drop = FALSE keeps a 1-column matrix when m == 1.
  V1 <- cov(x1[1:orig, , drop = FALSE])
  M1 <- eigen(V1)$vectors[, 1:m, drop = FALSE]
  # Diffusion indexes = standardised predictors projected onto the loadings
  x1 %*% M1
}

diffIdx <- DiffIdx(trim_data[, -c(1)], orig = length(idx_complete) - 2, m = 5)
head(diffIdx)
Ok, we have our Diffusion Indexes. Now lets feed these X values together with Y values into a RF model. We will use the ranger package for this Random Forest implementation.
# Refer to https://uc-r.github.io/random_forests
library(ranger)  # library() (not require()) so a missing package errors loudly
set.seed(123)

# One canonical set of diffusion-index column names, reused for train and test
# (was previously hard-coded twice)
di_names <- c("DI_1", "DI_2", "DI_3", "DI_4", "DI_5")

# Lag x values so the index at t-1 predicts UMCSENT at t (no look-ahead)
lag_x <- lag.xts(diffIdx)

# Build x_train and y_train: drop row 1 (NA from the lag) and hold out the
# final 2 observations (train goes from 32 obs to 29: 1 lag + 2 for prediction)
x_train <- lag_x[2:(nrow(diffIdx) - 2), ]
colnames(x_train) <- di_names
y_train <- trim_data$UMCSENT[2:(nrow(trim_data) - 2)]
# Check lagged x and y values
MDI_train <- cbind(y_train, x_train)

# Test set: the last 2 observations
x_test <- lag_x[(nrow(diffIdx) - 1):nrow(diffIdx), ]
colnames(x_test) <- di_names
y_test <- trim_data$UMCSENT[(nrow(trim_data) - 1):nrow(trim_data)]
MDI_test <- cbind(x_test, y_test)
MDI_test

# Fit the random forest; mtry = 5 considers all 5 diffusion indexes per split
MDI_ranger <- ranger(
  formula = UMCSENT ~ .,
  data = MDI_train,
  num.trees = 500,
  mtry = 5
)
Before moving onto predicting out-of-sample observations, lets check the head and tail of the lagged and original un-lagged datasets...to make sure we are all in alignment...
# Sanity check: line the lagged training frame up against the un-lagged
# originals and inspect both ends of the sample to confirm row alignment.
alignment_check <- cbind(MDI_train, cbind(trim_data$UMCSENT, diffIdx))
head(alignment_check)
tail(alignment_check)
We will need to think about what hyperparameter grid search we would like to do...
Moving onto prediction...
### Out-of-sample prediction with the fitted random forest
pred_ranger <- predict(MDI_ranger, MDI_test)
head(pred_ranger$predictions)

# Forecast errors and mean squared error over the held-out quarters
err <- y_test - pred_ranger$predictions
MSE <- mean(err^2)
MSE

# Actual vs predicted, side by side
cbind(y_test, pred_ranger$predictions)
Now, RF model does better with an MSE of 84 versus 343 for lm in SWFore when using lagged data removing any potential look-ahead bias. Ignoring look-ahead bias, we know SWfore and lm() achieve an MSE of 41. How does RF do with look-ahead bias, compared with lm()?
### Look-ahead-bias experiment: feed the UN-lagged diffusion indexes to the
### random forest, so x at time t "predicts" y at the same t.
### (Note: no lag here — train drops the last 3 rows; rows nT-2 and nT-1 are
### the test set, mirroring the sample split of the lagged experiment.)
x_train <- diffIdx[1:(nrow(diffIdx) - 3), ]
colnames(x_train) <- c("DI_1", "DI_2", "DI_3", "DI_4", "DI_5")
y_train <- trim_data$UMCSENT[1:(nrow(trim_data) - 3)]

x_test <- diffIdx[(nrow(diffIdx) - 2):(nrow(diffIdx) - 1), ]
colnames(x_test) <- c("DI_1", "DI_2", "DI_3", "DI_4", "DI_5")
y_test <- trim_data$UMCSENT[(nrow(trim_data) - 2):(nrow(trim_data) - 1)]

MDI_train <- cbind(x_train, y_train)

# Same seed as the lagged experiment so the two forests are comparable
set.seed(123)
MDI_ranger <- ranger(
  formula = UMCSENT ~ .,
  data = MDI_train,
  num.trees = 500,
  mtry = 5
)

MDI_test <- cbind(x_test, y_test)
pred_ranger <- predict(MDI_ranger, MDI_test)
head(pred_ranger$predictions)

# Mean squared error over the 2 predicted quarters
err <- y_test - pred_ranger$predictions
MSE <- mean(err^2)
MSE

# Actual vs predicted, side by side
cbind(y_test, pred_ranger$predictions)
Well, RF does worse when left un-adjusted for look-ahead bias whereas lm() does better...but still worse than RF in both cases.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.