####################################################################################
# FILE wfo_grow_forest.R
#
#
# NEXT STEPS: Test wfo_grow_forest with a basic testthat, and
# make sure it has basic tests for the wfo_subsets, while
# ensuring there is no lookahead bias.
# Also, add the adjusted (no lookahead) equity curve code to the function
# and return list.
####################################################################################
# FUNCTION wfo_grow_forest
#
#' Performs walk-forward optimization using the random forest algorithm
#'
#' This function builds a random forest model at each optimization point in time for
#' a given set of parameters. It is single-threaded and builds a single model at each
#' point in time. To run multiple models at each point in time using multiple CPUs,
#' use function wfo_grow_jungle.
#'
#'
#' @param featuremat The xts matrix previously generated using function `make_featuremat`.
#' It should contain the target y in column 1, and all the predictors
#' (features) in the other columns. It is a daily xts matrix.
#'
#' @param modelwindow Size in days for the rolling modeling window. Each model is built
#' at the dates specified by wfo_span and uses a data window of
#' this size that includes the wfo_span date.
#'
#' @param wfo_span Specifies the points in time when new models are optimized for
#' walk-forward optimization. This uses the endpoints function to
#' extract the dates from the featuremat index. Valid values include
#' 'days', 'weeks', 'months' and 'quarters'. If 'days' is specified,
#' then function endpoints is not used since the data already has
#' daily granularity. Default is "months".
#'
#' @param wfo_subset Used to specify how to run over a subset of featuremat dates.
#' Normally used by \strong{wfo_grow_jungle} to spawn multiple
#' parallel jobs, each covering a subset of dates. The dates should
#' be specified as an xts timeframe as follows:
#' \strong{<start date>/<end date>}. In order to ensure no date is
#' skipped, both start date and end date must align on a wfo_span
#' date. In addition, the start date for a subsequent job should be
#' identical to the end date of the previous job. This would correspond
#' to a prediction date in the previous job at <end date>, whereas it
#' would correspond to a model training date for the subsequent job
#' at <start date>. The default is NA which means no subsetting is
#' performed.
#'
#' @param SPwindow The rolling window size in days used to calculate the SP_score. See
#' below for details on the SPscore.
#'
#' @param PQwindow The rolling window size in days used to calculate the PQ_score.
#' See below for details on how the PQ_score is calculated.
#' NOTE: this argument is documented here but is not currently part
#' of the function signature.
#'
#' @param jobname If specified, then the results are stored in a list with a single
#' item in it, with the jobname as its name. This is normally used
#' when multiple parallel calls are made to wfo_grow_forest as it
#' provides a simple method to combine all results. See Value
#' section below for details.
#'
#' @param mtry Parameter mtry passed on to randomForest function. This is the
#' number of features randomly selected for each tree. Default is 2.
#'
#' @param ntree Parameter ntree passed on to randomForest function. This is the
#' number of trees generated for each model. Default is 1000.
#'
#' @param importance Parameter importance passed on to randomForest function. This is a
#' logical specifying whether to output the variable importance.
#' Default is TRUE.
#'
#' @param na.action Parameter passed on to randomForest to determine what to do with
#' NAs. Default is na.omit.
#'
#' @param ... Additional parameters passed on to randomForest function.
#'
#'
#' @return If jobname is specified, then the function wraps all list items below into
#' a single element list, where the element is named by argument jobname.
#' Otherwise, it skips the wrapping and simply returns a list with the following items:
#'
#' \describe{
#' \item{\preformatted{$pred}}{
#' A daily xts matrix containing the prediction information. The index represent the prediction
#' date. The matrix has the following columns:
#' \itemize{
#' \item
#' \strong{y } The target value against which each model is being trained, as
#' specified by column y in argument `featuremat`. For example,
#' this could be a future return for a certain period, properly lagged
#' in time.
#' \item
#' \strong{ypred } The value predicted by the model trained using the most recent
#' date up to but not including the current period.
#' \item
#' \strong{samesign } This is a logical series comparing the signs between y and ypred
#' to see if they are the same. It is useful to convert the
#' random forest regression values (ypred) to a classification model.
#'
#' \item
#' \strong{SP_score } The Stability of Prediction score expressed as the number of daily
#' prediction sign changes over a rolling window of size `SPwindow`. A
#' stable prediction should have a small SPscore such as < 2 or 3 for a
#' reasonably sized window. An unstable model will tend to oscillate and
#' should be viewed as indecisive.
#' \item
#' \strong{PQ_score} The Prediction Quality score expressed as the number of daily predictions
#' that were of the same sign over a rolling window of size `PQwindow`.
#' The PQ_score is expressed as the ratio of accurate predictions over
#' the total number of predictions (PQwindow). Therefore if all predictions
#' were correct over a given PQwindow, then PQ_score = 1. A PQ_score
#' that is meaningfully below 1 should be viewed as bad because it means
#' the most recent models don't predict very well.
#'
#'
#' }
#'
#' }
#'
#' \item{\preformatted{$wfo_dates}}{
#' \strong{A vector of time indices} containing the wfo optimization dates. In other words,
#' these are the dates at which a new model was optimized.
#' }
#'
#'
#' }
#'
#' @export
wfo_grow_forest2 <- function(featuremat, modelwindow = 252, wfo_span = "months",
                             wfo_subset = NA, SPwindow = 63, jobname = NA,
                             mtry = 2, ntree = 1000, importance = TRUE,
                             na.action = na.omit, ...) {
  # Walk-forward optimization with a random forest: at every wfo_span date
  # (except the last one) a model is trained on the trailing `modelwindow`
  # rows, then used to predict every row up to and including the next
  # wfo_span date.
  #
  # Returns list(pred = <xts with columns y and ypred>, wfo_dates = <index
  # values of the wfo dates>), wrapped in a one-element list named `jobname`
  # when `jobname` is supplied.
  #
  # NOTE(review): SPwindow is kept for interface compatibility but is not used
  # in this body; the samesign / SP_score / PQ_score columns described in the
  # roxygen header are not computed here.

  #--------------------------------------------------------
  # Drop incomplete rows, then extract the wfo_span dates
  #--------------------------------------------------------
  featuremat <- featuremat[complete.cases(featuremat), ]

  # endpoints() yields row positions preceded by a 0; drop that 0. The most
  # recent endpoint is kept on purpose: no model is trained on it, it only
  # closes the last prediction interval.
  wfo_dates <- endpoints(featuremat, on = wfo_span)[-1]

  # Keep only the dates that have a full modelwindow of history behind them.
  wfo_dates <- wfo_dates[wfo_dates - modelwindow + 1 > 0]

  #--------------------------------------------------------
  # Optional: restrict the run to the wfo_subset timeframe
  #--------------------------------------------------------
  if (!is.na(wfo_subset)) {
    sub_dates <- as.Date(strsplit(wfo_subset, "/", fixed = TRUE)[[1]])
    wfo_index <- index(featuremat)[wfo_dates]

    # Both ends of the subset must fall exactly on a wfo_span date.
    aligned <- length(sub_dates) == 2L && !anyNA(sub_dates) &&
      all(sub_dates %in% wfo_index)
    if (!aligned) {
      stop("wfo_grow_forest: Arguments wfo_subset dates don't align with wfo_span dates.")
    }
    if (sub_dates[1] >= sub_dates[2]) {
      stop("wfo_grow_forest: wfo_subset start date must be before its end date.")
    }

    # Keep the wfo dates inside [start, end] and cut featuremat at the end
    # date. Rows before the start date are retained because the first model
    # of the subset still needs its trailing training window.
    in_subset <- wfo_index >= sub_dates[1] & wfo_index <= sub_dates[2]
    wfo_dates <- wfo_dates[in_subset]
    featuremat <- featuremat[seq_len(max(wfo_dates)), ]
  }

  # One date to train on plus one to close the prediction interval is the
  # minimum; anything less cannot produce a prediction.
  n_dates <- length(wfo_dates)
  if (n_dates < 2L) {
    stop("wfo_grow_forest: need at least two wfo_span dates; ",
         "check modelwindow, wfo_span and wfo_subset.")
  }
  N <- nrow(featuremat)

  # Ensure the featuremat column names are valid before they are used in the
  # model formula.
  colnames(featuremat) <- make.names(colnames(featuremat), unique = TRUE)

  #---------------------------------------------------------
  # Set up the results xts matrix: target y for every row
  # after the first model date, plus an empty ypred column
  #---------------------------------------------------------
  res <- featuremat[(wfo_dates[1] + 1):N, 1, drop = FALSE]
  res$ypred <- NA

  #---------------------------------------------------------
  # Train at wfo_dates[i], predict up to wfo_dates[i + 1].
  # The last wfo date is never a training date, so the loop
  # stops one short of it.
  #---------------------------------------------------------
  for (i in seq_len(n_dates - 1L)) {
    # Trailing window of modelwindow rows ending on the model date.
    train_rows <- (wfo_dates[i] - modelwindow + 1):wfo_dates[i]
    datatrain <- featuremat[train_rows, ]

    sprint("Building model on: %s, Number of features: %s",
           index(featuremat[wfo_dates[i], ]), ncol(datatrain) - 1)
    rf <- randomForest(y ~ ., data = datatrain, mtry = mtry, ntree = ntree,
                       importance = importance, na.action = na.action, ...)

    # Predict the rows strictly after the model date, up to and including
    # the next wfo date.
    interval <- (wfo_dates[i] + 1):wfo_dates[i + 1]
    datapred <- featuremat[interval, -1] # remove y
    # res starts at row wfo_dates[1] + 1 of featuremat, hence the offset.
    res$ypred[interval - wfo_dates[1]] <- predict(rf, datapred)
  }

  #---------------------------------------------------------
  # Build the results list to return
  #---------------------------------------------------------
  res2 <- list(pred = res,
               wfo_dates = index(featuremat)[wfo_dates])

  #---------------------------------------------------------
  # If jobname is specified, wrap the results list in a
  # one-element list named by jobname.
  #---------------------------------------------------------
  if (is.na(jobname)) {
    return(res2)
  }
  results <- list(res2)
  names(results) <- jobname
  results
}
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.