required_packages <- c("mlr3verse", "data.table", "xts", "openxlsx", "glmnet") # Add your required packages here # Function to check and install the necessary packages install_and_load <- function(packages) { for (package in packages) { # Check if the package is installed if (!require(package, character.only = TRUE)) { # Install the package if it is not installed install.packages(package, dependencies = TRUE) # Load the package after installing library(package, character.only = TRUE) } } } install_and_load(required_packages)
# Load factors and return data source(file.path('..', 'inst', 'parsers', 'Value--Devil-in-HMLs-Details.R')) data("data_ml")
colnames(data_ml) <- toupper(colnames(data_ml)) data_ml$DATE <- as.yearmon(as.Date(data_ml$DATE, format = "%Y-%m-%d")) data_ml[,-2] <- apply(data_ml[,-2],2,as.numeric) data_ml <- as.data.table(data_ml) sample_stock <- data_ml[STOCK_ID == 17, !"STOCK_ID"] factor_data <- list(HML_FF = HML_Devil.HML_FF, HML_Dev = HML_Devil.HML.DEV, ME = HML_Devil.ME_1, MKT_EX = HML_Devil.MKT, RF = HML_Devil.RF, SMB = HML_Devil.SMB, UMD = HML_Devil.UMD) factor_vars <- c("HML_FF", "HML_Dev", "ME", "MKT_EX", "RF", "SMB", "UMD") nfactor <- length(factor_data) for (f in 1:nfactor) { if (ncol(factor_data[[f]]) > 1) { factor_data[[f]] <- factor_data[[f]][, 'USA'] } factor_data[[f]] <- data.table( "FACTOR" = factor_data[[f]], "DATE" = index(factor_data[[f]]) ) colnames(factor_data[[f]]) <- c(factor_vars[f], 'DATE') } factor_data <- Reduce(function(...) { merge(..., by='DATE', all=TRUE) }, factor_data) full_data <- merge(sample_stock, factor_data, by = 'DATE', all = TRUE) |> na.omit() |> as.data.table() full_data[, `:=` ( MKT = MKT_EX + RF, RETEX = R1M_USD - (MKT_EX + RF) )] full_data <- full_data[, !("RF")] factor_vars <- factor_vars[factor_vars != "RF"]
short_data <- full_data[, .SD, .SDcols = c("DATE", "RETEX", factor_vars)] cutoff_1 <- as.yearmon("2013-12") cutoff_2 <- as.yearmon("2015-12") training_data <- short_data[DATE <= cutoff_1, ] holdout_data <- short_data[((DATE > cutoff_1) & (DATE <= cutoff_2)), ] testing_data <- short_data[DATE > cutoff_2, ]
Prevailing Means Benchmark
pm_predictions <- numeric(nrow(testing_data)) pm_accuracies <- numeric(nrow(testing_data)) for (i in 1:nrow(testing_data)) { current_test_date <- testing_data[i, DATE] # Update training data to include all data up to the current test date updated_training_data <- short_data[DATE < current_test_date] # Calculate the benchmark prediction as the mean of previous RETEX values pm_predictions[i] <- mean(updated_training_data$RETEX) # Calculate accuracy (e.g., direction prediction accuracy) pm_accuracies[i] <- (pm_predictions[i] > 0) == (testing_data[i, RETEX] > 0) } pm_acc <- mean(pm_accuracies) # Calculate prediction errors for the benchmark model pm_errors <- testing_data$RETEX - pm_predictions # Sum of squared errors for the benchmark model ss_pm <- sum(pm_errors^2, na.rm = TRUE)
Combination Forecast
# Initialize vectors to store results c_predictions <- nrow(testing_data) c_accuracies <- nrow(testing_data) # Loop over each point in the testing data for (i in 1:nrow(testing_data)) { current_test_date <- testing_data[i, DATE] # Update training data to include all data up to the current test date updated_training_data <- short_data[DATE < current_test_date] # Initialize a vector to store individual model predictions individual_predictions <- numeric(length(factor_vars)) # Loop over each covariate to train univariate models and make predictions for (j in seq_along(factor_vars)) { covariate <- factor_vars[j] formula <- as.formula(paste("RETEX ~", covariate)) # Train the univariate OLS model ols_model <- lm(formula, data = updated_training_data) # Predict the current test point current_test_point <- as.data.table(testing_data[i, ..covariate]) individual_predictions[j] <- predict(ols_model, newdata = current_test_point) } # Combine predictions by averaging them c_predictions[i] <- mean(individual_predictions) # Calculate accuracy (e.g., direction prediction accuracy) c_accuracies[i] <- (c_predictions[i] > 0) == (testing_data[i, RETEX] > 0) } # Calculate overall accuracy for the combined predictions c_acc <- mean(c_accuracies) # Model evaluation # Calculate prediction errors for the combined model c_errors <- testing_data$RETEX - c_predictions # Sum of squared errors for the combined model ss_c <- sum(c_errors^2, na.rm = TRUE) # Calculate R^2_OS r2_c <- 1 - (ss_c / ss_pm)
Elastic Net Following the methodology of Rapach and Zhou 2019
enet_predictions <- nrow(testing_data) enet_accuracies <- nrow(testing_data) for (i in 1:nrow(testing_data)) { current_test_date <- as.yearmon(testing_data[i, "DATE"]) # Update training data to include all data up to the current test date updated_training_data <- short_data[DATE < current_test_date, !("DATE")] # Train the model on the updated training data lasso <- TaskRegr$new(id = "lasso", backend = updated_training_data, target = "RETEX") lrn_lasso <- lrn("regr.cv_glmnet", alpha = 0.5) lrn_lasso$train(lasso) fitted_lasso <- lrn_lasso$model lasso_lambda_min <- fitted_lasso$lambda.min # Predict the current test point current_test_point <- testing_data[i, !c("RETEX", "DATE")] lasso_test_predict <- predict(fitted_lasso, as.matrix(current_test_point), lasso_lambda_min) # Store the prediction enet_predictions[i] <- lasso_test_predict # Calculate accuracy (e.g., direction prediction accuracy) enet_accuracies[i] <- mean((lasso_test_predict > 0) == (testing_data[i, RETEX] > 0)) } enet_acc <- mean(enet_accuracies) # Model evaluation # Calculate prediction errors for the combined model enet_errors <- testing_data$RETEX - enet_predictions # Sum of squared errors for the combined model ss_enet <- sum(enet_errors^2, na.rm = TRUE) # Calculate R^2_OS r2_enet <- 1 - (ss_enet / ss_pm)
C-ENET
combined_holdout_data <- rbind(holdout_data, testing_data) # Initialize vectors to store results cenet_predictions <- numeric(nrow(testing_data)) cenet_accuracies <- numeric(nrow(testing_data)) # Step 1: Loop over each point in the holdout and testing data for univariate OLS predictions univariate_forecasts <- matrix(NA, nrow = nrow(combined_holdout_data), ncol = length(factor_vars)) for (i in 1:nrow(combined_holdout_data)) { current_test_date <- combined_holdout_data[i, DATE] # Update training data to include all data up to the day before the current test date updated_training_data <- short_data[DATE < current_test_date] # Compute recursive univariate predictive regression forecasts for (j in seq_along(factor_vars)) { covariate <- factor_vars[j] formula <- as.formula(paste("RETEX ~", covariate)) # Train the univariate OLS model ols_model <- lm(formula, data = updated_training_data) # Predict the current test point current_test_point <- as.data.frame(combined_holdout_data[i, ..covariate]) univariate_forecasts[i, j] <- predict(ols_model, newdata = current_test_point) } } # Step 2: Loop over each point in the testing data for ENet training and combined predictions for (i in 1:nrow(testing_data)) { holdout_index <- nrow(holdout_data) + i if (holdout_index > 1) { X <- univariate_forecasts[1:(holdout_index - 1), , drop = FALSE] y <- combined_holdout_data$RETEX[1:(holdout_index - 1)] # Create a data.table for mlr3 enet_data <- as.data.table(cbind(X, y = y)) # Define the task and learner for Elastic Net task <- TaskRegr$new("elastic_net", backend = enet_data, target = "y") learner <- lrn("regr.cv_glmnet", alpha = 0.000005) # Train the Elastic Net model learner$train(task) enet_coefs <- coef(learner$model, s="lambda.min")[-1] # Identify selected predictors selected_indices <- which(enet_coefs != 0) if (length(selected_indices) > 0) { selected_forecasts <- univariate_forecasts[holdout_index, selected_indices] # Compute the C-ENet forecast cenet_predictions[i] <- mean(selected_forecasts) # Calculate accuracy (e.g., direction prediction accuracy) cenet_accuracies[i] <- (cenet_predictions[i] > 0) == (testing_data[i, RETEX] > 0) } } } # Calculate overall accuracy for the combined predictions cenet_acc <- mean(cenet_accuracies, na.rm = TRUE) # Model evaluation # Calculate prediction errors for the combined model cenet_errors <- testing_data$RETEX - cenet_predictions # Sum of squared errors for the combined model ss_cenet <- sum(cenet_errors^2, na.rm = TRUE) # Calculate R^2_OS r2_cenet <- 1 - (ss_cenet / ss_pm)
Tree
rpart_predictions <- NULL rpart_accuracies <- NULL for (i in 1:nrow(testing_data)) { current_test_date <- testing_data[i, DATE] # Update training data to include all data up to the day before the current test date updated_training_data <- short_data[DATE < current_test_date, !("DATE")] # Define the task and learner for the decision tree task <- TaskRegr$new(id = "rpart", backend = updated_training_data, target = "RETEX") learner <- lrn("regr.rpart") # Train the decision tree model learner$train(task) # Predict the current test point current_test_point <- testing_data[i, !c("DATE")] print(learner$predict_newdata(current_test_point)[["response"]]) print(learner$predict_newdata(current_test_point)) rpart_predictions[i] <- learner$predict_newdata(current_test_point)[["response"]] rpart_accuracies[i] <- mean((rpart_predictions[i] > 0) == (testing_data[i, RETEX] > 0)) } rpart_acc <- mean(rpart_accuracies) # Model evaluation # Calculate prediction errors for the combined model rpart_errors <- testing_data$RETEX - rpart_predictions # Sum of squared errors for the combined model ss_rpart <- sum(rpart_errors^2, na.rm = TRUE) # Calculate R^2_OS r2_rpart <- 1 - (ss_rpart / ss_pm)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.