# Build and train deep neural nets.
# https://keras.rstudio.com/index.html
library(keras)
# Custom data preprocessing procedures.
# https://tidymodels.github.io/recipes/
library(recipes)
# Data resampling. We will use it to split the Numerai dataset
# into training and test sets for our deep learning models.
# https://tidymodels.github.io/rsample
library(rsample)
# Multiple packages that support clean code and tidy data.
# https://tidyverse.tidyverse.org/
library(tidyverse)
# Tidy methods to measure model performance.
# We will use it to compute accuracy on the testing set.
# https://tidymodels.github.io/yardstick
library(yardstick)
# R interface to TensorFlow, the backend used by keras.
library(tensorflow)
# Fast delimited-file reader used to load the Numerai data.
# (The duplicate library(tidyverse) call that followed has been removed.)
library(vroom)

# NOTE(review): `params` is not defined anywhere in this file — presumably
# this script is knitted as an R Markdown document with an `nmr_round`
# parameter; confirm before running stand-alone.
nmr_round <- params$nmr_round
# Per-round metadata; its `prop` column (for data_type == "train") drives
# both the train/test split and the validation split below.
nmr_training_metadata <- vroom(
  paste0("numerai_datasets/", nmr_round, "/nmr_training_metadata")
)
The purpose of this project is to explore the targets package. To do so, we will follow the targets tutorial and implement similar functions on a different dataset: the Numerai dataset.
split_data()
For machine learning, we need to split the financial assets (rows) into a training dataset and a testing dataset.
#' Split the Numerai data into training and testing sets.
#'
#' @param vroom_file Path to the delimited Numerai data file.
#' @return An rsplit object (rsample package) holding the train/test
#'   partition of the financial assets (rows).
split_data <- function(vroom_file) {
  # The split proportion comes from the round metadata: the fraction of
  # rows whose data_type is "train".
  train_prop <- nmr_training_metadata %>%
    filter(data_type == "train") %>%
    pull(prop)
  # data_type and era are bookkeeping columns, not model inputs.
  vroom::vroom(vroom_file, col_select = c(-data_type, -era)) %>%
    initial_time_split(prop = train_prop) # from the rsample package
}
Try out the function.
# Try out the function on this round's data file.
nmr_data <- split_data(paste0("numerai_datasets/", nmr_round, "/nmr_data"))
The training set has 501808 financial assets (rows) and the testing set has 137779 rows.
# Inspect the train/test split object.
print(nmr_data)
Functions from rsample
can recover the training and testing sets.
The dataset has 312 variables (columns).
# Recover and peek at the training and testing partitions of the split.
glimpse(training(nmr_data))
glimpse(testing(nmr_data))
target
is our response variable, and id
identifies the financial asset.
# `target` is the response variable; `id` identifies the financial asset.
training(nmr_data) %>% select(id, target)
The rest of the variables are covariates: the features describe various quantitative attributes of the stock at a given point in time.
prepare_recipe()
prepare_recipe()
gets the data ready for the models. It accepts a dataset with a train/test split and returns a recipe object generated by the recipes
package.
#' Prepare the preprocessing recipe for the models.
#'
#' @param nmr_data An rsplit object from split_data().
#' @return A prepped recipe object (recipes package) trained on the
#'   training partition only.
prepare_recipe <- function(nmr_data) {
  nmr_data %>%
    # Just preprocess the training data.
    training() %>%
    # Start defining a new recipe: target is the response, everything
    # else is a predictor.
    recipe(target ~ .) %>%
    # Remove the id variable from the data — it identifies the asset and
    # is not a predictor. (The original tutorial comment said
    # "customerID", which does not exist in this dataset.)
    step_rm(id) %>%
    # Run the recipe on the data.
    prep()
}
Let's try out the function to make sure it works.
# Try out the function to make sure it works.
nmr_recipe <- prepare_recipe(nmr_data)
print(nmr_recipe)
Later on, we will need to retrieve the preprocessed training data with juice()
.
# Retrieve the preprocessed training outcome column.
juice(nmr_recipe, all_outcomes())
# Retrieve the preprocessed training predictor columns.
juice(nmr_recipe, all_predictors())
Keras will want our predictors to be in matrix form.
# Keras wants the predictors in matrix form; preview the top-left corner.
juice(nmr_recipe, all_predictors(), composition = "matrix")[1:6, 1:4]
When we compute accuracy later on, we will use bake()
to preprocess the testing data.
# Apply the trained recipe to the held-out testing data.
bake(nmr_recipe, testing(nmr_data))
define_model()
Before we fit a model, we need to define it. The define_model()
function encapsulates our Keras model definition. It serves as custom shorthand that will make our other functions easier to read.
#' Build (but do not compile or train) the Keras model.
#'
#' @param nmr_recipe Prepped recipe; used only to infer the input width.
#' @param units1,units2 Sizes of the two hidden dense layers.
#' @param act1,act2,act3 Activations for hidden layer 1, hidden layer 2,
#'   and the single-unit output layer.
#' @return A keras sequential model definition.
define_model <- function(nmr_recipe, units1, units2, act1, act2, act3) {
  # The number of predictor columns determines the input layer width.
  n_features <- ncol(
    juice(nmr_recipe, all_predictors(), composition = "matrix")
  )
  model <- keras_model_sequential()
  model %>%
    layer_dense(
      units = units1,
      kernel_initializer = "uniform",
      activation = act1,
      input_shape = n_features
    ) %>%
    layer_dropout(rate = 0.1) %>%
    layer_dense(
      units = units2,
      kernel_initializer = "uniform",
      activation = act2
    ) %>%
    layer_dropout(rate = 0.1) %>%
    # Single output unit; act3 is expected to be a sigmoid-style
    # activation for this setup.
    layer_dense(
      units = 1,
      kernel_initializer = "uniform",
      activation = act3
    )
  model
}
Let's check if it returns the model definition we expect.
# Check that the model definition looks as expected.
define_model(nmr_recipe, 16, 16, "relu", "relu", "sigmoid") %>% print()
train_model()
Next, we need to fit a model and return the fitted model object.
#' Compile and fit the Keras model on the preprocessed training data.
#'
#' @param nmr_recipe Prepped recipe from prepare_recipe().
#' @param units1,units2,act1,act2,act3 Passed through to define_model().
#' @return The trained keras model object.
train_model <- function(
  nmr_recipe,
  units1 = 16,
  units2 = 16,
  act1 = "relu",
  act2 = "relu",
  act3 = "sigmoid"
) {
  model <- define_model(nmr_recipe, units1, units2, act1, act2, act3)
  # BUG FIX: the network ends in a single sigmoid unit, so the loss must
  # be binary_crossentropy. categorical_crossentropy expects a one-hot
  # multi-class output and degenerates with one output unit.
  # compile() modifies the model in place.
  compile(
    model,
    optimizer = "adam",
    loss = "binary_crossentropy",
    metrics = c("accuracy")
  )
  x_train_tbl <- juice(
    nmr_recipe,
    all_predictors(),
    composition = "matrix"
  )
  y_train_vec <- juice(nmr_recipe, all_outcomes()) %>% pull()
  # NOTE(review): validation_split reuses the *training* proportion from
  # the round metadata, which holds out that fraction of the training
  # data from fitting — confirm this large split is intended.
  fit(
    object = model,
    x = x_train_tbl,
    y = y_train_vec,
    batch_size = 32,
    epochs = 32,
    validation_split = nmr_training_metadata %>%
      filter(data_type == "train") %>%
      pull(prop),
    verbose = 0
  )
  model
}
Try it out.
# Try out the training function.
model <- train_model(nmr_recipe)
print(model)
test_accuracy()
This function takes the model object from train_model()
and computes the accuracy on the testing data.
#' Compute classification accuracy on the testing data.
#'
#' @param nmr_data An rsplit object from split_data().
#' @param nmr_recipe Prepped recipe from prepare_recipe().
#' @param nmr_model Trained keras model from train_model().
#' @return Accuracy (a single numeric value) on the testing set.
test_accuracy <- function(nmr_data, nmr_recipe, nmr_model) {
  # Preprocess the held-out testing data with the trained recipe.
  testing_data <- bake(nmr_recipe, testing(nmr_data))
  x_test_tbl <- testing_data %>%
    select(-target) %>%
    as.matrix()
  y_test_vec <- testing_data %>%
    select(target) %>%
    pull()
  # FIX: predict_classes() and predict_proba() were removed from Keras
  # (TensorFlow >= 2.6). Use predict() and threshold the sigmoid output
  # at 0.5 to recover the predicted class.
  yhat_keras_prob_vec <- nmr_model %>%
    predict(x_test_tbl) %>%
    as.vector()
  yhat_keras_class_vec <- factor(
    ifelse(yhat_keras_prob_vec > 0.5, "yes", "no"),
    levels = c("no", "yes")
  )
  # NOTE(review): this assumes `target` is binary 0/1; Numerai targets
  # can take intermediate values — confirm upstream encoding.
  test_truth <- y_test_vec %>%
    as.factor() %>%
    fct_recode(yes = "1", no = "0")
  estimates_keras_tbl <- tibble(
    truth = test_truth,
    estimate = yhat_keras_class_vec,
    class_prob = yhat_keras_prob_vec
  )
  # Pull the accuracy metric out of the confusion-matrix summary.
  estimates_keras_tbl %>%
    conf_mat(truth, estimate) %>%
    summary() %>%
    filter(.metric == "accuracy") %>%
    pull(.estimate)
}
Try it out.
# Compute accuracy of the trained model on the testing set.
test_accuracy(nmr_data, nmr_recipe, model)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.