# Build and train deep neural nets.
# https://keras.rstudio.com/index.html
library(keras)

# Custom data preprocessing procedures.
# https://tidymodels.github.io/recipes/
library(recipes)

# Data resampling. We will use it to split the customer churn dataset
# into training and test sets for our deep learning models.
# https://tidymodels.github.io/rsample
library(rsample)

# Multiple packages that support clean code and tidy data.
# https://tidyverse.tidyverse.org/
library(tidyverse)

# Tidy methods to measure model performance.
# We will use it to compute accuracy on the testing set.
# https://tidymodels.github.io/yardstick
library(yardstick)

library(tensorflow)

library(tidyverse)
library(vroom)

# Numerai round number, read from `params` (presumably the R Markdown
# `params` list of this document -- TODO confirm).
nmr_round <- params$nmr_round
# Per-round metadata table; later code reads its `prop` column (for the
# row where data_type == "train") as the train/test split proportion.
nmr_training_metadata <- vroom(paste0("numerai_datasets/", nmr_round, "/nmr_training_metadata"))

Introduction

The purpose of this project is to discover the targets package. To do so, we will follow the targets tutorial and try to implement similar functions on a different dataset: the Numerai dataset.

split_data()

For machine learning, we need to split the financial assets (rows) into a training dataset and a testing dataset.

# Split the Numerai data into training and testing sets.
#
# vroom_file: path to a data file readable by vroom::vroom().
# train_prop: fraction of rows assigned to the training set. Defaults to
#   the proportion recorded in the global `nmr_training_metadata` table
#   (row where data_type == "train"), which reproduces the original
#   behaviour while removing the hard-wired global lookup from the body.
#
# The data_type and era columns are dropped on read because they are not
# model features. Returns an rsample split object; recover the pieces
# with training() and testing().
split_data <- function(vroom_file,
                       train_prop = nmr_training_metadata %>%
                         filter(data_type == "train") %>%
                         pull(prop)) {
  vroom::vroom(vroom_file, col_select = c(-data_type, -era)) %>%
    initial_time_split(prop = train_prop) # from the rsample package
}

Try out the function.

nmr_data <- split_data(paste0("numerai_datasets/", nmr_round, "/nmr_data"))

The training set has 501808 financial assets (rows) and the testing set has 137779 rows.

print(nmr_data)

Functions from rsample can recover the training and testing sets.

The dataset has 312 variables (columns).

# Inspect the columns of the training and testing portions of the split.
nmr_data %>%
  training() %>%
  glimpse()
nmr_data %>%
  testing() %>%
  glimpse()

target is our response variable, and id identifies the financial asset.

# Peek at the identifier and response columns of the training set.
nmr_data %>%
  training() %>%
  select(id, target)

The rest of the variables are covariates: the features describe the various quantitative attributes of the stock at that point in time.

prepare_recipe()

prepare_recipe() gets the data ready for the models. It accepts a dataset with a train/test split and returns a recipe object generated by the recipes package.

# Build and estimate a preprocessing recipe on the training portion of
# the split. Returns a prepped recipes::recipe object.
prepare_recipe <- function(nmr_data) {
  # Only the training data may be used to estimate preprocessing steps.
  train_set <- training(nmr_data)
  # target is the outcome; every other column starts as a predictor.
  recipe(target ~ ., data = train_set) %>%
    # Drop the asset identifier -- it is not a model feature.
    step_rm(id) %>%
    # Estimate the recipe on the training data.
    prep()
}

Let's try out the function to make sure it works.

# Build the recipe on the training data and display it.
nmr_recipe <- prepare_recipe(nmr_data)
print(nmr_recipe)

Later on, we will need to retrieve the preprocessed training data with juice().

# Extract the preprocessed training data: outcome column, then predictors.
juice(nmr_recipe, all_outcomes())
juice(nmr_recipe, all_predictors())

Keras will want our predictors to be in matrix form.

juice(nmr_recipe, all_predictors(), composition = "matrix")[1:6, 1:4]

When we compute accuracy later on, we will use bake() to preprocess the testing data.

bake(nmr_recipe, testing(nmr_data))

define_model()

Before we fit a model, we need to define it. The define_model() function encapsulates our Keras model definition. It serves as custom shorthand that will make our other functions easier to read.

# Define (but do not compile or fit) the sequential Keras network.
#
# nmr_recipe: prepped recipe; its predictor count sets the input width.
# units1, units2: widths of the two hidden dense layers.
# act1, act2: activations of the hidden layers.
# act3: activation of the single-unit output layer (e.g. "sigmoid").
# rate1, rate2: dropout rates applied after each hidden layer. They
#   default to 0.1, the value that was previously hard-coded, so
#   existing callers are unaffected.
#
# Returns an uncompiled keras model object.
define_model <- function(nmr_recipe, units1, units2, act1, act2, act3,
                         rate1 = 0.1, rate2 = 0.1) {
  # Number of predictors after preprocessing = width of the input layer.
  input_shape <- ncol(
    juice(nmr_recipe, all_predictors(), composition = "matrix")
  )
  out <- keras_model_sequential() %>%
    layer_dense(
      units = units1,
      kernel_initializer = "uniform",
      activation = act1,
      input_shape = input_shape
    ) %>%
    layer_dropout(rate = rate1) %>%
    layer_dense(
      units = units2,
      kernel_initializer = "uniform",
      activation = act2
    ) %>%
    layer_dropout(rate = rate2) %>%
    layer_dense(
      units = 1,
      kernel_initializer = "uniform",
      activation = act3
    )
  out
}

Let's check if it returns the model definition we expect.

# Sanity-check the architecture returned by define_model().
print(define_model(nmr_recipe, 16, 16, "relu", "relu", "sigmoid"))

train_model()

Next, we need to fit a model and return the fitted model object.

# Compile and fit a Keras model on the preprocessed training data.
#
# nmr_recipe: prepped recipe from prepare_recipe().
# units1, units2: hidden-layer widths passed to define_model().
# act1, act2: hidden-layer activations; act3: output activation.
#
# Returns the fitted keras model (compile() and fit() modify the model
# object in place in the R keras API).
train_model <- function(
  nmr_recipe,
  units1 = 16,
  units2 = 16,
  act1 = "relu",
  act2 = "relu",
  act3 = "sigmoid"
) {
  model <- define_model(nmr_recipe, units1, units2, act1, act2, act3)
  compile(
    model,
    optimizer = "adam",
    # The network ends in a single sigmoid unit and the labels are one
    # 0/1 vector, so the correct loss is binary crossentropy.
    # categorical_crossentropy over a single output is degenerate
    # (always zero gradient signal), so it was a bug here.
    loss = "binary_crossentropy",
    metrics = c("accuracy")
  )
  # Predictors as a matrix, as Keras expects.
  x_train_tbl <- juice(
    nmr_recipe,
    all_predictors(),
    composition = "matrix"
  )
  # Outcome as a plain vector.
  y_train_vec <- juice(nmr_recipe, all_outcomes()) %>%
    pull()
  fit(
    object = model,
    x = x_train_tbl,
    y = y_train_vec,
    batch_size = 32,
    epochs = 32,
    # NOTE(review): this reuses the train/test split proportion as the
    # validation fraction, which looks accidental -- confirm the
    # intended validation_split value.
    validation_split = nmr_training_metadata %>% filter(data_type == "train") %>% pull(prop),
    verbose = 0
  )
  model
}

Try it out.

# Fit the model with default hyperparameters and inspect it.
model <- train_model(nmr_recipe)

print(model)

test_accuracy()

This function takes the model object from train_model() and computes the accuracy on the testing data.

# Compute classification accuracy of a fitted model on the testing set.
#
# nmr_data: rsample split from split_data().
# nmr_recipe: prepped recipe from prepare_recipe().
# nmr_model: fitted keras model from train_model().
#
# Returns the accuracy as a single numeric value.
test_accuracy <- function(nmr_data, nmr_recipe, nmr_model) {
  # Preprocess the held-out data with the recipe estimated on training data.
  testing_data <- bake(nmr_recipe, testing(nmr_data))
  x_test_tbl <- testing_data %>%
    select(-target) %>%
    as.matrix()
  y_test_vec <- testing_data %>%
    select(target) %>%
    pull()
  # predict_classes()/predict_proba() were removed from the keras R
  # package (TensorFlow >= 2.6). predict() returns the sigmoid
  # probabilities; thresholding at 0.5 reproduces the old
  # predict_classes() behaviour for a single sigmoid output.
  yhat_keras_prob_vec <- nmr_model %>%
    predict(x_test_tbl) %>%
    as.vector()
  yhat_keras_class_vec <- ifelse(yhat_keras_prob_vec > 0.5, "1", "0") %>%
    as.factor() %>%
    fct_recode(yes = "1", no = "0")
  test_truth <- y_test_vec %>%
    as.factor() %>%
    fct_recode(yes = "1", no = "0")
  estimates_keras_tbl <- tibble(
    truth = test_truth,
    estimate = yhat_keras_class_vec,
    class_prob = yhat_keras_prob_vec
  )
  # Accuracy extracted from the confusion-matrix summary (yardstick).
  estimates_keras_tbl %>%
    conf_mat(truth, estimate) %>%
    summary() %>%
    filter(.metric == "accuracy") %>%
    pull(.estimate)
}

Try it out.

test_accuracy(nmr_data, nmr_recipe, model)


kenshuri/targets-tuto documentation built on April 19, 2021, 9:58 a.m.