
torchtabular


A package for training transformer models on tabular datasets in R, implementing SAINT and TabTransformer model variants on top of {torch}.

Installation

You can install torchtabular from GitHub with:

# install.packages("devtools")
devtools::install_github("cmcmaster1/torchtabular")
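torchtabular builds on {torch}, which downloads its backend libraries (LibTorch) the first time it is used. If you have not used {torch} before, you can optionally trigger that step explicitly:

# one-off download of the torch backend (only needed on first use)
torch::install_torch()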

Example

library(torchtabular)
library(tidymodels)
library(tidyverse)
library(torch)
library(luz)
library(madgrad)

Set seeds

torch_manual_seed(seed = 154)
set.seed(154)

Check for GPU and assign device

device <- ifelse(cuda_is_available(), 'cuda', 'cpu')

Load data

The blastchar dataset (the Telco customer churn data) is included with the package.

data('blastchar')
glimpse(blastchar)

Prepare data

First we convert the target variable to a 0/1 numeric, and convert the character columns (along with SeniorCitizen) to factors so that our tabular dataset will identify them as categorical predictors.

blastchar <- blastchar %>%
  select(-customerID) %>% 
  mutate(across(c(where(is.character), SeniorCitizen), as_factor),
         Churn = as.numeric(Churn) - 1)

glimpse(blastchar)

We can now split the data into training and validation sets.

split <- initial_split(blastchar)
train <- training(split)
valid <- testing(split)
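If you would rather keep the churn rate similar in both sets, initial_split() also supports stratified sampling. This is an optional variation (prop = 3/4 is the default):

# stratify on the outcome so both sets keep a similar class balance
split <- initial_split(blastchar, prop = 3/4, strata = Churn)
train <- training(split)
valid <- testing(split)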

If we create a recipe, the tabular_dataset function can automatically recognise categorical predictors (these must be factors) and continuous predictors.

recipe <- recipe(blastchar, Churn ~ .) %>%
  step_scale(all_numeric_predictors()) %>%
  step_integer(all_nominal_predictors()) %>% 
  step_impute_linear(all_predictors())
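To see exactly what the model will receive, you can prep and bake the recipe on a few rows. This is just a sanity check using standard {recipes} functions, not something tabular_dataset requires:

# preview the processed predictors: scaled numerics, integer-encoded nominals
recipe %>%
  prep(training = train) %>%
  bake(new_data = head(train)) %>%
  glimpse()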

We can then pass this recipe to tabular_dataset with the relevant split.

train_dset <- tabular_dataset(recipe, train)
valid_dset <- tabular_dataset(recipe, valid)
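The dataset objects carry the predictor information the model needs; presumably categories holds the cardinality of each categorical predictor and num_continuous the number of continuous predictors. We pass both to the model below.

train_dset$categories      # assumed: number of levels per categorical predictor
train_dset$num_continuous  # assumed: count of continuous predictors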

Finally, we make dataloaders for the training and validation sets.

train_dl <- dataloader(train_dset,
                       batch_size = 64,
                       shuffle = TRUE)

valid_dl <- dataloader(valid_dset,
                       batch_size = 1024,
                       shuffle = FALSE)
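To check the batch structure, you can pull a single batch from the dataloader using {torch}'s iterator helpers:

# grab one batch and inspect its components
batch <- train_dl %>%
  dataloader_make_iter() %>%
  dataloader_next()
str(batch)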

Now we define our model:

n_epochs <- 5

model_setup <- tabtransformer %>%
  setup(
    loss = nn_bce_with_logits_loss(),
    optimizer = madgrad::optim_madgrad,
    metrics = list(
      luz_metric_binary_auroc(from_logits = TRUE),
      luz_metric_binary_accuracy_with_logits()
    )
  ) %>%
  set_hparams(categories = train_dset$categories,
              num_continuous = train_dset$num_continuous,
              dim_out = 1,
              attention = "both",
              attention_type = "signed",
              is_first = TRUE,
              dim = 32,
              depth = 1,
              heads_selfattn = 32,
              heads_intersample = 32,
              dim_heads_selfattn = 16,
              dim_heads_intersample = 64,
              attn_dropout = 0.1,
              ff_dropout = 0.8,
              embedding_dropout = 0.0,
              mlp_dropout = 0.0,
              mlp_hidden_mult = c(4, 2),
              softmax_mod = 1.0,
              is_softmax_mod = 1.0,
              skip = FALSE,
              device = device) %>% 
  set_opt_hparams(lr = 2e-3) 

And train...

fitted <- model_setup %>% 
  fit(train_dl,
      epochs = n_epochs,
      valid_data = valid_dl,
      verbose = TRUE)
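{luz} stores the per-epoch metrics on the fitted object, so you can review training after the fact:

metrics <- get_metrics(fitted)  # one row per epoch/metric, for both train and valid sets
tail(metrics)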

Because the model uses intersample attention, predictions depend on the other rows in the batch, so we predict on large batches that combine the validation and training data:

full_dset <- tabular_dataset(recipe, bind_rows(valid, train))
predict_bs <- 4000
preds <- predict(fitted, 
                 full_dset, 
                 dataloader_options = list(batch_size = predict_bs))$squeeze(-1)

preds <- as_array(preds)[1:nrow(valid)]  # validation rows come first in full_dset
truth <- factor(ifelse(valid$Churn == 1, "Yes", "No"), levels = c("No", "Yes"))

roc_auc_vec(truth = truth, estimate = preds, event_level = "second")
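The model outputs logits, so for threshold-based metrics we first convert them to probabilities. A small follow-up using {yardstick}; the 0.5 cutoff is an arbitrary choice:

probs <- 1 / (1 + exp(-preds))  # sigmoid: logits -> probabilities
pred_class <- factor(ifelse(probs > 0.5, "Yes", "No"), levels = levels(truth))
accuracy_vec(truth = truth, estimate = pred_class)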

