This example is an adaptation of the 'Training a causal language model from scratch' class from the Hugging Face NLP course.
library(torch) library(tok) library(luz) library(minhub) # remotes::install_github("mlverse/minhub") #library(tidyverse) options(arrow.skip_nul = TRUE) library(arrow)
First step is to implement a torch dataset that gathers data and pre-process it into a format that is suitable for training the model.
That means that we need to:
We are going to use 2 datasets available in Hugging Face Hub. The first contain all R packages source code available on CRAN. The second contains all R code that is available in GitHub data dumps. Both datasets are in the Parquet format. Following we implement a function that downloads and caches the data and then returns a single arrow table containing all data.
read_dataset <- function(source) { d <- source |> hfhub::hub_snapshot(repo_type = "dataset", allow_patterns = "parquet$") |> fs::path("data/r") |> arrow::open_dataset() |> dplyr::filter(stringr::str_detect(path, ".*\\.[rR]$")) |> dplyr::select(content) |> dplyr::mutate(content = arrow::cast(content, arrow::string())) |> dplyr::filter(!is.na(content)) |> dplyr::collect() %>% # the dataset contains invalid utf8 characters... # we need to remove them, otherwise we get an error from tokenizers dplyr::filter(utf8::utf8_valid(content)) } read_datasets <- function() { dplyr::bind_rows( read_dataset("dfalbel/cran-packages"), read_dataset("dfalbel/github-r-repos") ) }
Next we implement a function that trains a tokenizer for our dataset.
create_tokenizer <- function(text, vocab_size, special_tokens) { tok <- tok::tokenizer$new(tok::model_bpe$new()) tok$pre_tokenizer <- tok::pre_tokenizer_byte_level$new(add_prefix_space = FALSE) tok$decoder <- tok::decoder_byte_level$new() tok$post_processor <- tok::processor_byte_level$new(trim_offsets = FALSE) tok$train_from_memory( text, tok::trainer_bpe$new(vocab_size = vocab_size, special_tokens = special_tokens) ) tok } # test code to debug the tokenizer # data <- read_datasets() # tok <- create_tokenizer(data$content)
We can finally implement the torch dataset that we are going to use for training
the model. We are going to use the torch::iterable_dataset
instead of torch::dataset
.
The main motivation is that we can't really know the total number of samples in
the dataset, so we can implement a .getitem()
method to get any arbiratrary sample.
Thus we implement the .iter
method that returns a new sample every time it's called.
r_sources_dataset <- torch::iterable_dataset( "r_sources_dataset", initialize = function(root = ".", vocab_size = 20000, context_length = 128) { self$data <- read_datasets() self$context_length <- context_length self$index <- sample.int(nrow(self$data)) # we only create a tokenizer if it doesn't exist, otherwise we just load it tok_path <- file.path(root, glue::glue("tokenizer-{vocab_size}.json")) if (!file.exists(tok_path)) { self$tok <- create_tokenizer( as.character(self$data$content), vocab_size, c("<fbegin>", "<fend>") ) fs::dir_create(root) self$tok$save(tok_path) } else { self$tok <- tok::tokenizer$from_file(tok_path) } }, .iter = function() { i <- 1L sequence <- c() function() { while (length(sequence) < (self$context_length + 1) && i <= nrow(self$data)) { sequence <<- c( sequence, self$tok$encode(paste("<fbegin>", as.character(self$data$content[self$index[i]]), "<fend>"))$ids ) i <- i + 1L } if (length(sequence) < (self$context_length + 1)) { return(coro::exhausted()) } on.exit({ sequence <<- sequence[-seq_len(self$context_length)] }) list( input_ids = sequence[seq_len(self$context_length)] + 1L, labels = sequence[2:(self$context_length + 1)] + 1L ) } } ) # debug code for the dataset # ds <- r_sources_dataset("~/Downloads/") # it <- ds$.iter() # it() # ds$tok$get_vocab_size()
This dataset is likely too large for us to train the model on all documents in this example. It's also hard to predict how long it will take for it to train until the end. In order to make it easier, we define a wraper dataset that is used to run the above dataset for a fixed number of steps. This is not required, but makes using luz more pleasant, as we can easily define for how many tokens we want to train our model.
fixed_steps_iterable_dataset <- iterable_dataset( "fixed_steps_dataset", initialize = function(dataset, steps) { self$dataset <- dataset self$steps <- steps }, .iter = function() { i <- 1L iter <- NULL function() { if (i > self$steps) { return(coro::exhausted()) } i <<- i + 1L if (is.null(iter) || coro::is_exhausted(data <- iter())) { iter <<- self$dataset$.iter() data <- iter() } data } }, .length = function() { self$steps } )
We finally define the model we are going to train. We'll use a small version of
GPT2.
We also define a generate
method allowing us to sample from the model given an initial
context.
net <- nn_module( initialize = function() { self$gpt <- minhub::gpt2( vocab_size = 20000, pdrop = 0.1 ) }, forward = function(x) { self$gpt(x)$transpose(2,3) }, generate = function(x, temperature = 1, iter = 50, top_k = 10) { # samples from the model givn a context vector. for (i in seq_len(iter)) { logits <- self$forward(x)[,,-1] logits <- logits/temperature c(prob, ind) %<-% logits$topk(top_k) logits <- torch_full_like(logits, -Inf)$scatter_(-1, ind, prob) logits <- nnf_softmax(logits, dim = -1) id_next <- torch_multinomial(logits, num_samples = 1) x <- torch_cat(list(x, id_next), dim = 2) } x } ) # debug code for the model # ds <- torch::dataloader(r_sources_dataset("~/Downloads/"), batch_size = 32) # batch <- coro::collect(ds, 1)[[1]] # str(batch) # m <- net() # str(m(batch$input_ids))
To make it easier to inspect training, we will also define a callback that prints a sample from the model every epoch.
# samples from the model using the context. generate <- function(model, tok, context, ...) { local_no_grad() # disables gradient for sampling x <- tok$encode(context)$ids + 1L x <- torch_tensor(x)[NULL,]$to(device = model$device) content <- as.integer(model$generate(x, ...)$cpu()) tok$decode(content - 1L) } display_cb <- luz_callback( initialize = function() {}, on_epoch_end = function() { local_no_grad() # sample from the model... context <- "# creates a linear model" text <- generate(ctx$model, dataset$dataset$tok, context, iter = 100) cli::cli_rule() cat(text, "\n") cli::cli_rule() } )
We can finally train the model. We define that we want to train the model for half a billion tokens in a total of 100 epochs.
n_tokens <- 500e6 batch_size <- 16 epochs <- 100 context_length <- 256L steps <- n_tokens / context_length / epochs dataset <- fixed_steps_iterable_dataset( r_sources_dataset(context_length = context_length), steps = steps ) fitted <- net %>% setup( optimizer = optim_adam, loss = nn_cross_entropy_loss() ) %>% set_opt_hparams(lr = 3e-4) |> fit( dataset, epochs = epochs, dataloader_options = list(batch_size = batch_size), callbacks = list( luz_callback_lr_scheduler( torch::lr_one_cycle, max_lr = 0.1, epochs = epochs, steps_per_epoch = steps/batch_size, call_on = "on_batch_end" ), luz_callback_gradient_clip(max_norm = 1), display_cb() ), verbose = TRUE ) luz::luz_save(fitted, "model.pt")
We can then use the model to generate text given a prompt with:
fitted <- luz::luz_load("model.pt") tok <- tok::tokenizer$from_file("tokenizer-20000.json") context <- "#' Creates a linear model linear_model <- function(x, y) { " text <- generate(fitted$model, tok, context, iter = 100) cat(text)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.