```r
if (!reticulate::py_module_available("tensorflow")) {
  knitr::opts_chunk$set(eval = FALSE)
} else {
  knitr::opts_chunk$set(eval = TRUE)
}

library(deepG)
library(magrittr)
library(keras)

options(rmarkdown.html_vignette.check_title = FALSE)
```
```{css, echo=FALSE}
mark.in { background-color: CornflowerBlue; }
mark.out { background-color: IndianRed; }
```
The deepG library offers several options to extract input/target pairs from data. We can differentiate between two main approaches:

* **Language model:** predict a character or several characters in a sequence.
* **Label classification:** map a label to a sequence.

## Language model

By a language model, we mean a model that predicts a character in a sequence. We have several options to determine the output format of the data generator using the `output_format` argument. The `output_format` determines the shape of the output for a language model, i.e. one part of a sequence is the input $X$ and another part the target $Y$. Assume a sequence <tt>abcdefg</tt> and `maxlen = 6`. The outputs correspond as follows:

* **"target_right"**: $X=$ <tt>abcdef</tt>, $Y=$ <tt>g</tt>
* **"target_middle_lstm"**: $X =$ ($X_1 =$ <tt>abc</tt>, $X_2 =$ <tt>gfe</tt>), $Y=$ <tt>d</tt> (note the reversed order of $X_2$)
* **"target_middle_cnn"**: $X =$ <tt>abcefg</tt>, $Y =$ <tt>d</tt>
* **"wavenet"**: $X =$ <tt>abcdef</tt>, $Y =$ <tt>bcdefg</tt>

### Create dummy data

To test the different language model options, we create a simple dummy data set consisting of a repetition of the sequence <tt>AAACCCGGGTTTAAACCC...</tt>.

```r
vocabulary <- c("A", "C", "G", "T")
base_seq <- "AAACCCGGGTTT"
full_seq <- strrep(base_seq, 50)
df <- data.frame(Header = "header", Sequence = full_seq)

# create training fasta file
train_dir <- tempfile()
dir.create(train_dir)
microseq::writeFasta(df, file.path(train_dir, "train_1.fasta"))

# create validation fasta file (use same data as training)
val_dir <- tempfile()
dir.create(val_dir)
microseq::writeFasta(df, file.path(val_dir, "val_1.fasta"))
```
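To see what the generator returns for a given `output_format`, we can draw a single batch directly. This is a minimal sketch; it assumes that `get_generator()` (which we also use below for the masked language model) accepts the same `train_type` and `output_format` arguments as `train_model()`.

```r
# Sketch: inspect one input/target pair for output_format = "target_right".
# Assumption: get_generator() supports train_type = "lm" together with output_format.
gen <- get_generator(path = train_dir,
                     train_type = "lm",
                     output_format = "target_right",
                     batch_size = 1,
                     maxlen = 6,
                     vocabulary = c("A", "C", "G", "T"))
z <- gen()
x <- z[[1]] # one-hot encoded input of shape (batch_size, maxlen, 4)
y <- z[[2]] # one-hot encoded target character of shape (batch_size, 4)
dim(x)
dim(y)
```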
### Predict next character

Say we want to predict the next character in a sequence given the last 5 characters, where our text consists of the letters A, C, G, T. First, we have to create a model. We may use a model with one LSTM and one dense layer for predictions.
```r
model <- create_model_lstm_cnn(
  maxlen = 5,
  layer_lstm = c(8),
  layer_dense = c(4),
  learning_rate = 0.1,
  vocabulary_size = 4 # text consists of A,C,G,T
)
```
Next, we have to specify the location of our training and validation data and the output format of the data generator:
hist <- train_model(train_type = "lm", # running a language model output_format = "target_right", # predict target at end of sequence model = model, path = train_dir, path_val = val_dir, steps_per_epoch = 5, # use 5 batches per epoch train_val_ratio = 0.2, # use 20% of samples for validation compared to train batch_size = 16, epochs = 4) plot(hist)
### Predict character in the middle of a sequence

If we want to predict a character in the middle of a sequence and use LSTM layers, we should split our input into two layers. One layer handles the sequence before the target and the other the sequence after the target. If, for example, the sequence is

<tt>ACCGTGGAA</tt>

then the first input corresponds to <tt>ACCG</tt> and the second to <tt>AAGG</tt> (the part after the target in reversed order). We may create a model with two input layers using the `create_model_lstm_cnn_target_middle` function.
```r
model <- create_model_lstm_cnn_target_middle(
  maxlen = 5,
  layer_lstm = c(8),
  layer_dense = c(4),
  learning_rate = 0.1,
  vocabulary_size = 4
)
```
The `train_model` call is identical to the previous one, except that we have to change the output format of the generator by setting `output_format = "target_middle_lstm"`. This reverses the order of the part of the sequence that comes after the target.
hist <- train_model(train_type = "lm", # running a language model output_format = "target_middle_lstm", # predict target in middle of sequence model = model, path = train_dir, path_val = val_dir, steps_per_epoch = 5, # use 5 batches per epoch train_val_ratio = 0.2, # use 20% of samples for validation compared to train batch_size = 16, epochs = 4) plot(hist)
## Masked language model

Here we mask some parts of the input sequence and the model tries to predict the masked regions. This can be used for training BERT-like models. See also this notebook. We can first check how the generator works.
```r
# create dummy training data
nt_seq <- rep(c("A", "C", "G", "T"), each = 25) %>% paste(collapse = "") %>% strrep(10)
df <- data.frame(Sequence = nt_seq, Header = "seq_1")
fasta_path <- tempfile(fileext = ".fasta")
fasta_file <- microseq::writeFasta(df, fasta_path)

masked_lm <- list(mask_rate = 0.10, # replace 10% of input with special mask token
                  random_rate = 0.03, # set 3% of input to random value
                  identity_rate = 0.02, # leave 2% unchanged (and set sample weight to 1)
                  include_sw = TRUE) # 0,1 matrix showing where masking was applied

gen <- get_generator(path = fasta_path,
                     train_type = "masked_lm",
                     masked_lm = masked_lm,
                     batch_size = 1,
                     n_gram = 1,
                     n_gram_stride = 1,
                     return_int = TRUE,
                     maxlen = 100,
                     vocabulary = c("A", "C", "G", "T"))

z <- gen()
x <- z[[1]]
y <- z[[2]]
sw <- z[[3]]
df <- data.frame(x = x[1, ], y = y[1, ], sw = sw[1, ])
print(head(df, 10))
print(df[df$sw == 1, ])
```
Create the model architecture.
```r
model <- create_model_transformer(
  maxlen = 100,
  vocabulary_size = 6,
  embed_dim = 16,
  ff_dim = 32,
  pos_encoding = "embedding",
  head_size = 20,
  num_heads = 4,
  layer_dense = 6,
  flatten_method = "none",
  last_layer_activation = "softmax",
  loss_fn = "sparse_categorical_crossentropy",
  solver = "adam",
  learning_rate = 0.005
)
```
Train the model.
```r
batch_size <- 128
masked_lm <- list(mask_rate = 0.10,
                  random_rate = 0.03,
                  identity_rate = 0.02,
                  include_sw = TRUE)

hist <- train_model(model = model,
                    # training args
                    run_name = "bert_1",
                    epochs = 8,
                    steps_per_epoch = 75,
                    # generator args
                    maxlen = 100,
                    train_type = "masked_lm",
                    path = fasta_path,
                    path_val = NULL,
                    batch_size = batch_size,
                    step = 25,
                    masked_lm = masked_lm,
                    proportion_per_seq = 0.97,
                    return_int = TRUE)
plot(hist)
```
Evaluate the trained model.
```r
gen <- get_generator(path = fasta_path,
                     train_type = "masked_lm",
                     masked_lm = masked_lm,
                     batch_size = 1,
                     n_gram = 1,
                     n_gram_stride = 1,
                     return_int = TRUE,
                     maxlen = 100,
                     vocabulary = c("A", "C", "G", "T"))

z <- gen()
x <- z[[1]]
y <- z[[2]]
sw <- z[[3]]

pred <- model$predict(x)
pred <- apply(pred[1, , ], 1, which.max) - 1
df <- data.frame(x = x[1, ], sw = sw[1, ], y = y[1, ], pred = pred)
head(df)
df[df$sw == 1, ]
```
```r
df2 <- df[df$sw == 1, ]
table(df2$pred, df2$y)
```
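As a quick summary of the table above, we can also compute the accuracy on the masked positions, using the data frame from the previous chunk.

```r
# Fraction of masked positions where the predicted token matches the true token.
mean(df2$pred == df2$y)
```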
## Label classification

With label classification, we describe the task of mapping a label to a sequence. For example: given the input <tt>ACGACCG</tt>, does the sequence belong to a viral or bacterial genome?
deepG offers three options to map a label to a sequence:

* the label gets read from the fasta header
* files from every class are in separate folders
* the label gets read from a csv file
### Create dummy data

To test label classification, we create a simple dummy data set. One class consists of random sequences using just A and C, and the second class uses just G and T.
```r
# create training fasta files
train_dir_1 <- tempfile()
train_dir_2 <- tempfile()
dir.create(train_dir_1)
dir.create(train_dir_2)
train_dir <- list(train_dir_1, train_dir_2)

for (i in 1:2) {
  if (i == 1) {
    vocabulary <- c("A", "C")
    header <- "label_1"
    fasta_name_start <- "label_1_train_file"
  } else {
    vocabulary <- c("G", "T")
    header <- "label_2"
    fasta_name_start <- "label_2_train_file"
  }
  create_dummy_data(file_path = train_dir[[i]],
                    num_files = 3,
                    seq_length = 20,
                    num_seq = 5,
                    header = header,
                    fasta_name_start = fasta_name_start,
                    vocabulary = vocabulary)
}

# create validation fasta files
val_dir_1 <- tempfile()
val_dir_2 <- tempfile()
dir.create(val_dir_1)
dir.create(val_dir_2)
val_dir <- list(val_dir_1, val_dir_2)

for (i in 1:2) {
  if (i == 1) {
    vocabulary <- c("A", "C")
    header <- "label_1"
    fasta_name_start <- "label_1_val_file"
  } else {
    vocabulary <- c("G", "T")
    header <- "label_2"
    fasta_name_start <- "label_2_val_file"
  }
  create_dummy_data(file_path = val_dir[[i]],
                    num_files = 3,
                    seq_length = 20,
                    num_seq = 5,
                    header = header,
                    fasta_name_start = fasta_name_start,
                    vocabulary = vocabulary)
}
```
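We can quickly confirm what was created; each class folder should now contain three fasta files.

```r
# List the dummy files created for each class.
basename(list.files(train_dir_1))
basename(list.files(train_dir_2))
```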
### Label by folder

In this approach, we put all data from one class into a separate folder. Say we want to classify whether a sequence belongs to a viral or bacterial genome. We may put all virus and bacteria files into their own folders. In this case the `path` and `path_val` arguments should be vectors, where each entry is the path to one class.
First, we have to create a model. We may again use a model with one LSTM and one dense layer for predictions. An input sequence has length 5.
```r
model <- create_model_lstm_cnn(
  maxlen = 5,
  layer_lstm = c(8),
  learning_rate = 0.1,
  layer_dense = c(2), # binary classification
  vocabulary_size = 4 # text consists of A,C,G,T
)
```
train_model(train_type = "label_folder", # reading label from folder model = model, path = c(train_dir_1, # note that path has two entries train_dir_2), path_val = c(val_dir_1, val_dir_2), steps_per_epoch = 5, # use 5 batches per epoch train_val_ratio = 0.2, batch_size = 8, epochs = 2, vocabulary_label = c("label_1", "label_2") # names of classes )
### Label from fasta header

The fasta headers in our dummy data have the names "label_1" or "label_2":
```r
files <- list.files(train_dir_1, full.names = TRUE)
fasta_file <- microseq::readFasta(files[1])
head(fasta_file)
```
train_model(train_type = "label_header", # reading label from fasta header model = model, path = train_dir, path_val = val_dir, steps_per_epoch = 5, train_val_ratio = 0.2, batch_size = 8, epochs = 2, vocabulary_label = c("label_1", "label_2") # names of labels )
### Label from csv file

In this approach, we extract the sequence label by mapping the current file name to a csv table.
```r
files_1 <- basename(list.files(c(train_dir_1, val_dir_1)))
files_2 <- basename(list.files(c(train_dir_2, val_dir_2)))
file <- c(files_1, files_2)
label_1 <- stringr::str_detect(file, "label_1") %>% as.integer()
label_2 <- stringr::str_detect(file, "label_2") %>% as.integer()
df <- data.frame(file, label_1, label_2)
df

csv_path <- tempfile(fileext = ".csv")
write.csv(df, csv_path, row.names = FALSE)

hist <- train_model(train_type = "label_csv",
                    target_from_csv = csv_path,
                    model = model,
                    path = train_dir,
                    path_val = val_dir,
                    steps_per_epoch = 5,
                    train_val_ratio = 0.2,
                    batch_size = 8,
                    epochs = 2)
plot(hist)
```
## Rds files

We can also use rds files as input, where the data must already be preprocessed. We may use the `dataset_from_gen` function to create rds files from fasta files.
```r
rds_folder_train <- tempfile()
rds_folder_val <- tempfile()
dir.create(rds_folder_train)
dir.create(rds_folder_val)

for (data_type in c("train", "val")) {
  if (data_type == "train") {
    output_path <- rds_folder_train
    path_corpus <- train_dir
  } else {
    output_path <- rds_folder_val
    path_corpus <- val_dir
  }
  dataset_from_gen(output_path = output_path,
                   iterations = 25, # create 25 rds files
                   train_type = "label_folder",
                   path_corpus = path_corpus,
                   batch_size = 128,
                   maxlen = 5,
                   step = 5,
                   vocabulary = c("a", "c", "g", "t"),
                   file_name_start = "batch_")
}
```
We created 25 files each for training and for validation, containing preprocessed data.
```r
train_files <- list.files(rds_folder_train, full.names = TRUE)
basename(train_files)

example_batch <- readRDS(train_files[1])
x <- example_batch[[1]]
y <- example_batch[[2]]
dim(x)
dim(y)
x[1, , ]
y[1, ]
```
We can now use these files for training.
```r
model <- create_model_lstm_cnn(
  maxlen = 5,
  layer_lstm = c(8),
  learning_rate = 0.1,
  layer_dense = c(2))

hist <- train_model(train_type = "label_rds",
                    model = model,
                    path = rds_folder_train,
                    path_val = rds_folder_val,
                    steps_per_epoch = 5,
                    format = "rds",
                    batch_size = 8,
                    epochs = 2)
plot(hist)
```