#' Create transformer model
#'
#' Creates a transformer network for classification. The model can consist of several stacked attention blocks.
#'
#' @inheritParams keras::layer_multi_head_attention
#' @inheritParams create_model_lstm_cnn
#' @param pos_encoding Either `"sinusoid"` or `"embedding"`; how to add positional information.
#' If `"sinusoid"`, sine waves of different frequencies are added to the input (see Details).
#' If `"embedding"`, the model learns a positional embedding.
#' @param embed_dim Dimension of the token embedding. No embedding if set to 0, in which case the input
#' is expected to be one-hot encoded; use a positive value when the input is an integer sequence.
#' @param head_size Dimension of the attention key. Can be a vector with one entry per attention block.
#' @param n Frequency of sine waves for positional encoding. Only applied if `pos_encoding = "sinusoid"`.
#' @param ff_dim Units of the first dense layer in the feed-forward part of each attention block. Can be a vector with one entry per block.
#' @param dropout Vector of dropout rates after the attention block(s), one entry per block.
#' @param dropout_dense Dropout for dense layers.
#' @param flatten_method How to process the output of the last attention block. Can be `"max_ch_first"`, `"max_ch_last"`, `"average_ch_first"`,
#' `"average_ch_last"`, `"both_ch_first"`, `"both_ch_last"`, `"all"`, `"none"` or `"flatten"`.
#' The `"max"` and `"average"` options apply global max/average pooling, with the `_ch_first`/`_ch_last` suffix
#' deciding along which axis. `"both_ch_first"`/`"both_ch_last"` apply max and average pooling together, and `"all"`
#' applies all four global pooling options together. If `"flatten"`, the output of the last attention block is flattened; if `"none"`, no flattening is applied.
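#'
#' @details
#' With `pos_encoding = "sinusoid"`, positional information is added as fixed sine waves of
#' different frequencies, in the spirit of the sinusoidal encoding of Vaswani et al. (2017),
#' \deqn{PE_{(pos, 2i)} = \sin(pos / n^{2i/d}), \quad PE_{(pos, 2i+1)} = \cos(pos / n^{2i/d}),}
#' where \eqn{d} is the embedding dimension and \eqn{n} is the frequency argument `n`.
#' With `pos_encoding = "embedding"`, a trainable positional embedding is learned instead.
#' Supplying vectors of equal length for `head_size`, `num_heads`, `ff_dim` and `dropout`
#' stacks one attention block per entry.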
#' @examples
#'
#' maxlen <- 50
#' \donttest{
#' library(keras)
#' model <- create_model_transformer(maxlen = maxlen,
#'                                   head_size = c(10, 12),
#'                                   num_heads = c(7, 8),
#'                                   ff_dim = c(5, 9),
#'                                   dropout = c(0.3, 0.5))
#' }
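#'
#' # A second sketch: one-hot input (embed_dim = 0) with sinusoidal positional
#' # encoding and global average pooling; all arguments used are documented above.
#' \donttest{
#' model_onehot <- create_model_transformer(maxlen = maxlen,
#'                                          embed_dim = 0,
#'                                          pos_encoding = "sinusoid",
#'                                          head_size = 16,
#'                                          num_heads = 4,
#'                                          ff_dim = 32,
#'                                          dropout = 0.1,
#'                                          flatten_method = "average_ch_last")
#' }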
#' @returns A keras model implementing transformer architecture.
#' @export
create_model_transformer <- function(maxlen,
                                     vocabulary_size = 4,
                                     embed_dim = 64,
                                     pos_encoding = "embedding",
                                     head_size = 4L,
                                     num_heads = 5L,
                                     ff_dim = 8,
                                     dropout = 0,
                                     n = 10000, # pos emb frequency
                                     layer_dense = 2,
                                     dropout_dense = NULL,
                                     flatten_method = "flatten",
                                     last_layer_activation = "softmax",
                                     loss_fn = "categorical_crossentropy",
                                     solver = "adam",
                                     learning_rate = 0.01,
                                     label_noise_matrix = NULL,
                                     bal_acc = FALSE,
                                     f1_metric = FALSE,
                                     auc_metric = FALSE,
                                     label_smoothing = 0,
                                     verbose = TRUE,
                                     model_seed = NULL,
                                     mixed_precision = FALSE,
                                     mirrored_strategy = NULL) {
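  # Mixed precision switches the global Keras policy to float16 compute; with
  # more than one GPU (or mirrored_strategy = TRUE), the model is rebuilt inside
  # a MirroredStrategy scope by a recursive call, with mirrored_strategy set to
  # FALSE to stop the recursion.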
  if (mixed_precision) tensorflow::tf$keras$mixed_precision$set_global_policy("mixed_float16")
  if (is.null(mirrored_strategy)) mirrored_strategy <- count_gpu() > 1
  if (mirrored_strategy) {
    mirrored_strategy <- tensorflow::tf$distribute$MirroredStrategy()
    with(mirrored_strategy$scope(), {
      argg <- as.list(environment())
      argg$mirrored_strategy <- FALSE
      model <- do.call(create_model_transformer, argg)
    })
    return(model)
  }
  stopifnot(length(head_size) == length(num_heads))
  stopifnot(length(head_size) == length(dropout))
  stopifnot(length(head_size) == length(ff_dim))
  stopifnot(flatten_method %in% c("max_ch_first", "max_ch_last", "average_ch_first",
                                  "average_ch_last", "both_ch_first", "both_ch_last", "all", "none", "flatten"))
  stopifnot(pos_encoding %in% c("sinusoid", "embedding"))
  num_dense_layers <- length(layer_dense)
  head_size <- as.integer(head_size)
  num_heads <- as.integer(num_heads)
  maxlen <- as.integer(maxlen)
  num_attention_blocks <- length(num_heads)
  vocabulary_size <- as.integer(vocabulary_size)
  if (!is.null(model_seed)) tensorflow::tf$random$set_seed(model_seed)
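  # Input layer: with embed_dim == 0 the input is one-hot encoded with shape
  # (maxlen, vocabulary_size); otherwise it is an integer sequence of length
  # maxlen (the token embedding is assumed to happen inside the
  # positional-encoding wrapper, which receives embed_dim).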
  if (embed_dim == 0) {
    input_tensor <- keras::layer_input(shape = c(maxlen, vocabulary_size))
  } else {
    input_tensor <- keras::layer_input(shape = c(maxlen))
  }
  # positional encoding
  if (pos_encoding == "sinusoid") {
    pos_enc_layer <- layer_pos_sinusoid_wrapper(maxlen = maxlen, vocabulary_size = vocabulary_size,
                                                n = n, embed_dim = embed_dim)
  }
  if (pos_encoding == "embedding") {
    pos_enc_layer <- layer_pos_embedding_wrapper(maxlen = maxlen, vocabulary_size = vocabulary_size,
                                                 embed_dim = embed_dim)
  }
  output_tensor <- input_tensor %>% pos_enc_layer
  # attention blocks
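  # Each block is expected to be a standard transformer encoder block
  # (multi-head self-attention followed by a feed-forward sub-layer); the
  # vector arguments supply the per-block settings.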
  for (i in 1:num_attention_blocks) {
    attn_block <- layer_transformer_block_wrapper(
      num_heads = num_heads[i],
      head_size = head_size[i],
      dropout_rate = dropout[i],
      ff_dim = ff_dim[i],
      embed_dim = embed_dim,
      vocabulary_size = vocabulary_size,
      load_r6 = FALSE)
    output_tensor <- output_tensor %>% attn_block
  }
  if (flatten_method != "none") {
    output_tensor <- pooling_flatten(global_pooling = flatten_method, output_tensor = output_tensor)
  }
  # dense layers
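  # All but the last entry of layer_dense become hidden relu layers (with
  # optional dropout); the last entry is the output layer, whose dtype is
  # pinned to float32 so predictions stay numerically stable under mixed precision.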
  if (num_dense_layers > 1) {
    for (i in 1:(num_dense_layers - 1)) {
      output_tensor <- output_tensor %>% keras::layer_dense(units = layer_dense[i], activation = "relu")
      if (!is.null(dropout_dense)) {
        output_tensor <- output_tensor %>% keras::layer_dropout(rate = dropout_dense[i])
      }
    }
  }
  output_tensor <- output_tensor %>% keras::layer_dense(units = layer_dense[length(layer_dense)], activation = last_layer_activation, dtype = "float32")
  # create model
  model <- keras::keras_model(inputs = input_tensor, outputs = output_tensor)
  model <- compile_model(model = model, label_smoothing = label_smoothing, layer_dense = layer_dense,
                         solver = solver, learning_rate = learning_rate, loss_fn = loss_fn,
                         num_output_layers = 1, label_noise_matrix = label_noise_matrix,
                         bal_acc = bal_acc, f1_metric = f1_metric, auc_metric = auc_metric)
  if (verbose) print(model)
  argg <- c(as.list(environment()))
  model <- add_hparam_list(model, argg)
  reticulate::py_set_attr(x = model, name = "hparam", value = model$hparam)
  model
}
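
# Usage sketch (commented out; assumes keras/TensorFlow are installed and that
# integer inputs are zero-based, which is an assumption, not confirmed here):
# model <- create_model_transformer(maxlen = 50)
# x <- matrix(sample(0:3, 50, replace = TRUE), nrow = 1)  # one integer sequence
# preds <- predict(model, x)  # shape (1, 2) with the default layer_dense = 2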