#' This is the companion code to the post
#' "Attention-based Image Captioning with Keras"
#' on the TensorFlow for R blog.
#'
#' https://blogs.rstudio.com/tensorflow/posts/2018-09-17-eager-captioning
library(keras)
use_implementation("tensorflow")
library(tensorflow)
tfe_enable_eager_execution(device_policy = "silent")
np <- import("numpy")
library(tfdatasets)
library(purrr)
library(stringr)
library(glue)
library(rjson)
library(rlang)
library(dplyr)
library(magick)
# debugging helper: if debugshapes is TRUE, print the shape of x together
# with the name of the argument (captured via rlang::enexpr)
maybecat <- function(context, x) {
if (debugshapes) {
name <- enexpr(x)
dims <- paste0(dim(x), collapse = " ")
cat(context, ": shape of ", name, ": ", dims, "\n", sep = "")
}
}
debugshapes <- FALSE          # set to TRUE to print tensor shapes via maybecat()
restore_checkpoint <- FALSE   # set to TRUE to restore the latest checkpoint instead of training from scratch
saved_features_exist <- FALSE # set to TRUE once image features and sampling indices have been saved to disk
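# Usage sketch for the shape-debugging helper (not executed; the names below
# are illustrative). With debugshapes set to TRUE, maybecat() prints the
# shape of a tensor at the tagged point, e.g. "demo: shape of dummy: 10 64".
if (FALSE) {
  debugshapes <- TRUE
  dummy <- k_zeros(c(10L, 64L))
  maybecat("demo", dummy)
}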
use_session_with_seed(7777,
disable_gpu = FALSE,
disable_parallel_cpu = FALSE)
annotation_file <- "train2014/annotations/captions_train2014.json"
image_path <- "train2014/train2014"
annotations <- fromJSON(file = annotation_file)
# the fourth element of the parsed JSON holds the caption annotations
annot_captions <- annotations[[4]]
# 414113 captions in the full training set
num_captions <- length(annot_captions)
all_captions <- vector(mode = "list", length = num_captions)
all_img_names <- vector(mode = "list", length = num_captions)
for (i in seq_len(num_captions)) {
caption <-
paste0("<start> ", annot_captions[[i]][["caption"]], " <end>")
image_id <- annot_captions[[i]][["image_id"]]
# build the image file name from the numeric id (zero-padded to 12 digits)
full_coco_image_path <-
sprintf("%s/COCO_train2014_%012d.jpg", image_path, image_id)
all_img_names[[i]] <- full_coco_image_path
all_captions[[i]] <- caption
}
# work with a random subset of num_examples captions; split it 80/20 into
# training and validation and persist the indices so the split can be reused
num_examples <- 30000
if (!saved_features_exist) {
random_sample <- sample(1:num_captions, size = num_examples)
train_indices <-
sample(random_sample, size = length(random_sample) * 0.8)
validation_indices <-
setdiff(random_sample, train_indices)
saveRDS(random_sample,
paste0("random_sample_", num_examples, ".rds"))
saveRDS(train_indices,
paste0("train_indices_", num_examples, ".rds"))
saveRDS(validation_indices,
paste0("validation_indices_", num_examples, ".rds"))
} else {
random_sample <-
readRDS(paste0("random_sample_", num_examples, ".rds"))
train_indices <-
readRDS(paste0("train_indices_", num_examples, ".rds"))
validation_indices <-
readRDS(paste0("validation_indices_", num_examples, ".rds"))
}
sample_captions <- all_captions[random_sample]
sample_images <- all_img_names[random_sample]
train_captions <- all_captions[train_indices]
train_images <- all_img_names[train_indices]
validation_captions <- all_captions[validation_indices]
validation_images <- all_img_names[validation_indices]
# read an image, resize it to 299x299 and apply Inception V3 preprocessing
load_image <- function(image_path) {
img <- tf$read_file(image_path) %>%
tf$image$decode_jpeg(channels = 3) %>%
tf$image$resize_images(c(299L, 299L)) %>%
tf$keras$applications$inception_v3$preprocess_input()
list(img, image_path)
}
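# Quick shape sketch for load_image (not executed by default): the decoded and
# preprocessed image comes back as a (299, 299, 3) float tensor, scaled to
# [-1, 1] by the Inception V3 preprocessing.
if (FALSE) {
  c(img, img_path) %<-% load_image(all_img_names[[1]])
  dim(img)   # 299 299 3
}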
# Inception V3 without its classification head serves as a fixed feature extractor
image_model <- application_inception_v3(include_top = FALSE,
weights = "imagenet")
if (!saved_features_exist) {
# run each unique image through Inception V3 once and cache the resulting
# feature maps on disk as .npy files (one file per image)
preencode <- unique(sample_images) %>% unlist() %>% sort()
num_unique <- length(preencode)
batch_size_4save <- 1
image_dataset <- tensor_slices_dataset(preencode) %>%
dataset_map(load_image) %>%
dataset_batch(batch_size_4save)
save_iter <- make_iterator_one_shot(image_dataset)
save_count <- 0
until_out_of_range({
if (save_count %% 100 == 0) {
cat("Saving feature:", save_count, "of", num_unique, "\n")
}
save_count <- save_count + batch_size_4save
batch_4save <- save_iter$get_next()
img <- batch_4save[[1]]
path <- batch_4save[[2]]
batch_features <- image_model(img)
# flatten the 8x8 spatial grid: (batch_size, 8, 8, 2048) -> (batch_size, 64, 2048)
batch_features <- tf$reshape(batch_features,
list(dim(batch_features)[1], -1L, dim(batch_features)[4]))
for (i in 1:dim(batch_features)[1]) {
p <- path[i]$numpy()$decode("utf-8")
np$save(p,
batch_features[i, ,]$numpy())
}
})
}
# keep only the top_k most frequent words; everything else maps to "<unk>"
top_k <- 5000
tokenizer <- text_tokenizer(num_words = top_k,
oov_token = "<unk>",
filters = '!"#$%&()*+.,-/:;=?@[\\]^_`{|}~ ')
tokenizer$fit_on_texts(sample_captions)
train_captions_tokenized <-
tokenizer %>% texts_to_sequences(train_captions)
validation_captions_tokenized <-
tokenizer %>% texts_to_sequences(validation_captions)
# inspect the fitted vocabulary: "<unk>" already has an index, but there is no
# entry yet for index 0, which pad_sequences will use, so register "<pad>" explicitly
tokenizer$word_index
tokenizer$word_index["<unk>"]
tokenizer$word_index["<pad>"] <- 0
tokenizer$word_index["<pad>"]
word_index_df <- data.frame(
word = tokenizer$word_index %>% names(),
index = tokenizer$word_index %>% unlist(use.names = FALSE),
stringsAsFactors = FALSE
)
word_index_df <- word_index_df %>% arrange(index)
decode_caption <- function(text) {
paste(map(text, function(number)
word_index_df %>%
filter(index == number) %>%
select(word) %>%
pull()),
collapse = " ")
}
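# Round-trip sketch (not executed): tokenize a caption and decode it back.
# The caption text is made up; words outside the top_k vocabulary typically
# come back as "<unk>".
if (FALSE) {
  tokens <- texts_to_sequences(tokenizer,
                               list("<start> a dog rides a skateboard <end>"))[[1]]
  decode_caption(tokens)
}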
# distribution of caption lengths (in words); pad / truncate all captions to
# the maximum observed length
caption_lengths <-
map(all_captions[1:num_examples], function(c)
str_split(c, " ")[[1]] %>% length()) %>% unlist()
fivenum(caption_lengths)
max_length <- fivenum(caption_lengths)[5]
train_captions_padded <-
pad_sequences(
train_captions_tokenized,
maxlen = max_length,
padding = "post",
truncating = "post"
)
validation_captions_padded <-
pad_sequences(
validation_captions_tokenized,
maxlen = max_length,
padding = "post",
truncating = "post"
)
# quick sanity checks on the prepared training data
length(train_images)
dim(train_captions_padded)
batch_size <- 10
buffer_size <- num_examples
embedding_dim <- 256
gru_units <- 512
vocab_size <- top_k
features_shape <- 2048
attention_features_shape <- 64
# a few fixed training / validation examples for qualitative checks after each epoch
train_images_4checking <- train_images[c(4, 10, 30)]
train_captions_4checking <- train_captions_padded[c(4, 10, 30),]
validation_images_4checking <- validation_images[c(7, 10, 12)]
validation_captions_4checking <-
validation_captions_padded[c(7, 10, 12),]
# load the cached Inception V3 features (.npy) belonging to an image
map_func <- function(img_name, cap) {
p <- paste0(img_name$decode("utf-8"), ".npy")
img_tensor <- np$load(p)
img_tensor <- tf$cast(img_tensor, tf$float32)
list(img_tensor, cap)
}
train_dataset <-
tensor_slices_dataset(list(train_images, train_captions_padded)) %>%
dataset_map(function(item1, item2)
tf$py_func(map_func, list(item1, item2), list(tf$float32, tf$int32))) %>%
# dataset_shuffle(buffer_size) %>%
dataset_batch(batch_size)
cnn_encoder <-
function(embedding_dim,
name = NULL) {
keras_model_custom(name = name, function(self) {
self$fc <-
layer_dense(units = embedding_dim, activation = "relu")
function(x, mask = NULL) {
# input shape: (batch_size, 64, features_shape)
# shape after fc: (batch_size, 64, embedding_dim)
maybecat("encoder input", x)
x <- self$fc(x)
maybecat("encoder output", x)
x
}
})
}
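# Shape sketch for the encoder (not executed; names are illustrative): a dummy
# batch of Inception features (batch_size, 64, features_shape) maps to
# (batch_size, 64, embedding_dim) after the dense layer.
if (FALSE) {
  enc_test <- cnn_encoder(embedding_dim)
  dim(enc_test(k_random_uniform(c(2L, 64L, 2048L))))   # 2 64 256
}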
attention_module <-
function(gru_units,
name = NULL) {
keras_model_custom(name = name, function(self) {
self$W1 <- layer_dense(units = gru_units)
self$W2 <- layer_dense(units = gru_units)
self$V <- layer_dense(units = 1)
function(inputs, mask = NULL) {
features <- inputs[[1]]
hidden <- inputs[[2]]
# features(CNN_encoder output) shape == (batch_size, 64, embedding_dim)
# hidden shape == (batch_size, gru_units)
# hidden_with_time_axis shape == (batch_size, 1, gru_units)
hidden_with_time_axis <- k_expand_dims(hidden, axis = 2)
maybecat("attention module", features)
maybecat("attention module", hidden)
maybecat("attention module", hidden_with_time_axis)
# score shape == (batch_size, 64, 1)
score <-
self$V(k_tanh(self$W1(features) + self$W2(hidden_with_time_axis)))
# attention_weights shape == (batch_size, 64, 1)
attention_weights <- k_softmax(score, axis = 2)
# context_vector shape after sum == (batch_size, embedding_dim)
context_vector <-
k_sum(attention_weights * features, axis = 2)
maybecat("attention module", score)
maybecat("attention module", attention_weights)
maybecat("attention module", context_vector)
list(context_vector, attention_weights)
}
})
}
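# Shape sketch for the attention module (not executed; names are illustrative):
# given features (batch, 64, embedding_dim) and a hidden state (batch, gru_units),
# it returns a context vector (batch, embedding_dim) and attention weights
# (batch, 64, 1).
if (FALSE) {
  att_test <- attention_module(gru_units)
  c(ctx, att_w) %<-% att_test(list(k_random_uniform(c(2L, 64L, 256L)),
                                   k_zeros(c(2L, 512L))))
  dim(ctx)     # 2 256
  dim(att_w)   # 2 64 1
}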
rnn_decoder <-
function(embedding_dim,
gru_units,
vocab_size,
name = NULL) {
keras_model_custom(name = name, function(self) {
self$gru_units <- gru_units
self$embedding <-
layer_embedding(input_dim = vocab_size, output_dim = embedding_dim)
self$gru <- if (tf$test$is_gpu_available()) {
layer_cudnn_gru(
units = gru_units,
return_sequences = TRUE,
return_state = TRUE,
recurrent_initializer = 'glorot_uniform'
)
} else {
layer_gru(
units = gru_units,
return_sequences = TRUE,
return_state = TRUE,
recurrent_initializer = 'glorot_uniform'
)
}
self$fc1 <- layer_dense(units = self$gru_units)
self$fc2 <- layer_dense(units = vocab_size)
self$attention <- attention_module(self$gru_units)
function(inputs, mask = NULL) {
x <- inputs[[1]]
features <- inputs[[2]]
hidden <- inputs[[3]]
maybecat("decoder", x)
maybecat("decoder", features)
maybecat("decoder", hidden)
c(context_vector, attention_weights) %<-% self$attention(list(features, hidden))
# x shape after passing through embedding == (batch_size, 1, embedding_dim)
x <- self$embedding(x)
maybecat("decoder x after embedding", x)
# x shape after concatenation == (batch_size, 1, 2 * embedding_dim)
x <-
k_concatenate(list(k_expand_dims(context_vector, 2), x))
maybecat("decoder x after concat", x)
# passing the concatenated vector to the GRU
c(output, state) %<-% self$gru(x)
maybecat("decoder output after gru", output)
maybecat("decoder state after gru", state)
# shape == (batch_size, 1, gru_units)
x <- self$fc1(output)
maybecat("decoder output after fc1", x)
# x shape == (batch_size, gru_units)
x <- k_reshape(x, c(-1, dim(x)[[3]]))
maybecat("decoder output after reshape", x)
# output shape == (batch_size, vocab_size)
x <- self$fc2(x)
maybecat("decoder output after fc2", x)
list(x, state, attention_weights)
}
})
}
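# Shape sketch for a single decoding step (not executed; names are illustrative):
# given the previous token(s), the encoder features and the hidden state, the
# decoder returns logits over the vocabulary plus the new state and the
# attention weights.
if (FALSE) {
  dec_test <- rnn_decoder(embedding_dim, gru_units, vocab_size)
  c(preds, state, att_w) %<-% dec_test(list(
    k_expand_dims(list(1)),               # a single dummy token index, shape (1, 1)
    k_random_uniform(c(1L, 64L, 256L)),   # dummy encoder output
    k_zeros(c(1L, 512L))                  # initial hidden state
  ))
  dim(preds)   # 1 5000
}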
encoder <- cnn_encoder(embedding_dim)
decoder <- rnn_decoder(embedding_dim, gru_units, vocab_size)
optimizer <- tf$train$AdamOptimizer()
# cross-entropy on the logits, with positions where the target is the padding
# index 0 masked out so they don't contribute to the loss
cx_loss <- function(y_true, y_pred) {
mask <- 1 - k_cast(y_true == 0L, dtype = "float32")
loss <-
tf$nn$sparse_softmax_cross_entropy_with_logits(labels = y_true,
logits = y_pred) * mask
tf$reduce_mean(loss)
}
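# Toy check for the masked loss (not executed): positions where the target is
# the padding index 0 are masked out and contribute nothing to the loss.
if (FALSE) {
  toy_true <- k_constant(c(3, 0), dtype = "int32")
  toy_pred <- k_random_uniform(c(2L, vocab_size))
  cx_loss(toy_true, toy_pred)
}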
get_caption <-
function(image) {
attention_matrix <-
matrix(0, nrow = max_length, ncol = attention_features_shape)
# shape=(1, 299, 299, 3)
temp_input <- k_expand_dims(load_image(image)[[1]], 1)
# shape=(1, 8, 8, 2048),
img_tensor_val <- image_model(temp_input)
# shape=(1, 64, 2048)
img_tensor_val <- k_reshape(img_tensor_val,
list(dim(img_tensor_val)[1],-1, dim(img_tensor_val)[4]))
# shape=(1, 64, 256)
features <- encoder(img_tensor_val)
dec_hidden <- k_zeros(c(1, gru_units))
dec_input <-
k_expand_dims(list(word_index_df[word_index_df$word == "<start>", "index"]))
result <- ""
for (t in seq_len(max_length - 1)) {
c(preds, dec_hidden, attention_weights) %<-%
decoder(list(dec_input, features, dec_hidden))
attention_weights <- k_reshape(attention_weights, c(-1))
attention_matrix[t, ] <- attention_weights %>% as.double()
pred_idx <- tf$multinomial(exp(preds), num_samples = 1)[1, 1] %>% as.double()
pred_word <-
word_index_df[word_index_df$index == pred_idx, "word"]
if (pred_word == "<end>") {
result <-
paste(result, pred_word)
attention_matrix <-
attention_matrix[1:length(str_split(result, " ")[[1]]), , drop = FALSE]
return (list(str_trim(result), attention_matrix))
} else {
result <-
paste(result, pred_word)
dec_input <- k_expand_dims(list(pred_idx))
}
}
list(str_trim(result), attention_matrix)
}
plot_attention <-
function(attention_matrix,
image_name,
result,
epoch) {
image <-
image_read(image_name) %>% image_scale("299x299!")
result <- str_split(result, " ")[[1]] %>% as.list()
# attention_matrix shape: nrow = max_length, ncol = attention_features_shape
for (i in 1:length(result)) {
att <- attention_matrix[i, ] %>% np$resize(tuple(8L, 8L))
dim(att) <- c(8, 8, 1)
att <- image_read(att) %>% image_scale("299x299") %>%
image_annotate(
result[[i]],
gravity = "northeast",
size = 20,
color = "white",
location = "+20+40"
)
overlay <-
image_composite(att, image, operator = "blend", compose_args = "30")
image_write(
overlay,
paste0(
"attention_plot_epoch_",
epoch,
"_img_",
image_name %>% basename() %>% str_sub(16,-5),
"_word_",
i,
".png"
)
)
}
}
# generate captions for the fixed check examples and print them next to the
# ground truth; optionally write attention plots to disk
check_sample_captions <-
function(epoch, mode, plot_attention) {
images <- switch(mode,
training = train_images_4checking,
validation = validation_images_4checking)
captions <- switch(mode,
training = train_captions_4checking,
validation = validation_captions_4checking)
cat("\n", "Sample checks on ", mode, " set:", "\n", sep = "")
for (i in 1:length(images)) {
c(result, attention_matrix) %<-% get_caption(images[[i]])
real_caption <-
decode_caption(captions[i,]) %>% str_remove_all(" <pad>")
cat("\nReal caption:", real_caption, "\n")
cat("\nPredicted caption:", result, "\n")
if (plot_attention)
plot_attention(attention_matrix, images[[i]], result, epoch)
}
}
checkpoint_dir <- "./checkpoints_captions"
checkpoint_prefix <- file.path(checkpoint_dir, "ckpt")
checkpoint <-
tf$train$Checkpoint(optimizer = optimizer,
encoder = encoder,
decoder = decoder)
if (restore_checkpoint) {
checkpoint$restore(tf$train$latest_checkpoint(checkpoint_dir))
}
num_epochs <- 20
if (!restore_checkpoint) {
for (epoch in seq_len(num_epochs)) {
cat("Starting epoch:", epoch, "\n")
total_loss <- 0
progress <- 0
train_iter <- make_iterator_one_shot(train_dataset)
until_out_of_range({
progress <- progress + 1
if (progress %% 10 == 0)
cat("-")
batch <- iterator_get_next(train_iter)
loss <- 0
img_tensor <- batch[[1]]
target_caption <- batch[[2]]
dec_hidden <- k_zeros(c(batch_size, gru_units))
dec_input <-
k_expand_dims(rep(list(word_index_df[word_index_df$word == "<start>", "index"]), batch_size))
with(tf$GradientTape() %as% tape, {
features <- encoder(img_tensor)
# teacher forcing: after each step, the ground-truth token (not the model's
# own prediction) becomes the next decoder input
for (t in seq_len(dim(target_caption)[2] - 1)) {
c(preds, dec_hidden, weights) %<-%
decoder(list(dec_input, features, dec_hidden))
loss <- loss + cx_loss(target_caption[, t], preds)
dec_input <- k_expand_dims(target_caption[, t])
}
})
total_loss <-
total_loss + loss / k_cast_to_floatx(dim(target_caption)[2])
variables <- c(encoder$variables, decoder$variables)
gradients <- tape$gradient(loss, variables)
optimizer$apply_gradients(purrr::transpose(list(gradients, variables)),
global_step = tf$train$get_or_create_global_step())
})
cat(paste0(
"\n\nTotal loss (epoch ",
epoch,
"): ",
(total_loss / k_cast_to_floatx(buffer_size)) %>% as.double() %>% round(4),
"\n"
))
checkpoint$save(file_prefix = checkpoint_prefix)
check_sample_captions(epoch, "training", plot_attention = FALSE)
check_sample_captions(epoch, "validation", plot_attention = FALSE)
}
}
# after training (or after restoring a checkpoint), caption the check examples
# once more, this time writing attention plots to disk
epoch <- num_epochs
check_sample_captions(epoch, "training", plot_attention = TRUE)
check_sample_captions(epoch, "validation", plot_attention = TRUE)